In [1]:
import os
import sys

from pathlib import Path

# Make the parent of the working directory importable (presumably the project
# root that holds the `src` package imported below -- verify against the repo
# layout). The root is derived from the working directory explicitly:
# `__name__` is a module name ("__main__" in a notebook), not a filesystem path.
project_root = str(Path.cwd().resolve().parent)
# Guard against piling up duplicate entries when the cell is re-run.
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics

import warnings

warnings.filterwarnings("ignore")



In [2]:
# Loader settings, kept in one place so they are easy to tune.
# NOTE(review): presumably these cap the number of users loaded and the number
# of held-out test items per user -- confirm against MovieLensLoader.
loader_options = {"num_users": 1000, "num_test_items": 5}
movielens_loader = MovieLensLoader(**loader_options)

In [3]:
# Run the loader and keep the dataset object it returns; later cells read its
# `train`, `test`, `test_user2item` and `item_content` attributes.
movielens_dataset = movielens_loader.load()

In [4]:
# Pull the pieces of the dataset used throughout the notebook.
train, test = movielens_dataset.train, movielens_dataset.test
# NOTE(review): presumably a per-user mapping of held-out items used for
# ranking evaluation -- confirm against the loader.
rank_test = movielens_dataset.test_user2item

In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,zip_code,city,age_group,...,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western,rating_order
0,196,242,3,1997-12-04 15:55:49,49,M,writer,55105,Saint Paul,Midlife,...,0,0,0,0,0,0,0,0,0,37.0
1,186,302,3,1998-04-04 19:22:22,39,F,executive,0,unknown,Midlife,...,1,0,0,1,0,0,1,0,0,19.0
2,22,377,1,1997-11-07 07:18:36,25,M,writer,40206,Louisville,Young,...,0,0,0,0,0,0,0,0,0,76.0
3,244,51,2,1997-11-27 05:02:03,28,M,technician,80525,Fort Collins,Young,...,0,0,0,0,1,0,0,1,1,61.0
4,166,346,1,1998-02-02 05:33:16,47,M,educator,55113,Saint Paul,Midlife,...,0,0,0,0,0,0,0,0,0,13.0


In [6]:
# Reshape the long-format training ratings into a user x movie matrix.
# (user, movie) pairs without a rating in `train` appear as NaN.
user_movie_matrix = train.pivot(index="user_id", columns="movie_id", values="rating")

# Lookup tables: id -> positional index along the matrix rows / columns.
user_id2index = {
    user_id: position for position, user_id in enumerate(user_movie_matrix.index)
}
movie_id2index = {
    movie_id: position for position, movie_id in enumerate(user_movie_matrix.columns)
}

user_movie_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# List the column names available in the training frame.
train.columns

Index(['user_id', 'movie_id', 'rating', 'timestamp', 'age', 'gender',
       'occupation', 'zip_code', 'city', 'age_group', 'movie_title',
       'release_date', 'unknown', 'action', 'adventure', 'animation',
       'childrens', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
       'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi',
       'thriller', 'war', 'western', 'rating_order'],
      dtype='object')

In [7]:
# Key columns (user, movie) and the regression target for the observed
# training ratings.
train_keys = train[["user_id", "movie_id"]]
train_y = train.rating.values

test_keys = test[["user_id", "movie_id"]]

# Every (user, movie) combination of the rating matrix, rated or not, in
# row-major order (all movies for the first user, then the next user, ...).
# Built directly from the matrix axes rather than `stack(dropna=False)`:
# the `dropna` argument of `DataFrame.stack` is deprecated and is not accepted
# by the new stack implementation, while the axis product gives the same rows.
train_all_keys = pd.MultiIndex.from_product(
    [user_movie_matrix.index, user_movie_matrix.columns],
    names=["user_id", "movie_id"],
).to_frame(index=False)

In [8]:
# Start each feature frame as an independent copy of its key columns so that
# adding feature columns later does not touch the key frames.
train_x, test_x, train_all_x = (
    key_frame.copy() for key_frame in (train_keys, test_keys, train_all_keys)
)

In [10]:
# Summary statistics of the training ratings, computed per user and per movie.
aggregators = ["min", "max", "mean"]

# Each result is {aggregator name: {id: value}}.
user_features = train.groupby("user_id").rating.agg(aggregators).to_dict()
movie_features = train.groupby("movie_id").rating.agg(aggregators).to_dict()

# Attach the per-user (u_*) and per-movie (m_*) statistics to every feature
# frame by looking up each row's ids.
for agg in aggregators:
    for frame in (train_x, test_x, train_all_x):
        frame[f"u_{agg}"] = frame["user_id"].map(user_features[agg])
        frame[f"m_{agg}"] = frame["movie_id"].map(movie_features[agg])

# Users or movies that appear only in the test data have no statistics;
# fill those gaps with the mean rating over the whole training set.
average_rating = train_y.mean()
test_x.fillna(average_rating, inplace=True)

In [25]:
# NOTE(review): this cell repeats the feature-mapping loop of the previous
# cell verbatim; in a top-to-bottom run it rewrites the same columns with the
# same values. Consider deleting it.
feature_sources = (
    ("u", "user_id", user_features),
    ("m", "movie_id", movie_features),
)
for agg in aggregators:
    for prefix, key_column, lookup in feature_sources:
        for frame in (train_x, test_x, train_all_x):
            frame[f"{prefix}_{agg}"] = frame[key_column].map(lookup[agg])

# Users or movies that appear only in the test data have no statistics;
# fill those gaps with the mean rating over the whole training set.
average_rating = train_y.mean()
test_x.fillna(average_rating, inplace=True)

In [19]:
# Columns copied from the raw train/test frames into the feature frames:
# user attributes first, then the per-genre indicator columns.
feature_list = [
    # user attributes
    "age",
    "gender",
    "occupation",
    "city",
    "age_group",
    # genre indicator columns
    "unknown",
    "action",
    "adventure",
    "animation",
    "childrens",
    "comedy",
    "crime",
    "documentary",
    "drama",
    "fantasy",
    "film_noir",
    "horror",
    "musical",
    "mystery",
    "romance",
    "sci_fi",
    "thriller",
    "war",
    "western",
]

In [23]:
# Append the raw user/genre columns to the feature frames. The concatenation
# aligns on the row index, which the feature frames inherited from train/test.
# NOTE(review): re-running this cell appends the same columns a second time.
train_side_features = train[feature_list]
test_side_features = test[feature_list]
train_x = pd.concat((train_x, train_side_features), axis="columns")
test_x = pd.concat((test_x, test_side_features), axis="columns")

In [31]:
# Attach the per-movie content columns to the all-pairs frame, matching rows
# on movie_id (default inner join).
train_all_x = pd.merge(train_all_x, movielens_dataset.item_content, on="movie_id")
train_all_x.columns

Index(['user_id', 'movie_id', 'u_min', 'm_min', 'u_max', 'm_max', 'u_mean',
       'm_mean', 'movie_title', 'release_date', 'unknown', 'action',
       'adventure', 'animation', 'childrens', 'comedy', 'crime', 'documentary',
       'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery',
       'romance', 'sci_fi', 'thriller', 'war', 'western'],
      dtype='object')

In [32]:
# 특징량으로 사용하지 않는 정보는 삭제한다
train_x = train_x.drop(columns=["user_id", "movie_id", "age"])
test_x = test_x.drop(columns=["user_id", "movie_id", "age"])
train_all_x = train_all_x.drop(columns=["user_id", "movie_id", "movie_title", "release_date"])

In [38]:

# Categorical (string-valued) feature columns that get one-hot encoded.
cat_list = [
    "city",
    "age_group",
    "occupation",
    "gender",
]

In [44]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category vocabulary from the training rows only and encode them
# as a dense array.
# handle_unknown="ignore": a category that appears only in the test split
# (e.g. an unseen city) is encoded as all zeros instead of making the later
# `ohe.transform(...)` call raise. The training encoding is unaffected.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded = ohe.fit_transform(train_x[cat_list])

In [46]:
# Wrap the encoded array in a DataFrame labelled with the encoder's generated
# column names (e.g. occupation_writer, gender_F).
one_hot_columns = ohe.get_feature_names_out()
encoded_df = pd.DataFrame(data=encoded, columns=one_hot_columns)

In [63]:
# Final training matrix: the non-categorical columns side by side with the
# one-hot columns. Both parts get a fresh 0..n-1 index first so the
# column-wise concatenation pairs rows by position.
train_numeric_part = train_x.drop(columns=cat_list).reset_index(drop=True)
train_one_hot_part = encoded_df.reset_index(drop=True)
x_train = pd.concat([train_numeric_part, train_one_hot_part], axis=1)
x_train.head()

Unnamed: 0,u_min,m_min,u_max,m_max,u_mean,m_mean,age,unknown,action,adventure,...,occupation_other,occupation_programmer,occupation_retired,occupation_salesman,occupation_scientist,occupation_student,occupation_technician,occupation_writer,gender_F,gender_M
0,2,1,5,5,3.676471,4.056075,49,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,1,1,5,5,3.367816,4.154386,39,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1,5,4,3.333333,2.153846,25,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,1,1,5,5,3.665236,3.461538,28,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,1,1,5,5,3.466667,3.643478,47,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [64]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random-forest regressor that predicts the rating from the features.
# A fixed random_state makes the bootstrap sampling and split selection
# repeatable, so the metrics reported below can be reproduced on a re-run.
model = RandomForestRegressor(random_state=42)
model.fit(x_train, train_y)

In [68]:
# Predict ratings for the user-movie pairs in the test data.
# The test rows are encoded with the encoder fitted on the training rows and
# assembled the same way as x_train.
encoded_ts = ohe.transform(test_x[cat_list])
encoded_ts_df = pd.DataFrame(encoded_ts, columns=ohe.get_feature_names_out())
x_test = pd.concat([test_x.drop(columns=cat_list).reset_index(drop=True), encoded_ts_df.reset_index(drop=True)], axis=1)

# Pass the DataFrame itself rather than `x_test.values`: the model was fitted
# on a DataFrame, so scikit-learn can check that the test columns carry the
# same names in the same order as at fit time. A bare array skips that check
# (and triggers a "does not have valid feature names" warning).
pred_ratings = model.predict(x_test)

# Keep the predictions next to their (user_id, movie_id) keys.
movie_rating_predict = test_keys.copy()
movie_rating_predict["rating_pred"] = pred_ratings

In [69]:
# Report rating-prediction error on the held-out test ratings.
actual_ratings = test["rating"]
for label, metric_name in (
    ("Test MAE rating", "mae"),
    ("Test MSE rating", "mse"),
    ("Test RMSE rating", "rmse"),
):
    # A fresh metrics object per call, as in the three separate print lines.
    score = getattr(RecSysMetrics(), metric_name)(actual_ratings, pred_ratings)
    print(label, score)


Test MAE rating 0.8318640408018987
Test MSE rating 1.1171568477366345
Test RMSE rating 1.05695640768039
