In [2]:
import os
import sys
from typing import List, Dict, Tuple
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# project-local `src` package imported below can be found. The previous
# `Path(__name__)` form only worked because `__name__` is a module name
# ("__main__" in a notebook kernel) that happened to resolve to
# `<cwd>/__main__`; spell the intended directory out explicitly instead.
sys.path.append(str(Path.cwd().resolve().parent))

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")



In [3]:
# Configure the MovieLens loader for a small experiment.
# NOTE(review): presumably `num_users` caps the number of users loaded and
# `num_test_items` is the number of ratings per user held out for testing —
# confirm against MovieLensLoader.
movielens_loader = MovieLensLoader(
    num_users=100,
    num_test_items=5,
)

In [4]:
# Load the dataset object; the next cell reads its `train`, `test` and
# `test_user2item` attributes.
movielens_dataset = movielens_loader.load()

In [5]:
# Unpack the pieces of the dataset used throughout the notebook:
#   train     - ratings used to fit the models
#   test      - held-out ratings used for the rating-error metrics
#   rank_test - `test_user2item`, used as ground truth for Precision/Recall@k
train, test, rank_test = (
    movielens_dataset.train,
    movielens_dataset.test,
    movielens_dataset.test_user2item,
)

In [6]:
# Reshape the long-format training ratings into a user x movie matrix.
# Cells with no rating are NaN at this point.
user_movie_matrix = train.pivot(
    values="rating",
    index="user_id",
    columns="movie_id",
)
user_movie_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1225,1226,1227,1228,1229,1231,1233,1234,1235,1238
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,5.0,,,,,,,5.0,,,...,,,,,,,,,,
97,4.0,,,,,,5.0,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,
99,4.0,,3.0,5.0,,,4.0,,,,...,,,,,,,,,,


In [7]:
# Report how sparse the user x movie matrix is: the share of cells that
# actually hold a rating (must run before the NaNs are filled in).
user_num, item_num = user_movie_matrix.shape
total_cells = user_num * item_num
non_null_num = user_movie_matrix.notna().sum().sum()
non_null_ratio = non_null_num / total_cells

print(f"User#: {user_num}, Items#:{item_num}, density={non_null_ratio}")

User#: 100, Items#:1213, density=0.08671887881286068


In [8]:
# Replace missing ratings with 0. This rebinds `user_movie_matrix`, so the
# NaN-preserving version displayed above is no longer available afterwards.
# The model classes below read this global's index/columns, so the name must
# stay defined before they are instantiated.
user_movie_matrix = user_movie_matrix.fillna(0)

In [9]:
from src.models.base import BaseModel
import scipy


class SVDRatingModel(BaseModel):
    """Rating predictor built from a truncated SVD of the user x movie matrix.

    The training ratings are pivoted into a dense user x movie matrix, missing
    cells are imputed (with 0 or with the mean training rating), and the
    matrix is approximated by its top ``factors`` singular triplets. The
    reconstructed matrix is then used as a lookup table of predicted ratings.
    """

    def __init__(self, movielens_train: pd.DataFrame, **kwargs) -> None:
        """Fit the model on the training ratings.

        Args:
            movielens_train: Long-format ratings with ``user_id``,
                ``movie_id`` and ``rating`` columns.
            **kwargs:
                fill_with_zero (bool, default True): impute missing ratings
                    with 0; when False, impute with the mean training rating.
                factors (int, default 5): number of singular values kept.
        """
        fill_with_zero = kwargs.get("fill_with_zero", True)
        self.factors = kwargs.get("factors", 5)

        self.average_rating = movielens_train["rating"].mean()
        self.user_movie_matrix = movielens_train.pivot(index="user_id", columns="movie_id", values="rating")

        # Build the id -> row/column lookups from THIS model's matrix. The
        # previous code read a notebook-level `user_movie_matrix`, which only
        # worked when that global happened to be built from the same frame.
        self.user_id_indices = {
            user_id: row for row, user_id in enumerate(self.user_movie_matrix.index)
        }
        self.movie_id_indices = {
            movie_id: col for col, movie_id in enumerate(self.user_movie_matrix.columns)
        }

        # Impute from the data passed in, not from the notebook-level `train`.
        fill_value = 0 if fill_with_zero else self.average_rating
        # svds needs a floating-point matrix; force the dtype in case every
        # cell was observed and the ratings were stored as integers.
        self.matrix = self.user_movie_matrix.fillna(fill_value).to_numpy(dtype=float)

        self.P, self.S, self.Qt = self._get_svd_matrix(self.matrix)
        self.pred_matrix = self._get_pred_matrix(self.P, self.S, self.Qt)

    def _get_svd_matrix(self, matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Return the ``self.factors`` leading singular triplets ``(P, S, Qt)``.

        Raises whatever ``svds`` raises when ``self.factors`` is not a valid
        rank for ``matrix`` (it has to be smaller than both dimensions with
        the default solver).
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.sparse.linalg` has been loaded.
        from scipy.sparse.linalg import svds

        P, S, Qt = svds(matrix, k=self.factors)
        return P, S, Qt

    def _get_pred_matrix(self, P: np.ndarray, S: np.ndarray, Qt: np.ndarray) -> np.ndarray:
        """Reconstruct the low-rank approximation ``P @ diag(S) @ Qt``."""
        # Scaling the columns of P by S equals P @ diag(S) without
        # materialising the diagonal matrix.
        return (P * S) @ Qt

    def predict(self, movielens_test: pd.DataFrame) -> np.ndarray:
        """Predict a rating for every (user_id, movie_id) row of the test frame.

        Pairs whose user or movie did not appear in the training data fall
        back to the mean training rating.

        Returns:
            1-D array of predictions, in the row order of ``movielens_test``.
        """
        pred_results = []

        # Iterate over the two id columns directly; iterrows() builds a Series
        # per row and is far slower for the same result.
        for user_id, movie_id in zip(movielens_test["user_id"], movielens_test["movie_id"]):
            user_index = self.user_id_indices.get(user_id)
            movie_index = self.movie_id_indices.get(movie_id)

            if user_index is None or movie_index is None:
                # Cold-start user or movie: no row/column to look up.
                pred_results.append(self.average_rating)
                continue

            pred_results.append(self.pred_matrix[user_index, movie_index])

        return np.array(pred_results)


In [10]:
class SVDRankingModel(BaseModel):
    """Top-k recommender built from a truncated SVD of the user x movie matrix.

    The training ratings are pivoted into a dense user x movie matrix, missing
    cells are imputed (with 0 or with the mean training rating), and the
    matrix is approximated by its top ``factors`` singular triplets. For each
    training user the movies are ranked by reconstructed score, skipping the
    movies the user already rated.
    """

    def __init__(self, movielens_train: pd.DataFrame, **kwargs):
        """Fit the model on the training ratings.

        Args:
            movielens_train: Long-format ratings with ``user_id``,
                ``movie_id`` and ``rating`` columns.
            **kwargs:
                fill_with_zero (bool, default True): impute missing ratings
                    with 0; when False, impute with the mean training rating.
                factors (int, default 5): number of singular values kept.
        """
        fill_with_zero = kwargs.get("fill_with_zero", True)
        self.factors = kwargs.get("factors", 5)

        self.user_ids = movielens_train["user_id"].unique()
        # user_id -> list of movie ids that user rated in the training data.
        self.user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
        self.user_movie_matrix = movielens_train.pivot(index="user_id", columns="movie_id", values="rating")

        # Build the id -> row/column lookups from THIS model's matrix. The
        # previous code read a notebook-level `user_movie_matrix`, which only
        # worked when that global happened to be built from the same frame.
        self.user_id_indices = {
            user_id: row for row, user_id in enumerate(self.user_movie_matrix.index)
        }
        self.movie_id_indices = {
            movie_id: col for col, movie_id in enumerate(self.user_movie_matrix.columns)
        }

        # Impute from the data passed in, not from the notebook-level `train`.
        fill_value = 0 if fill_with_zero else movielens_train["rating"].mean()
        # svds needs a floating-point matrix; force the dtype in case every
        # cell was observed and the ratings were stored as integers.
        self.matrix = self.user_movie_matrix.fillna(fill_value).to_numpy(dtype=float)

        self.P, self.S, self.Qt = self._get_svd_matrix(self.matrix)
        self.pred_matrix = self._get_pred_matrix(self.P, self.S, self.Qt)

    def _get_svd_matrix(self, matrix: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """Return the ``self.factors`` leading singular triplets ``(P, S, Qt)``.

        Raises whatever ``svds`` raises when ``self.factors`` is not a valid
        rank for ``matrix`` (it has to be smaller than both dimensions with
        the default solver).
        """
        # Import the submodule explicitly: a bare `import scipy` does not
        # guarantee that `scipy.sparse.linalg` has been loaded.
        from scipy.sparse.linalg import svds

        P, S, Qt = svds(matrix, k=self.factors)
        return P, S, Qt

    def _get_pred_matrix(self, P: np.ndarray, S: np.ndarray, Qt: np.ndarray) -> np.ndarray:
        """Reconstruct the low-rank approximation ``P @ diag(S) @ Qt``."""
        # Scaling the columns of P by S equals P @ diag(S) without
        # materialising the diagonal matrix.
        return (P * S) @ Qt

    def predict(self, movielens_test: pd.DataFrame, k: int = 10):
        """Recommend up to ``k`` unseen movies for every training user.

        Args:
            movielens_test: Not read by this method; recommendations are
                produced for the users seen during training. Kept so the
                call signature matches the rating model's ``predict``.
            k: Maximum number of movies to recommend per user (default 10,
                the value that was previously hard-coded).

        Returns:
            ``defaultdict(list)`` mapping user_id to movie ids ordered from
            highest to lowest reconstructed score.
        """
        pred_user2item = defaultdict(list)
        movie_ids = self.user_movie_matrix.columns

        for user_id in self.user_ids:
            user_index = self.user_id_indices.get(user_id)
            if user_index is None:
                continue

            # Set membership is O(1); the stored per-user lists made every
            # candidate check a linear scan.
            seen_movies = set(self.user_evaluated_movies[user_id])
            recommended = pred_user2item[user_id]

            # Negate the scores so argsort yields the best movies first.
            for movie_index in np.argsort(-self.pred_matrix[user_index, :]):
                if len(recommended) >= k:
                    break
                movie_id = movie_ids[movie_index]
                if movie_id not in seen_movies:
                    recommended.append(movie_id)

        return pred_user2item

In [11]:
# Fit the rating model with its defaults (factors=5, missing ratings filled
# with 0) and predict a rating for every row of the test set.
svd_rating_model = SVDRatingModel(train)
pred_ratings = svd_rating_model.predict(test)

In [15]:
# Fit the ranking model with its defaults (factors=5, missing ratings filled
# with 0) and build the per-user recommendation lists. `test` is passed for
# interface symmetry only; SVDRankingModel.predict does not read it.
svd_ranking_model = SVDRankingModel(train)
pred_ranking = svd_ranking_model.predict(test)

In [17]:
# Rating-prediction errors against the held-out ratings.
print("Test MAE rating", RecSysMetrics().mae(test["rating"], pred_ratings))
print("Test MSE rating", RecSysMetrics().mse(test["rating"], pred_ratings))
print("Test RMSE rating", RecSysMetrics().rmse(test["rating"], pred_ratings))

# Ranking quality of the recommendation lists, evaluated at k=10.
print("Test Precision@k", RecSysMetrics().calc_precision_at_k(rank_test, pred_ranking, 10))

print("Test Recall@k", RecSysMetrics().calc_recall_at_k(rank_test, pred_ranking, 10))

Test MAE rating 2.8302094345602127
Test MSE rating 9.730878856897187
Test RMSE rating 3.119435663208521
Test Precision@k 0.055056179775280906
Test Recall@k 0.16441947565543072


In [19]:
# Relationship between the number of latent factors and accuracy.
# Missing ratings are filled with the mean training rating here
# (fill_with_zero=False), unlike the default run above.
for factors in (5, 10, 30):
    model_options = {"factors": factors, "fill_with_zero": False}

    svd_rating_model = SVDRatingModel(train, **model_options)
    pred_ratings = svd_rating_model.predict(test)
    svd_ranking_model = SVDRankingModel(train, **model_options)
    pred_ranking = svd_ranking_model.predict(test)

    print(f"=========== Factors - { factors} ===============")

    # Rating-prediction errors against the held-out ratings.
    print("Test MAE rating", RecSysMetrics().mae(test["rating"], pred_ratings))
    print("Test MSE rating", RecSysMetrics().mse(test["rating"], pred_ratings))
    print("Test RMSE rating", RecSysMetrics().rmse(test["rating"], pred_ratings))

    # Ranking quality of the recommendation lists, evaluated at k=10.
    print("Test Precision@k", RecSysMetrics().calc_precision_at_k(rank_test, pred_ranking, 10))

    print("Test Recall@k", RecSysMetrics().calc_recall_at_k(rank_test, pred_ranking, 10))

Test MAE rating 1.0112049765436821
Test MSE rating 1.5041795135881053
Test RMSE rating 1.2264499637523356
Test Precision@k 0.03820224719101124
Test Recall@k 0.09887640449438204
Test MAE rating 1.0132421510943277
Test MSE rating 1.5159922178649146
Test RMSE rating 1.2312563574921815
Test Precision@k 0.0348314606741573
Test Recall@k 0.10056179775280898
Test MAE rating 1.0379181661256571
Test MSE rating 1.5817536385523465
Test RMSE rating 1.2576778755119875
Test Precision@k 0.029213483146067417
Test Recall@k 0.07359550561797754
