In [22]:
import os
import sys

from pathlib import Path
from collections import defaultdict

# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve (presumably that directory is the repository
# root -- confirm against the project layout).
# The original derived this path from `Path(__name__)`, i.e. from the module
# name "__main__" treated as a file name; spelling it out via the working
# directory gives the same path without that accident. The membership check
# keeps re-running the cell from stacking duplicate entries on sys.path.
project_root = str(Path.cwd().resolve().parent)
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics

import warnings

warnings.filterwarnings("ignore")

In [16]:
# Configure the project's MovieLens loader; both keyword values are handed to
# MovieLensLoader unchanged (their exact meaning is defined in src.loader.movielens).
movielens_loader = MovieLensLoader(num_users=1000, num_test_items=5)

In [17]:
# Load the dataset object; later cells read its `train`, `test` and
# `test_user2item` attributes.
# NOTE(review): the variable name is misspelled ("moivelens"); it is kept as-is
# because later cells refer to it by this exact name.
moivelens_dataset = movielens_loader.load()

In [18]:
# Pull the three views of the dataset used throughout the notebook:
# the training ratings, the held-out ratings, and the per-user held-out items.
train, test, rank_test = (
    moivelens_dataset.train,
    moivelens_dataset.test,
    moivelens_dataset.test_user2item,
)

In [19]:
# Number of distinct users in the train and test splits, in that order.
tuple(split["user_id"].nunique() for split in (train, test))

(943, 943)

In [20]:
# Number of distinct movies in the train and test splits, in that order.
tuple(split["movie_id"].nunique() for split in (train, test))

(1671, 1080)

In [21]:
def pearson_coefficient(u: np.ndarray, v: np.ndarray) -> float:
    """Return the Pearson correlation coefficient between two rating vectors.

    Args:
        u: 1-D array-like of numeric values.
        v: 1-D array-like of numeric values, same length as ``u``.

    Returns:
        The correlation as a plain Python ``float``. ``0.0`` is returned when
        the correlation is undefined: both inputs are empty, or at least one
        of them has zero variance (which includes single-element inputs).

    Raises:
        ValueError: propagated from ``np.dot`` when the two vectors have
            different lengths.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    # Nothing to correlate; returning early also avoids NumPy's
    # "mean of empty slice" warning.
    if u.size == 0 and v.size == 0:
        return 0.0

    u_diff = u - u.mean()
    v_diff = v - v.mean()

    numerator = np.dot(u_diff, v_diff)
    # np.dot(x, x) is the sum of squares computed inside NumPy; the builtin
    # sum() would iterate the array element by element in Python.
    denominator = np.sqrt(np.dot(u_diff, u_diff)) * np.sqrt(np.dot(v_diff, v_diff))
    if denominator == 0:
        return 0.0
    return float(numerator / denominator)

In [23]:
# Reshape the long-format training ratings into a user x movie table:
# one row per user_id, one column per movie_id, cell = rating
# (NaN where the user has no rating for that movie).
user_movie_matrix = train.pivot(
    index="user_id",
    columns="movie_id",
    values="rating",
)

In [25]:
# Map each user_id / movie_id to its positional row / column in user_movie_matrix.
user_id_indices = {
    user_id: position for position, user_id in enumerate(user_movie_matrix.index)
}
movie_id_indices = {
    movie_id: position for position, movie_id in enumerate(user_movie_matrix.columns)
}

In [27]:
# Container for per-user item lists (filled by later code, empty for now).
pred_user2items = defaultdict(list)
# Work on a copy of the test ratings so that predictions can be written into
# it without touching `test` itself.
movie_rating_predict = test.copy()

In [37]:
# User-based collaborative filtering, written out by hand:
# for each test user, find the training users whose ratings correlate positively
# with theirs (Pearson), then predict each test movie as the user's own mean
# rating plus the similarity-weighted mean deviation of those neighbours.
test_users = movie_rating_predict.user_id.unique()

# Loop-invariant views of the training ratings, built once instead of once per
# user pair: the ratings as a dense array (NaN = no rating) and the matching
# boolean "has a rating" mask. Row order follows user_movie_matrix.index.
rating_matrix = user_movie_matrix.to_numpy()
rated_mask = ~np.isnan(rating_matrix)

for user1_id in test_users:
    similar_users = []
    similarities = []
    avgs = []

    # User 1's ratings do not depend on user 2, so fetch and mask them once
    # per outer iteration rather than once per candidate neighbour.
    u_1_all = user_movie_matrix.loc[user1_id, :].to_numpy()
    u_1_rated = ~np.isnan(u_1_all)

    for row_pos, user2_id in enumerate(user_movie_matrix.index):
        if user1_id == user2_id:
            continue

        # Keep only the items that both users have rated
        common_items = u_1_rated & rated_mask[row_pos]

        # Skip this pair when they have no rated item in common
        if not common_items.any():
            continue

        u_1, u_2 = u_1_all[common_items], rating_matrix[row_pos][common_items]

        rho_12 = pearson_coefficient(u_1, u_2)

        # Only positively correlated users are kept as neighbours
        if rho_12 > 0:
            similar_users.append(user2_id)
            similarities.append(rho_12)
            # Neighbour's mean is taken over the co-rated items only
            avgs.append(np.mean(u_2))

    # Mean rating of user 1 over everything they rated in the training data
    avg_1 = np.mean(u_1_all[u_1_rated])

    # Rows of the prediction frame that belong to user 1 (computed once, reused below)
    user_rows = movie_rating_predict["user_id"] == user1_id
    test_movies = movie_rating_predict.loc[user_rows, "movie_id"].values
    # Movies that cannot be predicted fall back to user 1's mean rating
    movie_rating_predict.loc[user_rows, "rating_pred"] = avg_1

    if similar_users:
        # Convert the neighbour lists to arrays once, not once per movie
        similarity_arr = np.array(similarities)
        neighbour_avg_arr = np.array(avgs)

        for movie_id in test_movies:
            if movie_id in movie_id_indices:
                r_xy = user_movie_matrix.loc[similar_users, movie_id].to_numpy()
                rating_exists = ~np.isnan(r_xy)

                # Skip when no similar user has rated the target movie
                if not rating_exists.any():
                    continue

                r_xy = r_xy[rating_exists]
                rho_1x = similarity_arr[rating_exists]
                avg_x = neighbour_avg_arr[rating_exists]
                # rho_1x.sum() is strictly positive because only rho > 0 was kept
                r_hat_1y = avg_1 + np.dot(rho_1x, (r_xy - avg_x)) / rho_1x.sum()

                # Store the predicted rating
                movie_rating_predict.loc[
                    user_rows & (movie_rating_predict["movie_id"] == movie_id),
                    "rating_pred",
                ] = r_hat_1y
    break  # The computation is heavy, so the loop is deliberately run for one user only. Inspecting the variables afterwards is a good way to understand the algorithm.

In [41]:
from surprise import KNNWithMeans, Reader
from surprise import Dataset as SurpriseDataset

In [42]:
from src.models.base import BaseModel

class UserMemoryModel(BaseModel):
    """User-based neighbourhood recommender built on Surprise's ``KNNWithMeans``.

    The model is fitted in ``__init__`` on the training ratings, using Pearson
    similarity between users. ``predict`` then returns both a top-10 ranking
    per user and a rating estimate for every row of a test frame.
    """

    def __init__(self, movielens_train: pd.DataFrame) -> None:
        """Build the lookup tables and fit the k-NN model.

        Args:
            movielens_train: Long-format ratings containing at least the
                columns ``user_id``, ``movie_id`` and ``rating``.
        """
        self.user_movie_matrix = movielens_train.pivot(index="user_id", columns="movie_id", values="rating")
        # id -> positional index; `predict` uses these for "seen in training?" checks
        self.user_id_indices = dict(zip(self.user_movie_matrix.index, range(len(self.user_movie_matrix.index))))
        self.movie_id_indices = dict(zip(self.user_movie_matrix.columns, range(len(self.user_movie_matrix.columns))))

        # NOTE(review): the scale starts at 0.5 -- confirm this matches the rating
        # range of the MovieLens variant that is actually loaded.
        reader = Reader(rating_scale=(0.5, 5))
        self.train_set = SurpriseDataset.load_from_df(
            movielens_train[["user_id", "movie_id", "rating"]], reader
        ).build_full_trainset()
        sim_options = {
            "name": "pearson",
            "user_based": True
        }
        # Global mean rating, used as the fallback for unseen users / movies
        self.average_score = movielens_train.rating.mean()

        self.knn = KNNWithMeans(k=30, min_k=1, sim_options=sim_options)
        self.knn.fit(self.train_set)

    def _get_top_n(self, predictions: list, n=10) -> dict:
        """Group predictions by user and keep each user's ``n`` best items.

        Args:
            predictions: Iterable of 5-field records unpacked as
                ``(uid, iid, true_rating, estimate, details)``, as produced
                by ``self.knn.test``.
            n: Number of items to keep per user.

        Returns:
            A ``defaultdict(list)`` mapping each user id to a list of item ids
            ordered by descending estimate, at most ``n`` long.
        """
        top_n = defaultdict(list)
        for uid, iid, _true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Replacing the value of an existing key while iterating is safe
        # (the set of keys does not change).
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = [d[0] for d in user_ratings[:n]]

        return top_n

    def predict(self, movielens_test: pd.DataFrame) -> tuple[dict, list[float]]:
        """Produce a top-10 ranking per user and a rating estimate per test row.

        Args:
            movielens_test: Frame with at least ``user_id`` and ``movie_id``
                columns.

        Returns:
            A pair ``(pred_user2items, pred_results)``:
            ``pred_user2items`` maps user id -> list of up to 10 recommended
            item ids, drawn from the (user, item) pairs that do not occur in
            the training set; ``pred_results`` holds one estimated rating per
            row of ``movielens_test``, in row order.
        """
        # Ranking: score every (user, item) pair absent from the training set
        anti_testset = self.train_set.build_anti_testset(None)
        predictions = self.knn.test(anti_testset)
        pred_user2items = self._get_top_n(predictions, n=10)

        # Rating prediction. Iterating the two id columns directly (instead of
        # DataFrame.iterrows) keeps each id in its column dtype and avoids
        # building a Series per row.
        pred_results = []
        for user_id, movie_id in zip(movielens_test["user_id"], movielens_test["movie_id"]):
            # Users / movies never seen in training get the global mean rating
            if user_id not in self.user_id_indices or movie_id not in self.movie_id_indices:
                pred_results.append(self.average_score)
                continue

            pred_results.append(self.knn.predict(uid=user_id, iid=movie_id).est)

        return pred_user2items, pred_results

In [43]:
# Constructing the model also fits it (the k-NN fit happens in __init__).
user_memory_model = UserMemoryModel(train)
# predict returns (per-user top-10 item lists, one rating estimate per test row)
pred_ranking, pred_ratings = user_memory_model.predict(test)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [44]:
# Evaluate the model: rating-prediction errors against the held-out ratings,
# then ranking quality of the top-10 lists against the held-out items per user.
true_ratings = test["rating"]
true_user2items = moivelens_dataset.test_user2item
top_k = 10

print("Test MAE rating", RecSysMetrics().mae(true_ratings, pred_ratings))
print("Test MSE rating", RecSysMetrics().mse(true_ratings, pred_ratings))
print("Test RMSE rating", RecSysMetrics().rmse(true_ratings, pred_ratings))

precision_at_k = RecSysMetrics().calc_precision_at_k(true_user2items, pred_ranking, top_k)
print("Test Precision@k", precision_at_k)

recall_at_k = RecSysMetrics().calc_recall_at_k(true_user2items, pred_ranking, top_k)
print("Test Recall@k", recall_at_k)

Test MAE rating 0.8160624924710785
Test MSE rating 1.0737931383678587
Test RMSE rating 1.0362399038677572
Test Precision@k 0.0017177914110429449
Test Recall@k 0.004826175869120654
