In [5]:
import os
import sys
from collections import defaultdict
from pathlib import Path

sys.path.append(str(Path(__name__).resolve().parents[1]))

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics

import warnings

warnings.filterwarnings("ignore")

In [6]:
movielens_loader = MovieLensLoader(
    num_users=1000,
    num_test_items=5,
)

In [7]:
moivelens_dataset = movielens_loader.load()

In [8]:
train = moivelens_dataset.train
test = moivelens_dataset.test
rank_test = moivelens_dataset.test_user2item

In [9]:
from sklearn.decomposition import NMF

fill_with_zero = True
factors = 5

In [10]:
user_movie_matrix = train.pivot(index="user_id", columns="movie_id", values="rating")
user_id2index = dict(zip(user_movie_matrix.index, range(len(user_movie_matrix.index))))
movie_id2index = dict(zip(user_movie_matrix.columns, range(len(user_movie_matrix.columns))))

if fill_with_zero:
    matrix = user_movie_matrix.fillna(0).to_numpy()
else:
    matrix = user_movie_matrix.fillna(train.rating.mean()).to_numpy()

matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [11]:
# NMF 실행
nmf = NMF(n_components=factors)
nmf.fit(matrix)

In [13]:
P = nmf.fit_transform(matrix)
P

array([[1.23990653, 0.31246393, 0.51421273, 0.22765567, 0.05657927],
       [0.        , 0.47408514, 0.04588169, 0.        , 0.63967054],
       [0.00199947, 0.        , 0.        , 0.        , 0.67250722],
       ...,
       [0.04263422, 0.22626265, 0.        , 0.        , 0.09807376],
       [0.        , 0.        , 0.35755684, 0.26135812, 0.53290845],
       [0.97620508, 0.1731312 , 0.        , 0.41351802, 0.        ]])

In [14]:
Q = nmf.components_
Q

array([[7.31458819e-01, 1.44216234e+00, 8.74800620e-01, ...,
        0.00000000e+00, 1.74400306e-02, 2.66057860e-02],
       [4.32218805e+00, 0.00000000e+00, 8.23852483e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.28622253e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 5.12115669e-04, 0.00000000e+00],
       [2.43871319e+00, 1.06750332e+00, 0.00000000e+00, ...,
        0.00000000e+00, 5.41606661e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.47070788e-02, 0.00000000e+00, 0.00000000e+00]])

In [15]:
P.shape, Q.shape

((943, 5), (5, 1671))

In [16]:
# 예측 평갓값 행렬
pred_matrix = np.dot(P, Q)
pred_matrix

array([[3.23874343e+00, 2.03116969e+00, 1.34209519e+00, ...,
        1.39790860e-03, 2.31203426e-02, 3.29886878e-02],
       [2.08710370e+00, 0.00000000e+00, 3.90576217e-01, ...,
        1.58043903e-02, 2.34967346e-05, 0.00000000e+00],
       [1.46252684e-03, 2.88355418e-03, 1.74913386e-03, ...,
        1.66156888e-02, 3.48707436e-05, 5.31973573e-05],
       ...,
       [1.00913489e+00, 6.14854604e-02, 2.23703484e-01, ...,
        2.42311611e-03, 7.43542029e-04, 1.13431682e-03],
       [9.33657064e-01, 2.79000667e-01, 0.00000000e+00, ...,
        1.31666112e-02, 1.59864347e-03, 0.00000000e+00],
       [2.47081128e+00, 1.84927806e+00, 9.96619377e-01, ...,
        0.00000000e+00, 1.92646876e-02, 2.59727033e-02]])

In [17]:
pred_matrix.shape

(943, 1671)

In [24]:
average_score = train.rating.mean()
pred_results = []
for i, row in test.iterrows():
    user_id = row["user_id"]
    movie_id = row["movie_id"]
    if user_id not in user_id2index or movie_id not in movie_id2index:
        pred_results.append(average_score)
        continue

    user_index = user_id2index[row["user_id"]]
    movie_index = movie_id2index[row["movie_id"]]
    pred_score = pred_matrix[user_index, movie_index]
    pred_results.append(pred_score)

pred_ratings = np.array(pred_results)


In [22]:
pred_user2items = defaultdict(list)
user_evaluated_matrix = train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()


In [23]:
for user_id in train.user_id.unique():
    if user_id not in user_id2index:
        continue
    user_index = user_id2index[row["user_id"]]
    movies_indexes = np.argsort(-pred_matrix[user_index, :])

    for movie_index in movies_indexes:
        movie_id = user_movie_matrix.columns[movie_index]
        if movie_id not in user_evaluated_matrix[user_id]:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break

In [25]:
print("Test MAE rating", RecSysMetrics().mae(test["rating"], pred_ratings))
print("Test MSE rating", RecSysMetrics().mse(test["rating"], pred_ratings))
print("Test RMSE rating", RecSysMetrics().rmse(test["rating"], pred_ratings))
print(
    "Test Precision@k",
    RecSysMetrics().calc_precision_at_k(
        rank_test, pred_user2items, 5
    ),
)

print(
    "Test Recall@k",
    RecSysMetrics().calc_recall_at_k(
        rank_test, pred_user2items, 5
    ),
)

Test MAE rating 2.743829321576108
Test MSE rating 9.064344027737222
Test RMSE rating 3.0107049054560666
Test Precision@k 0.03263803680981595
Test Recall@k 0.05503067484662576


In [27]:
# user_id=2의 사용자가 학습 데이터에 평가를 부여한 영화 목록
train[train.user_id==2]["movie_title"]

700                                      Rosewood (1997)
924                               Shall We Dance? (1996)
1052                                    Star Wars (1977)
5063                                  Ulee's Gold (1997)
5324                             Fierce Creatures (1997)
6310      Midnight in the Garden of Good and Evil (1997)
7973                             Mighty Aphrodite (1995)
8253                        Up Close and Personal (1996)
9201                                  Ulee's Gold (1997)
10683                       Devil's Advocate, The (1997)
11989                                Men in Black (1997)
13440                                   Apt Pupil (1998)
16025                                    In & Out (1997)
16198                                     Titanic (1997)
16230    Once Upon a Time... When We Were Colored (1995)
16596                                     Hoodlum (1997)
16823                                    Face/Off (1997)
17425                          

In [28]:
pred_user2items[2]

[328, 333, 748, 268, 245, 340, 326, 117, 690, 322]

In [29]:
# user_id=2에 대한 추천(2959, 4993, 5952)
moivelens_dataset.item_content[moivelens_dataset.item_content.movie_id.isin([328, 333, 748])]

Unnamed: 0,movie_id,movie_title,release_date,unknown,action,adventure,animation,childrens,comedy,crime,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
327,328,Conspiracy Theory (1997),1997-08-08,0,1,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
332,333,"Game, The (1997)",1997-01-01,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
747,748,"Saint, The (1997)",1997-03-14,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
