In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
path = '/content/drive/MyDrive/study/Recsys/data/movielens'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [4]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=1234)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [5]:
fun = lambda x : pd.Series(x['rating'].values, index = x['userId'])
sparse_matrix = train_df.groupby('movieId').apply(fun).unstack()

sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,,2.5,,5.0
2,,,,,,,,4.0,,,...,,4.0,,,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193573,,,,,,,,,,,...,,,,,,,,,,
193579,,,,,,,,,,,...,,,,,,,,,,
193581,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [6]:
m, n = sparse_matrix.shape
# movie's mean normalization
# using numpy broadcasting
movie_mean = sparse_matrix.mean(axis = 1).to_numpy().reshape(m,1) #(m, 1)
print(movie_mean.shape)
sparse_matrix_movie = sparse_matrix - movie_mean # (m, n) - (m, 1)

# user's mean normalization
user_mean = sparse_matrix.mean(axis = 0).to_numpy().reshape(1,n) # (1, n)
print(user_mean.shape)
sparse_matrix_user = sparse_matrix - user_mean # (m,n) - (1, n)

print(sparse_matrix_movie.shape)
print(sparse_matrix_user.shape)

# movie, column mean -> 0
print(sparse_matrix_movie.mean(axis=1).value_counts())
print(sparse_matrix_user.mean(axis=0).value_counts())

(8938, 1)
(1, 610)
(8938, 610)
(8938, 610)
 0.000000e+00    5973
-1.480297e-16     384
 1.480297e-16     340
-1.776357e-16     131
 1.776357e-16     123
                 ... 
-3.259370e-17       1
-5.075305e-17       1
 1.299773e-16       1
-3.045183e-16       1
 4.554761e-17       1
Length: 511, dtype: int64
 0.000000e+00    54
-1.776357e-16     7
 1.776357e-16     6
 1.268826e-16     6
-2.049643e-16     5
                 ..
 2.316987e-16     1
 1.022364e-16     1
 1.055876e-16     1
 1.570260e-16     1
-1.071939e-16     1
Length: 430, dtype: int64


In [7]:
sparse_matrix_movie.fillna(0, inplace=True)
sparse_matrix_user.fillna(0, inplace = True)

## 1. SVD

In [8]:
def get_svd(s_matrix, k=300):
    # (u, m)
    # (u, u) (u, m) (m, m)
    u, s, vh = np.linalg.svd(s_matrix.transpose())
    T =  u[:, :k] # (u, k)
    S = s[:k] * np.identity(k, np.float) # (k, k)
    Dt = vh[:k, :] # (k, m)

    # (m, k), (k, u)
    item_factors = np.transpose(np.matmul(S, Dt))
    user_factors = np.transpose(T)

    return item_factors, user_factors


In [9]:
item_factors, user_factors = get_svd(sparse_matrix_movie)
prediction_result_df = pd.DataFrame(np.matmul(item_factors, user_factors),
                                    columns = sparse_matrix_movie.columns.values,
                                    index = sparse_matrix_movie.index.values)

movie_prediction_result_df = (prediction_result_df + movie_mean).transpose() 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [10]:
movie_prediction_result_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
1,3.909361,3.390801,4.034767,2.252967,3.053849,3.981056,3.183438,2.744951,2.909203,3.50527,...,1.0,4.5,3.5,3.0,4.0,4.0,3.5,4.0,3.5,4.0
2,3.920779,3.442279,3.463699,2.242309,2.998031,3.954925,3.149367,2.752199,2.912969,3.407446,...,1.0,4.5,3.5,3.0,4.0,4.0,3.5,4.0,3.5,4.0
3,3.893132,3.384837,3.438732,2.240174,3.049229,3.937293,3.175635,2.738346,2.909693,3.515027,...,1.0,4.5,3.5,3.0,4.0,4.0,3.5,4.0,3.5,4.0
4,3.889539,3.41907,3.429841,2.24123,3.056872,3.942538,3.194788,2.74923,2.92326,3.514855,...,1.0,4.5,3.5,3.0,4.0,4.0,3.5,4.0,3.5,4.0
5,4.011288,3.403351,3.491263,2.183343,3.157149,3.949916,3.248211,2.873662,2.901603,3.254983,...,1.0,4.5,3.5,3.0,4.0,4.0,3.5,4.0,3.5,4.0


In [11]:
item_factors, user_factors = get_svd(sparse_matrix_user)
prediction_result_df = pd.DataFrame(np.matmul(item_factors, user_factors),
                                    columns = sparse_matrix_movie.columns.values,
                                    index = sparse_matrix_movie.index.values)

user_prediction_result_df = (prediction_result_df + user_mean).transpose()  

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [12]:
user_prediction_result_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,190219,191005,193565,193567,193571,193573,193579,193581,193587,193609
1,4.363306,4.366523,3.976339,4.336197,4.363159,3.983759,4.30729,4.339038,4.308069,4.25559,...,4.317692,4.320451,4.32073,4.320869,4.32059,4.32059,4.32073,4.32059,4.32073,4.321668
2,4.008439,3.943725,3.926417,3.957508,3.922849,3.969347,3.931427,3.937228,3.930802,3.877358,...,3.921218,3.945053,3.938057,3.934559,3.941555,3.941555,3.938057,3.941555,3.938057,3.940776
3,2.503444,2.499244,2.490397,2.529047,2.509503,2.520698,2.484895,2.511683,2.509168,2.516222,...,2.526179,2.516442,2.516753,2.516909,2.516597,2.516597,2.516753,2.516597,2.516753,2.516038
4,3.624571,3.636093,3.625396,3.642037,3.632429,3.643856,3.638039,3.642847,3.616649,3.632275,...,3.631354,3.632834,3.631497,3.630828,3.632165,3.632165,3.631497,3.632165,3.631497,3.632009
5,4.060155,3.583016,3.747858,3.62839,3.519627,3.535098,3.777566,3.683368,3.577456,3.646914,...,3.585529,3.63887,3.6354,3.633665,3.637135,3.637135,3.6354,3.637135,3.6354,3.637806


In [38]:
'''
prediction_result_df : (user i, movie j) ratings
'''

def evaluate(test_df, prediction_result_df):
    groups_with_movie_ids = test_df.groupby(by = 'movieId')
    groups_with_user_ids = test_df.groupby(by = 'userId')
    intersection_movie_ids = sorted(list(
        set(list( prediction_result_df.columns)).intersection(
            set(list(groups_with_movie_ids.indices.keys())))
        ))
    intersection_user_ids = sorted(list(
        set(list( prediction_result_df.index)).intersection(
            set(list(groups_with_user_ids.indices.keys())))
    ))

    print(len(intersection_movie_ids))
    print(len(intersection_user_ids))

    compressed_prediction_df = prediction_result_df.loc[intersection_user_ids][intersection_movie_ids]

    grouped = test_df.groupby('userId')
    rmse_df = pd.DataFrame(columns = ['rmse'])
    for userId, group in tqdm(grouped):
        if (userId) not in intersection_user_ids: continue

        tested_movie = list(group['movieId'].values)
        compressed_movie = compressed_prediction_df.loc[userId].index
        pred_ratings = compressed_prediction_df.loc[userId][compressed_movie.intersection(tested_movie)]
        nname = {'index' : 'movieId', 'rating' : 'pred_rating'}
        pred_ratings = pred_ratings.to_frame(name = 'rating' ).reset_index().rename(columns=nname)



        nname = {'rating' : 'actual_rating'}
        actual_ratings = group[['rating', 'movieId']].rename(columns = nname)

        final_df = pd.merge(actual_ratings, pred_ratings, how = 'inner', on = ['movieId'])
        final_df = final_df.round(4)


        if not final_df.empty:
            rmse = sqrt(mean_squared_error(final_df['actual_rating'], final_df['pred_rating']))
            rmse_df.loc[userId] = rmse

    return final_df, rmse_df     

In [41]:
result_df, rmse_df = evaluate(test_df, user_prediction_result_df)
print(result_df)
print('For User Matrix')
print(f"RMSE : {mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values)}")

4385
610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/610 [00:00<?, ?it/s]

     actual_rating  movieId  pred_rating
0              5.0     3527       3.6788
1              3.5    84772       3.6782
2              3.5   103141       3.6750
3              4.0    81132       3.6785
4              4.5   130634       3.6784
..             ...      ...          ...
218            4.0   106100       3.6807
219            4.0   111759       3.6740
220            1.0     4852       3.6788
221            3.0     2628       3.6887
222            5.0     1953       3.6721

[223 rows x 3 columns]
For User Matrix
RMSE : 0.7313722734977578


In [None]:
pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
          actual_ratings = group[['rating', 'movieId']].rename(columns={'rating':'actual_rating'})
