In [23]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
import os
import pandas as pd
import numpy as np
from math import sqrt
from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [25]:
# only ratings_df
path = '/content/drive/MyDrive/study/Recsys/data/movielens'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

print(ratings_df.shape)
print(ratings_df.head())

(100836, 4)
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [26]:
train_df, test_df = train_test_split(ratings_df, test_size = 0.2, random_state = 918)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [27]:
user_ids = sorted(list(set(train_df['userId'].values)))
movie_ids = sorted(list(set(train_df['movieId'].values)))

print('User : ', len(user_ids), '\nmovies : ', len(movie_ids))

User :  610 
movies :  8964


SparseMatrix

In [28]:
# empty (all values are nan)
fun = lambda x : pd.Series(x['rating'].values, index = x['userId'])
sparse_matrix = train_df.groupby('movieId').apply(fun).unstack()

sparse_matrix


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,,3.0,4.0,2.5,,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,,,
3,4.0,,,,,,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,,,,,,,,,,,...,,,,,,,,,,
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,


In [29]:
from sklearn.metrics.pairwise import cosine_similarity

'''
both are data frame
each rows are considered as target vectors
'''
def cossim_matrix(a, b):
    cossim_values = cosine_similarity(a.values, b.values)
    cossim_df = pd.DataFrame(data = cossim_values, columns = a.index.values, index=  a.index.values)
    return cossim_df

## Item based

In [30]:
item_sparse_matrix = sparse_matrix.fillna(0)
item_sparse_matrix.shape

(8964, 610)

In [31]:
item_cossim_df = cossim_matrix(item_sparse_matrix, item_sparse_matrix)
item_cossim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,190219,190221,191005,193565,193573,193579,193581,193583,193585,193609
1,1.000000,0.321963,0.20695,0.013500,0.232478,0.309913,0.225331,0.098311,0.155804,0.263749,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.321963,1.000000,0.21439,0.096060,0.204135,0.231215,0.158393,0.116928,0.013858,0.340009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.206950,0.214390,1.00000,0.000000,0.257930,0.201653,0.330899,0.266208,0.233742,0.219935,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.013500,0.096060,0.00000,1.000000,0.266847,0.059781,0.189903,0.188951,0.000000,0.009791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.232478,0.204135,0.25793,0.266847,1.000000,0.218820,0.379777,0.335888,0.307969,0.111070,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193579,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193581,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193585,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [32]:
userId_grouped = train_df.groupby('userId')

# empty yet
item_prediction_result_df = pd.DataFrame(index = list(userId_grouped.indices.keys()), columns=item_sparse_matrix.index)
item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,190221,191005,193565,193573,193579,193581,193583,193585,193609
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# predicts!

b = True
for userId, group in userId_grouped:
    if b:
        print(userId)
        print(group)
        break

1
     userId  movieId  rating  timestamp
129       1     2093     3.0  964981710
102       1     1587     5.0  964982346
163       1     2528     3.0  964982328
193       1     2985     4.0  964983034
104       1     1620     4.0  964983056
..      ...      ...     ...        ...
226       1     3740     4.0  964982417
172       1     2640     4.0  964982377
36        1      608     5.0  964982931
124       1     2048     5.0  964982791
61        1     1080     5.0  964981327

[178 rows x 4 columns]


$$ r_{j} = (\sum_{i \in R} S_{ij}*r_{i}) / \sum_{i \in R} S_{ij}   $$

In [34]:
for userId, group in tqdm(userId_grouped):

    # (n, total_movies)
    user_sim = item_cossim_df.loc[group['movieId']]
    # items rated by each user

    # (n, 1) ratings,
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis = 0)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    item_prediction_result_df.loc[userId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/610 [00:00<?, ?it/s]

In [35]:
item_prediction_result_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190219,190221,191005,193565,193573,193579,193581,193583,193585,193609
1,4.245454,4.199602,4.200696,3.669287,4.042485,4.230487,4.017304,4.054092,3.750791,4.18531,...,0.833139,0.833139,0.395982,0.395982,0.395982,0.395982,0.395982,0.395982,0.395982,1.953733
2,3.173757,3.136163,2.429528,0.307396,2.420593,2.96751,1.911314,2.371362,0.992861,2.928666,...,0.5479,0.5479,1.796507,1.796507,1.796507,1.796507,1.796507,1.796507,1.796507,2.307566
3,1.195142,1.056003,1.198985,0.260133,0.74083,1.417197,0.727583,0.721161,0.759593,1.261959,...,0.040426,0.040426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.406241,3.420726,3.346141,3.162727,3.304792,3.380703,3.229303,3.207459,2.843614,3.317835,...,0.799283,0.799283,0.352484,0.352484,0.352484,0.352484,0.352484,0.352484,0.352484,1.541974
5,3.355617,3.250043,3.086008,2.944127,3.140815,3.259643,3.114235,3.069289,2.301146,3.210939,...,0.710212,0.710212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.584646


User-based

In [36]:
user_sparse_matrix = sparse_matrix.fillna(0).transpose()

In [37]:
user_cossim_df = cossim_matrix(user_sparse_matrix, user_sparse_matrix)
user_cossim_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,1.000000,0.036790,0.051775,0.179190,0.115050,0.083430,0.110884,0.099326,0.048800,0.019322,...,0.052276,0.133784,0.199593,0.075287,0.085019,0.137055,0.147043,0.225377,0.078860,0.137214
2,0.036790,1.000000,0.000000,0.000000,0.021816,0.014019,0.034938,0.034853,0.000000,0.085986,...,0.165323,0.022401,0.008452,0.000000,0.000000,0.015677,0.017283,0.018994,0.034212,0.110347
3,0.051775,0.000000,1.000000,0.002967,0.000000,0.001503,0.000000,0.006228,0.000000,0.000000,...,0.003538,0.004003,0.017276,0.000000,0.013905,0.011951,0.006795,0.014087,0.000000,0.023640
4,0.179190,0.000000,0.002967,1.000000,0.092617,0.075661,0.101372,0.058892,0.000000,0.009689,...,0.063166,0.094913,0.263599,0.040553,0.049994,0.141412,0.076281,0.120714,0.034513,0.085740
5,0.115050,0.021816,0.000000,0.092617,1.000000,0.240007,0.087670,0.309893,0.000000,0.038192,...,0.055765,0.256088,0.094964,0.177604,0.100779,0.077112,0.091633,0.090473,0.281048,0.056253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.137055,0.015677,0.011951,0.141412,0.077112,0.084421,0.151863,0.074110,0.047575,0.069839,...,0.150224,0.076013,0.241689,0.055948,0.122781,1.000000,0.110157,0.200932,0.062532,0.164599
607,0.147043,0.017283,0.006795,0.076281,0.091633,0.104518,0.154138,0.147306,0.017302,0.000000,...,0.068868,0.139372,0.137295,0.093657,0.093134,0.110157,1.000000,0.212038,0.089812,0.116161
608,0.225377,0.018994,0.014087,0.120714,0.090473,0.141251,0.244986,0.145796,0.070671,0.062861,...,0.114882,0.148964,0.201690,0.118871,0.148315,0.200932,0.212038,1.000000,0.111037,0.261275
609,0.078860,0.034212,0.000000,0.034513,0.281048,0.183941,0.091876,0.330523,0.000000,0.025668,...,0.041154,0.275007,0.054045,0.150139,0.098074,0.062532,0.089812,0.111037,1.000000,0.057570


In [38]:
movieId_grouped = train_df.groupby('movieId')
# empty (movieId, userId)
user_prediction_result_df = pd.DataFrame(index = list(movieId_grouped.indices.keys()),
                                         columns = user_sparse_matrix.index)
user_prediction_result_df.shape

(8964, 610)

In [39]:
for movieId, group in tqdm(movieId_grouped) :
    user_sim = user_cossim_df.loc[group['userId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum+1)
    user_prediction_result_df.loc[movieId] = pred_ratings   

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


  0%|          | 0/8964 [00:00<?, ?it/s]

In [40]:
print(item_prediction_result_df.head())
user_prediction_result_df = user_prediction_result_df.transpose()
print(user_prediction_result_df.head())

movieId    1         2         3         4         5         6         7       \
1        4.245454  4.199602  4.200696  3.669287  4.042485  4.230487  4.017304   
2        3.173757  3.136163  2.429528  0.307396  2.420593   2.96751  1.911314   
3        1.195142  1.056003  1.198985  0.260133   0.74083  1.417197  0.727583   
4        3.406241  3.420726  3.346141  3.162727  3.304792  3.380703  3.229303   
5        3.355617  3.250043  3.086008  2.944127  3.140815  3.259643  3.114235   

movieId    8         9         10      ...    190219    190221    191005  \
1        4.054092  3.750791   4.18531  ...  0.833139  0.833139  0.395982   
2        2.371362  0.992861  2.928666  ...    0.5479    0.5479  1.796507   
3        0.721161  0.759593  1.261959  ...  0.040426  0.040426       0.0   
4        3.207459  2.843614  3.317835  ...  0.799283  0.799283  0.352484   
5        3.069289  2.301146  3.210939  ...  0.710212  0.710212       0.0   

movieId    193565    193573    193579    193581    19358

## Evaluation

In [41]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
66538,428,3300,3.0,1111489290
12794,82,1372,3.5,1084463754
17145,109,356,5.0,841106550
95040,599,145745,2.5,1519150950
66059,425,2600,4.5,1114173199


In [45]:
def evaluate(test_df, prediction_result_df):

    # test case
    groups_with_movieId = test_df.groupby(by = 'movieId')
    groups_with_userId = test_df.groupby(by = 'userId')
    intersection_movieId = sorted(list(set(
        list(prediction_result_df.columns)).intersection(
            set(list(groups_with_movieId.indices.keys()))
        )))
    intersection_userId = sorted(list(set(
        list(prediction_result_df.index)).intersection(
            set(list(groups_with_userId.indices.keys()))
        )))
    compressed_prediction_df = prediction_result_df.loc[intersection_userId][intersection_movieId]

    grouped = test_df.groupby(by = 'userId')
    result_df = pd.DataFrame(columns = ['rmse'])

    for userId, group in tqdm(grouped):
        pred_ratings = compressed_prediction_df.loc[userId][compressed_prediction_df.loc[userId].index.intersection(list(group['movieId'].values))]
        pred_ratings = pred_ratings.to_frame(name='rating').reset_index().rename(columns={'index':'movieId','rating':'pred_rating'})
        actual_ratings = group[['rating', 'movieId']].rename(columns = {'rating':'actual_rating'})

        final_df = pd.merge(actual_ratings, pred_ratings, how='inner', on=['movieId'])
        fianl_df = final_df.round(4)

    
    return final_df


In [46]:
evaluate(test_df, user_prediction_result_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,actual_rating,movieId,pred_rating
0,3.5,6942,3.344762
1,2.0,3564,1.127326
2,5.0,5673,3.129785
3,4.0,4873,3.01009
4,5.0,3681,3.449885
...,...,...,...
218,4.0,51662,3.394938
219,4.0,69844,3.393012
220,2.0,5128,1.750478
221,4.0,157296,2.118634


In [48]:
evaluate(test_df, item_prediction_result_df)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,actual_rating,movieId,pred_rating
0,3.5,6942,3.787147
1,2.0,3564,3.585848
2,5.0,5673,3.920649
3,4.0,4873,3.833857
4,5.0,3681,3.819174
...,...,...,...
218,4.0,51662,3.745485
219,4.0,69844,3.709245
220,2.0,5128,3.720327
221,4.0,157296,3.658503


In [49]:
result_df = evaluate(test_df, user_prediction_result_df)
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/610 [00:00<?, ?it/s]

     actual_rating  movieId pred_rating
0              3.5     6942    3.344762
1              2.0     3564    1.127326
2              5.0     5673    3.129785
3              4.0     4873     3.01009
4              5.0     3681    3.449885
..             ...      ...         ...
218            4.0    51662    3.394938
219            4.0    69844    3.393012
220            2.0     5128    1.750478
221            4.0   157296    2.118634
222            3.5    36401     1.89017

[223 rows x 3 columns]
RMSE: 1.628132172022947


In [50]:
result_df = evaluate(test_df, item_prediction_result_df)
print(result_df)
print(f"RMSE: {sqrt(mean_squared_error(result_df['actual_rating'].values, result_df['pred_rating'].values))}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/610 [00:00<?, ?it/s]

     actual_rating  movieId pred_rating
0              3.5     6942    3.787147
1              2.0     3564    3.585848
2              5.0     5673    3.920649
3              4.0     4873    3.833857
4              5.0     3681    3.819174
..             ...      ...         ...
218            4.0    51662    3.745485
219            4.0    69844    3.709245
220            2.0     5128    3.720327
221            4.0   157296    3.658503
222            3.5    36401    3.645819

[223 rows x 3 columns]
RMSE: 0.8136639628544339


In [None]:
m