In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD, dump
from surprise.model_selection import GridSearchCV
import pickle

In [28]:
# Load the movie catalogue from the working directory and display it.
# NOTE(review): `movies` is not referenced again in the visible cells.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [29]:
# Load the ratings and discard the timestamp column, leaving the three
# columns (userId, movieId, rating) used by the rest of the notebook.
df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])

df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [30]:
# Two-way split: S_train = 80%, S_test = 20%.
# No separate dev set is carved out here; the 10-fold cross-validation in the
# grid search below is run on S_train only.
# A fixed random_state makes the split (and everything downstream of it)
# reproducible under "Restart Kernel -> Run All".
trainset, testset = train_test_split(df, train_size=0.8, random_state=42)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (80668, 3)
Test set size:  (20168, 3)


In [31]:
# Ratings are declared to lie on a 0.5 - 5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap both pandas splits in Surprise Dataset objects using the same reader.
train_data, test_data = (
    Dataset.load_from_df(split, reader) for split in (trainset, testset)
)

In [32]:
# Intermediate step: turn the held-out Dataset into a Trainset holding all of
# its ratings; the next cell converts that into a testset.
# NOTE(review): `test_data` is rebound to a different kind of object here, so
# this cell is presumably not safe to run twice without re-running the cell
# that created `test_data`.
test_data = test_data.build_full_trainset()

In [33]:
# Convert the Trainset built in the previous cell into the testset structure
# that `model.test(test_data)` is called with further down.
# NOTE(review): rebinds `test_data` again; re-run the two preceding cells
# before re-running this one.
test_data = test_data.build_testset()

In [34]:
# final_algo = SVD()

In [35]:
# Hyper-parameter grid for SVD: 3 * 2 * 2 * 3 = 36 combinations.
param_grid = {
    "n_epochs": [5, 10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.002, 0.005],
    "n_factors": [50, 100, 200],
}

# Exhaustive search scored by RMSE with 10-fold cross-validation on the
# training split, spread over 5 parallel workers (36 * 10 = 360 fits).
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=10,
    n_jobs=5,
    joblib_verbose=5,
)
gs.fit(train_data)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    2.1s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   13.2s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   36.0s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 360 out of 360 | elapsed:  2.5min finished


In [36]:
# Best cross-validated RMSE found by the grid search.
print(gs.best_score["rmse"])

0.8809920921873241


In [37]:
# Hyper-parameter combination that achieved the best RMSE.
print(gs.best_params["rmse"])

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.005, 'n_factors': 50}


In [38]:
# Take the SVD instance configured with the best RMSE hyper-parameters.
model = gs.best_estimator["rmse"]

# Fit it on every rating in the training split. The Trainset gets its own
# name instead of overwriting `train_data`: rebinding the Dataset to a
# Trainset made this cell fail on a second run, because the rebound object no
# longer offers `build_full_trainset()`.
full_trainset = train_data.build_full_trainset()
model.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ed7f2070d0>

In [39]:
# Final evaluation: predict every held-out rating in test_data (S_test).
pred_test = model.test(test_data)

# accuracy.rmse both prints the score and returns it, which is why the value
# shows up twice in the cell output.
accuracy.rmse(pred_test, verbose=True)

RMSE: 0.8793


0.8792624748288435

In [40]:
# Tabulate the predictions: one row per held-out rating, with the true rating
# (rui), the model's estimate (est) and the absolute error between them.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df_predictions = pd.DataFrame(pred_test, columns=prediction_columns)
df_predictions['err'] = (df_predictions['est'] - df_predictions['rui']).abs()

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,495,140247,5.0,3.555917,{'was_impossible': False},1.444083
1,495,736,3.5,3.743618,{'was_impossible': False},0.243618
2,495,55765,4.5,4.342053,{'was_impossible': False},0.157947
3,495,102123,2.5,3.942421,{'was_impossible': False},1.442421
4,495,42723,2.0,3.546237,{'was_impossible': False},1.546237


In [41]:
# Ten predictions with the smallest absolute error.
best_predictions = df_predictions.sort_values(by='err').head(10)

In [42]:
# Display the ten most accurate predictions.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
19707,251,318,5.0,5.0,{'was_impossible': False},0.0
20111,371,7361,5.0,5.0,{'was_impossible': False},0.0
1971,601,2959,5.0,5.0,{'was_impossible': False},0.0
16882,337,110,5.0,5.0,{'was_impossible': False},0.0
6549,1,1213,5.0,5.0,{'was_impossible': False},0.0
4586,610,1208,5.0,5.0,{'was_impossible': False},0.0
14879,43,1,5.0,5.0,{'was_impossible': False},0.0
10157,51,2762,5.0,5.0,{'was_impossible': False},0.0
14723,523,318,5.0,5.0,{'was_impossible': False},0.0
14474,380,296,5.0,5.0,{'was_impossible': False},0.0


In [43]:
# Ten predictions with the largest absolute error (ascending sort, so the
# single worst prediction is the last row).
worst_predictions = df_predictions.sort_values(by='err').tail(10)

In [44]:
# Display the ten least accurate predictions.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
7437,105,4027,0.5,4.316206,{'was_impossible': False},3.816206
19407,258,87232,0.5,4.31778,{'was_impossible': False},3.81778
2804,594,4902,0.5,4.331845,{'was_impossible': False},3.831845
19406,258,122886,0.5,4.346813,{'was_impossible': False},3.846813
3859,239,48394,0.5,4.35996,{'was_impossible': False},3.85996
7873,580,1250,0.5,4.37647,{'was_impossible': False},3.87647
14946,344,3949,0.5,4.44118,{'was_impossible': False},3.94118
18598,393,1732,0.5,4.466596,{'was_impossible': False},3.966596
11447,573,44199,0.5,4.581613,{'was_impossible': False},4.081613
16758,210,296,0.5,4.653013,{'was_impossible': False},4.153013


In [45]:
# Refit the tuned model on the complete ratings table (training and test
# ratings together) before it is saved.
data = Dataset.load_from_df(df, reader).build_full_trainset()

model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ed7f2070d0>

In [46]:
# Shape of the model's `qi` matrix; the second dimension (50) matches the
# n_factors value selected by the grid search.
model.qi.shape

(9724, 50)

In [47]:
# Show the `qi` matrix as a DataFrame for inspection (the result is only
# displayed, not stored).
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.046375,0.035137,0.275108,0.114336,-0.029113,-0.143468,-0.394586,0.232917,-0.261401,0.197431,...,0.132923,-0.052545,-0.109873,-0.002176,-0.069313,-0.092227,-0.057306,0.060087,-0.333394,-0.057772
1,-0.030329,-0.208578,-0.005814,-0.036075,-0.117616,-0.099115,0.178224,-0.215238,0.024827,-0.150896,...,0.045552,0.086894,-0.116602,0.020857,0.059477,-0.066864,0.192317,0.067839,-0.313468,0.216341
2,-0.119633,0.101418,0.221838,0.065425,0.152422,0.046307,-0.065213,0.039372,0.079717,0.062732,...,-0.070083,-0.360324,-0.214929,0.186548,0.125016,0.073838,0.038720,-0.291139,0.098449,0.220331
3,-0.079395,-0.241796,0.262668,-0.011262,0.018619,0.090246,0.216339,0.155999,-0.028819,-0.126137,...,0.061846,0.191087,0.253908,-0.248257,0.013707,0.351706,0.246504,0.187586,-0.105514,0.183066
4,0.146482,-0.171005,0.047631,0.085602,-0.129138,-0.044266,-0.068288,0.124471,0.020131,0.356696,...,0.009984,-0.181597,0.039148,-0.144030,0.066643,0.216972,-0.065295,-0.010065,-0.013412,0.008837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.057013,0.067799,-0.208753,0.023802,0.046297,-0.041874,0.178614,-0.095875,-0.054358,-0.030225,...,-0.119815,-0.141186,0.094243,0.007744,-0.102161,0.161268,0.064932,-0.071643,-0.047533,-0.128244
9720,0.052207,0.159905,-0.042268,0.005411,-0.018554,-0.038413,0.304435,-0.118293,0.052022,-0.049606,...,-0.108141,-0.055252,0.091077,0.100038,-0.097807,0.048632,0.127197,-0.029449,-0.157708,-0.018412
9721,0.149365,-0.185050,-0.041102,-0.104149,0.125395,0.126475,0.034028,0.023661,0.132224,0.000397,...,-0.009309,-0.114415,0.192321,-0.016075,0.083659,0.029819,0.185050,0.008115,0.014870,-0.169365
9722,-0.022308,-0.049131,0.003757,-0.056654,-0.042330,0.097027,0.006835,0.017088,-0.174482,0.101632,...,-0.205961,0.008466,0.080811,0.036919,-0.017659,-0.066031,-0.023332,-0.052917,0.155549,0.193052


In [48]:
# Dumping to file
# A context manager guarantees the file is flushed and closed even if pickling
# raises; the bare open(...) call left the handle open until garbage
# collection.
# NOTE: only load this file back with pickle.load if it comes from a trusted
# source, since unpickling can execute arbitrary code.
with open("svd_model.sav", "wb") as model_file:
    pickle.dump(model, model_file)