In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD, dump
from surprise.model_selection import GridSearchCV
import pickle

In [2]:
# Load the movie catalogue from movies.csv and display it; the rendered
# preview shows the movieId, title and genres columns.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# Read the ratings and discard the timestamp column, leaving the
# userId, movieId and rating columns used by the rest of the notebook.
df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# Hold out 20% of the ratings as the final test set (S_test) and keep 80%
# for training (S_train). No separate dev set is carved out: hyperparameters
# are tuned with cross-validation on the training portion in the
# GridSearchCV cell.
# A fixed random_state makes the split reproducible across kernel restarts.
trainset, testset = train_test_split(df, train_size=0.8, random_state=42)

# The two parts must partition the ratings exactly.
assert len(trainset) + len(testset) == len(df)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (80668, 3)
Test set size:  (20168, 3)


In [5]:
# Ratings are declared to lie on a 0.5 - 5.0 scale.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap each split as a Surprise Dataset. load_from_df reads the three
# DataFrame columns positionally as (user id, item id, rating).
train_data, test_data = (
    Dataset.load_from_df(split, reader) for split in (trainset, testset)
)

In [6]:
# Build a Surprise Trainset holding every held-out rating. It is derived from
# the `testset` DataFrame rather than from the current value of `test_data`,
# so this cell can be re-run safely: a Trainset has no build_full_trainset(),
# which made the self-referential form fail on a second execution.
test_data = Dataset.load_from_df(testset, reader).build_full_trainset()

In [7]:
# Convert the held-out ratings into the list of (user, item, rating) tuples
# that `model.test()` consumes. The chain starts from the `testset` DataFrame
# instead of the current value of `test_data`, so re-running this cell always
# yields the same list instead of failing once `test_data` is already a list.
test_data = (
    Dataset.load_from_df(testset, reader)
    .build_full_trainset()
    .build_testset()
)

In [8]:
# final_algo = SVD()

In [9]:
# Hyperparameter grid for SVD: 3 * 2 * 2 * 3 = 36 combinations.
param_grid = {
    "n_epochs": [5, 10, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.002, 0.005],
    "n_factors": [50, 100, 200],
}

# 10-fold cross-validated search scored on RMSE, using 5 parallel workers.
# joblib_verbose=15 prints a progress line per finished task.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse"],
    cv=10,
    n_jobs=5,
    joblib_verbose=15,
)
gs.fit(train_data)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=5)]: Done   2 tasks      | elapsed:    1.5s
[Parallel(n_jobs=5)]: Done   3 tasks      | elapsed:    1.6s
[Parallel(n_jobs=5)]: Done   4 tasks      | elapsed:    1.7s
[Parallel(n_jobs=5)]: Done   5 tasks      | elapsed:    1.8s
[Parallel(n_jobs=5)]: Done   6 tasks      | elapsed:    2.1s
[Parallel(n_jobs=5)]: Done   7 tasks      | elapsed:    2.2s
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    2.3s
[Parallel(n_jobs=5)]: Done   9 tasks      | elapsed:    2.4s
[Parallel(n_jobs=5)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=5)]: Done  11 tasks      | elapsed:    3.0s
[Parallel(n_jobs=5)]: Done  12 tasks      | elapsed:    3.1s
[Parallel(n_jobs=5)]: Done  13 tasks      | elapsed:    3.2s
[Parallel(n_jobs=5)]: Done  14 tasks      | elapsed:    3.3s
[Parallel(n_jobs=5)]: Done  15 tasks      | elapsed:    3.3s
[Parallel(

In [10]:
# Best cross-validated RMSE found by the grid search.
print(gs.best_score["rmse"])

0.8819076542167721


In [11]:
# Parameter combination that achieved the best RMSE in the grid search.
print(gs.best_params["rmse"])

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.005, 'n_factors': 50}


In [12]:
# SVD instance configured with the best grid-search parameters for RMSE.
model = gs.best_estimator["rmse"]

# Build the full training Trainset from the `trainset` DataFrame rather than
# from `train_data` itself, so this cell is idempotent (a Trainset has no
# build_full_trainset(), so the self-referential form failed on a re-run).
# NOTE: `train_data` is a Trainset from here on; re-run the Dataset cell
# before repeating the grid search, which needs a Dataset.
train_data = Dataset.load_from_df(trainset, reader).build_full_trainset()
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ed248446a0>

In [13]:
# Finally, predict on the held-out test ratings (S_test) and compute RMSE.
# accuracy.rmse prints the score and also returns it, which is why the value
# appears twice in the cell output.
pred_test = model.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8776


0.8776393391522344

In [15]:
# Tabulate the test predictions and append the absolute error between the
# true rating (rui) and the estimate (est).
df_predictions = (
    pd.DataFrame(pred_test, columns=['uid', 'iid', 'rui', 'est', 'details'])
    .assign(err=lambda frame: (frame['est'] - frame['rui']).abs())
)

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,305,4223,5.0,3.773142,{'was_impossible': False},1.226858
1,305,60514,3.5,3.70768,{'was_impossible': False},0.20768
2,305,164179,4.0,4.146625,{'was_impossible': False},0.146625
3,305,96588,4.0,3.615363,{'was_impossible': False},0.384637
4,305,104913,3.5,4.319744,{'was_impossible': False},0.819744


In [16]:
# Ten predictions with the smallest absolute error.
best_predictions = df_predictions.sort_values('err').head(10)

In [17]:
# Display the ten lowest-error predictions.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
5752,380,1201,5.0,5.0,{'was_impossible': False},0.0
13587,93,260,5.0,5.0,{'was_impossible': False},0.0
16600,543,2959,5.0,5.0,{'was_impossible': False},0.0
12758,122,318,5.0,5.0,{'was_impossible': False},0.0
12711,122,1210,5.0,5.0,{'was_impossible': False},0.0
13013,573,2028,5.0,5.0,{'was_impossible': False},0.0
5916,380,1196,5.0,5.0,{'was_impossible': False},0.0
12741,122,2502,5.0,5.0,{'was_impossible': False},0.0
1264,57,260,5.0,5.0,{'was_impossible': False},0.0
15758,1,1136,5.0,5.0,{'was_impossible': False},0.0


In [18]:
# Ten predictions with the largest absolute error (ascending, worst last).
worst_predictions = df_predictions.sort_values('err').tail(10)

In [19]:
# Display the ten highest-error predictions.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
17857,598,593,0.5,4.170016,{'was_impossible': False},3.670016
9664,594,7883,0.5,4.235238,{'was_impossible': False},3.735238
9658,594,4902,0.5,4.242631,{'was_impossible': False},3.742631
19990,161,4002,0.5,4.261774,{'was_impossible': False},3.761774
12987,573,44199,0.5,4.294795,{'was_impossible': False},3.794795
7274,393,5902,0.5,4.301672,{'was_impossible': False},3.801672
15971,256,7099,0.5,4.436323,{'was_impossible': False},3.936323
7249,393,778,0.5,4.457107,{'was_impossible': False},3.957107
7254,393,541,0.5,4.696924,{'was_impossible': False},4.196924
7275,393,1732,0.5,4.741386,{'was_impossible': False},4.241386


In [20]:
# Refit the selected model on every available rating (train + test) so the
# exported model has seen the full dataset.
data = Dataset.load_from_df(df, reader).build_full_trainset()

model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ed248446a0>

In [26]:
# Shape of the learned item-factor matrix: (number of items, n_factors).
model.qi.shape

(9724, 50)

In [23]:
# View the item-factor matrix as a DataFrame (one row per item, one column
# per latent factor) for inspection.
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.073986,0.053071,-0.033575,0.135694,-0.322271,-0.085245,0.189804,-0.011747,0.065134,0.048754,...,0.143637,0.151285,-0.289457,0.004056,0.166425,-0.114409,-0.026170,-0.231058,-0.424530,-0.211402
1,0.051451,0.149306,-0.110733,0.027474,-0.155443,-0.155321,0.095495,-0.012975,0.105964,-0.207291,...,-0.022500,0.032600,-0.104472,0.105719,0.054994,-0.008755,-0.019868,0.105745,0.041135,-0.038355
2,-0.146880,-0.074067,0.014557,0.182635,-0.097076,-0.050668,0.148557,-0.005060,-0.020161,-0.083801,...,0.041150,0.093634,0.020473,0.019716,-0.117791,-0.074351,-0.205901,0.222155,0.223995,-0.021224
3,-0.252130,-0.134436,-0.086383,0.027432,-0.436641,0.094401,0.019020,0.036893,0.013299,0.371474,...,-0.198159,-0.056673,-0.242901,-0.310904,0.260372,0.201218,0.256766,-0.161984,0.249172,-0.075514
4,0.117904,0.105083,-0.104873,0.211366,-0.288690,-0.033557,0.119932,-0.000992,0.085671,-0.052762,...,0.000313,-0.064629,-0.142586,-0.061415,0.014794,0.258193,0.022157,-0.056992,0.186399,0.008951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,-0.066135,-0.167462,0.130243,0.127663,0.170505,-0.062797,0.103536,-0.044779,0.055624,-0.098218,...,-0.007330,0.060412,-0.142038,0.007922,-0.010454,-0.125641,0.029036,0.043230,0.010594,-0.013601
9720,-0.117756,0.172477,0.049312,0.040995,0.011813,0.022041,-0.007644,0.072139,0.005453,-0.095326,...,-0.142094,0.094443,-0.005806,-0.081499,0.072178,-0.018264,-0.046845,0.013248,-0.093921,-0.063536
9721,0.012350,0.093091,0.016754,-0.104622,-0.209150,0.095527,0.111181,0.060083,0.095867,-0.048388,...,0.049958,0.067006,-0.081220,-0.149193,-0.318880,-0.106957,-0.052134,-0.086367,-0.004094,0.164485
9722,-0.090215,0.110481,-0.054435,0.080715,0.065266,-0.053427,-0.014352,0.197556,0.028753,-0.032861,...,-0.200777,-0.052249,-0.190503,0.156179,0.081906,-0.050029,-0.023284,-0.068815,0.104371,0.050225


In [22]:
# Persist the fitted model to disk with pickle. The context manager
# guarantees the file handle is flushed and closed even if pickling fails.
# NOTE: only load this file with pickle.load in a trusted environment.
with open("svd_model.sav", "wb") as model_file:
    pickle.dump(model, model_file)