In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD, dump
from surprise.model_selection import GridSearchCV
import pickle

In [2]:
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
df = pd.read_csv("ratings.csv").drop(["timestamp"], axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# using S_train = 80%, S_dev = 10%, S_test = 10%
trainset, testset = train_test_split(df, train_size=0.8)

print("Training set size: ", trainset.shape)


print("Test set size: ", testset.shape)

Training set size:  (80668, 3)
Test set size:  (20168, 3)


In [5]:
reader = Reader(rating_scale = (0.5, 5.0))

train_data = Dataset.load_from_df(trainset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [6]:
test_data = test_data.build_full_trainset()

In [7]:
test_data = test_data.build_testset()

In [8]:
# final_algo = SVD()

In [9]:
param_grid = {"n_epochs": [5, 10, 20], "lr_all": [0.002, 0.005], "reg_all": [0.002, 0.005], "n_factors":[50, 100, 200]}

gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=10, n_jobs=5, joblib_verbose=5)
gs.fit(train_data)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   8 tasks      | elapsed:    2.1s
[Parallel(n_jobs=5)]: Done  62 tasks      | elapsed:   13.3s
[Parallel(n_jobs=5)]: Done 152 tasks      | elapsed:   36.1s
[Parallel(n_jobs=5)]: Done 278 tasks      | elapsed:  1.5min
[Parallel(n_jobs=5)]: Done 360 out of 360 | elapsed:  2.5min finished


In [10]:
print(gs.best_score["rmse"])

0.8836209141546949


In [11]:
print(gs.best_params["rmse"])

{'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.005, 'n_factors': 50}


In [12]:
model = gs.best_estimator["rmse"]

train_data = train_data.build_full_trainset()
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x222b18c2790>

In [13]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = model.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8688


0.8688040903106153

In [14]:
df_predictions = pd.DataFrame(pred_test, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_predictions['err'] = abs(df_predictions.est - df_predictions.rui)

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,156,2300,4.0,4.113033,{'was_impossible': False},0.113033
1,156,7137,4.0,3.647628,{'was_impossible': False},0.352372
2,156,969,4.0,4.095559,{'was_impossible': False},0.095559
3,156,1267,4.0,4.271609,{'was_impossible': False},0.271609
4,156,5013,4.5,3.960275,{'was_impossible': False},0.539725


In [15]:
best_predictions = df_predictions.sort_values(by='err')[:10]

In [16]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
18952,498,318,5.0,5.0,{'was_impossible': False},0.0
6122,594,110,5.0,5.0,{'was_impossible': False},0.0
8245,475,1196,5.0,5.0,{'was_impossible': False},0.0
17045,43,1084,5.0,5.0,{'was_impossible': False},0.0
3749,414,720,5.0,5.0,{'was_impossible': False},0.0
14925,348,1208,5.0,5.0,{'was_impossible': False},0.0
18924,276,364,5.0,5.0,{'was_impossible': False},0.0
17062,43,261,5.0,5.0,{'was_impossible': False},0.0
13094,1,2858,5.0,5.0,{'was_impossible': False},0.0
3510,414,1196,5.0,5.0,{'was_impossible': False},0.0


In [17]:
worst_predictions = df_predictions.sort_values(by='err')[-10:]

In [18]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
6121,594,7564,0.5,3.943428,{'was_impossible': False},3.443428
6560,20,4821,0.5,3.95757,{'was_impossible': False},3.45757
6096,594,7116,0.5,4.10319,{'was_impossible': False},3.60319
19056,413,1246,1.0,4.612212,{'was_impossible': False},3.612212
16039,159,7361,0.5,4.115272,{'was_impossible': False},3.615272
19923,598,593,0.5,4.271538,{'was_impossible': False},3.771538
6076,594,7650,0.5,4.288358,{'was_impossible': False},3.788358
14003,419,337,0.5,4.295281,{'was_impossible': False},3.795281
1055,543,35836,0.5,4.377818,{'was_impossible': False},3.877818
14006,419,1097,0.5,4.44663,{'was_impossible': False},3.94663


In [19]:
# Now fitting to full data
data = Dataset.load_from_df(df, reader)
data = data.build_full_trainset()

model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x222b18c2790>

In [20]:
model.qi.shape

(9724, 50)

In [21]:
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-0.107837,-0.174756,-0.006462,-0.030268,0.045334,0.121168,-0.087292,-0.004558,-0.070474,0.292572,...,-0.020555,0.032787,-0.079020,-0.030973,-0.244811,-0.087144,-0.130291,-0.284058,0.153942,0.066111
1,-0.007329,-0.252623,0.140459,-0.040528,0.164743,0.125501,0.092406,0.125978,-0.038980,0.014750,...,-0.082249,-0.202578,0.227114,-0.153297,-0.020157,-0.039514,-0.207180,-0.353467,0.151877,-0.038713
2,-0.025187,-0.090591,-0.152520,-0.224179,-0.262389,0.062674,0.110300,-0.115775,0.169517,0.137910,...,-0.154199,0.000063,0.206650,-0.058679,-0.103925,0.035518,0.145583,-0.003885,0.083147,0.056514
3,-0.002940,-0.347508,-0.257748,0.032803,0.150315,-0.011614,-0.069179,0.003594,0.188974,0.056565,...,0.149056,0.032749,-0.014926,-0.148811,-0.135054,0.151924,-0.168292,0.132887,0.202814,-0.093731
4,0.172193,-0.149754,-0.026061,-0.162419,0.138898,0.145537,0.202383,0.146722,0.334327,0.035254,...,0.011960,0.063343,-0.082817,0.053301,0.041785,-0.225364,-0.160830,0.016953,0.130911,0.272538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9719,0.054435,0.071506,-0.022684,-0.047734,0.037658,-0.056754,0.086360,0.103402,-0.044078,-0.049704,...,-0.093905,-0.087189,-0.140982,0.339741,0.024611,0.118483,-0.105113,0.017020,-0.042870,-0.024391
9720,0.064039,0.144631,-0.005499,-0.081438,-0.094249,-0.122052,-0.045828,-0.175324,-0.159040,0.119078,...,0.023182,-0.000399,0.157008,0.007001,0.034145,-0.035484,-0.099244,-0.026403,-0.159998,-0.116206
9721,-0.052785,0.021197,-0.069577,-0.199811,-0.129581,0.160871,0.122585,0.021197,0.038211,0.097581,...,0.158128,0.148426,0.293532,-0.153548,0.082991,0.041397,0.037278,0.129741,0.085751,0.103012
9722,-0.115533,-0.048200,0.166024,0.015566,-0.023913,-0.045323,0.099596,-0.058495,0.009821,0.047157,...,0.069082,0.065325,-0.111078,0.011698,0.001266,0.143425,0.024024,-0.007432,-0.153391,-0.152762


In [22]:
# Dumping to file
pickle.dump(model, open("svd_model.sav", "wb"))