In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD
from surprise.model_selection import GridSearchCV
import pickle

In [2]:
# Load the ratings table; the timestamp column is discarded right away
# because nothing below uses it.
df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])
df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [3]:
# Keep only the (userId, movieId) pairs; the rating value is not needed for counting.
df1 = df.drop(columns=["rating"])


In [4]:
# Count how many ratings each movie received.
# Derived directly from `df` (not from the previous value of `df1`) so that
# re-running this cell does not re-aggregate an already aggregated frame.
# After dropping `rating`, the only remaining column is `userId`, which
# therefore holds the per-movie rating count. groupby(...).count() already
# returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
df1 = df.drop(columns=["rating"]).groupby("movieId").count()

In [5]:
# Display the per-movie count table (index: movieId; the userId column holds the count).
df1

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
1,57309
2,24228
3,11804
4,2523
5,11714
...,...
209157,1
209159,1
209163,1
209169,1


In [6]:
# Order movies from most-rated to least-rated.
df1 = df1.sort_values(by="userId", ascending=False)

In [7]:
# Display the count table after sorting by descending number of ratings.
df1

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
356,81491
318,81482
296,79672
593,74127
2571,72674
...,...
167308,1
167326,1
167336,1
167338,1


In [8]:
# Movies with fewer than 100 ratings are treated as "obscure".
df2 = df1[df1["userId"].lt(100)]

In [9]:
# Display the movies that have fewer than 100 ratings.
df2

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
51044,99
2487,99
58783,99
32383,99
197,99
...,...
167308,1
167326,1
167336,1
167338,1


In [10]:
# The index of df2 is movieId, so this is the list of obscure movie IDs.
obscure = df2.index.tolist()

In [11]:
# Echo the list of obscure movie IDs.
# NOTE(review): this renders the entire list in the cell output; consider
# showing len(obscure) and obscure[:10] instead.
obscure

[51044,
 2487,
 58783,
 32383,
 197,
 60295,
 59129,
 6404,
 26523,
 116219,
 8903,
 7391,
 7770,
 8040,
 32076,
 31347,
 91355,
 82765,
 7207,
 2855,
 168806,
 7523,
 26271,
 6562,
 106144,
 8259,
 97757,
 1174,
 4271,
 34767,
 81910,
 106839,
 68442,
 4907,
 794,
 38198,
 171207,
 136477,
 44189,
 32349,
 99087,
 32261,
 165087,
 962,
 26915,
 286,
 77233,
 117893,
 790,
 53835,
 25762,
 8631,
 5862,
 26379,
 146662,
 54193,
 641,
 142058,
 179811,
 86922,
 1898,
 109472,
 7948,
 5406,
 30791,
 66547,
 2281,
 4335,
 27744,
 26325,
 3848,
 169756,
 3313,
 4877,
 165119,
 55566,
 65899,
 55110,
 4108,
 3472,
 89337,
 25962,
 62788,
 4093,
 5985,
 130052,
 8194,
 59721,
 167248,
 93212,
 5496,
 99537,
 1850,
 107130,
 131080,
 7306,
 6760,
 129657,
 8258,
 33264,
 48543,
 183227,
 98797,
 43987,
 8382,
 125926,
 45679,
 44815,
 53574,
 55156,
 122940,
 96991,
 5880,
 171007,
 101186,
 173161,
 5699,
 7474,
 8986,
 27729,
 7578,
 71926,
 5858,
 26138,
 133219,
 106163,
 7357,
 26792,
 13

In [12]:
# Flag every rating whose movie is in the `obscure` list, collect the row
# labels of those ratings, and remove them from the ratings frame.
is_obscure = df["movieId"].isin(obscure)
index_names = df.index[is_obscure.to_numpy()]
df = df.drop(index=index_names)

In [13]:
# Display the ratings frame after the rows for obscure movies were dropped.
df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [15]:
# Hold out 0.1% of the ratings as the test set (S_test); the remaining 99.9%
# is the training set (S_train). No separate dev set is created here.
# A fixed random_state makes the split reproducible across kernel restarts.
trainset, testset = train_test_split(df, train_size=0.999, random_state=42)

print("Training set size: ", trainset.shape)
print("Test set size: ", testset.shape)

Training set size:  (24418936, 3)
Test set size:  (24444, 3)


In [16]:
# Ratings range from 0.5 to 5.0; the Reader tells Surprise about that scale.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap both DataFrame splits as Surprise datasets using the same reader.
train_data, test_data = (
    Dataset.load_from_df(frame, reader) for frame in (trainset, testset)
)

In [25]:
# Turn the held-out ratings into a Surprise Trainset.
# Rebuilt from `testset` rather than from the previous value of `test_data`,
# so the cell can be re-run in any order without hitting an object that no
# longer has build_full_trainset(). The test split is small, so this is cheap.
test_data = Dataset.load_from_df(testset, reader).build_full_trainset()

In [26]:
# Produce the list of (user, item, rating) tuples that model.test() consumes.
# Built from `testset` in one chain instead of from the current value of
# `test_data`, so re-running this cell does not fail once `test_data` has
# already been converted to a plain list.
test_data = (
    Dataset.load_from_df(testset, reader)
    .build_full_trainset()
    .build_testset()
)

In [19]:
# Configure an SVD model with 200 latent features trained for 30 epochs.
# random_state pins the random initialisation of the factors so that
# repeated fits on the same data give the same model and RMSE.
model = SVD(n_factors=200, n_epochs=30, random_state=42)

In [20]:
# param_grid = {"n_epochs": [5, 10, 20], "lr_all": [0.002, 0.005], "reg_all": [0.002, 0.005], "n_factors":[50, 100, 200]}

# gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=10, n_jobs=5, joblib_verbose=5)
# gs.fit(train_data)

In [21]:
# print(gs.best_score["rmse"])

In [22]:
# print(gs.best_params["rmse"])

In [23]:
# model = gs.best_estimator["rmse"]

# Convert the loaded Dataset into a Trainset exactly once. After the first
# run `train_data` is already a Trainset (which has no build_full_trainset),
# so the guard lets this cell be re-executed without raising AttributeError
# and without reloading the large training frame.
if hasattr(train_data, "build_full_trainset"):
    train_data = train_data.build_full_trainset()
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23687605eb0>

In [27]:
# Score the fitted model on the held-out ratings (S_test) and report the RMSE.
pred_test = model.test(testset=test_data)
accuracy.rmse(predictions=pred_test, verbose=True)

RMSE: 0.7554


0.7553601333132435

In [28]:
# Tabulate the predictions and add the absolute error between the
# estimated rating (est) and the true rating (rui).
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df_predictions = pd.DataFrame(pred_test, columns=prediction_columns)
df_predictions['err'] = (df_predictions['est'] - df_predictions['rui']).abs()

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,3918,6708,3.0,3.355729,{'was_impossible': False},0.355729
1,33885,1792,3.5,3.811788,{'was_impossible': False},0.311788
2,33885,47610,2.5,4.089536,{'was_impossible': False},1.589536
3,32505,1397,5.0,4.857956,{'was_impossible': False},0.142044
4,117032,2987,5.0,4.228689,{'was_impossible': False},0.771311


In [29]:
# The ten predictions with the smallest absolute error.
best_predictions = df_predictions.sort_values(by='err').head(10)

In [30]:
# Display the ten predictions with the smallest absolute error.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
7752,152609,318,5.0,5.0,{'was_impossible': False},0.0
20584,79688,858,5.0,5.0,{'was_impossible': False},0.0
13764,17705,111,5.0,5.0,{'was_impossible': False},0.0
4032,115816,98491,5.0,5.0,{'was_impossible': False},0.0
10978,119450,111,5.0,5.0,{'was_impossible': False},0.0
21582,24165,750,5.0,5.0,{'was_impossible': False},0.0
5290,133298,5618,5.0,5.0,{'was_impossible': False},0.0
17247,61719,260,5.0,5.0,{'was_impossible': False},0.0
2071,61987,185,5.0,5.0,{'was_impossible': False},0.0
229,108013,1198,5.0,5.0,{'was_impossible': False},0.0


In [31]:
# The ten predictions with the largest absolute error (ascending order, worst last).
worst_predictions = df_predictions.sort_values(by='err').tail(10)

In [32]:
# Display the ten predictions with the largest absolute error.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
23904,45889,6942,0.5,3.9969,{'was_impossible': False},3.4969
13696,141187,112852,0.5,3.998062,{'was_impossible': False},3.498062
20861,48991,36,1.0,4.605426,{'was_impossible': False},3.605426
4835,80227,527,0.5,4.134011,{'was_impossible': False},3.634011
4723,27290,2791,0.5,4.150043,{'was_impossible': False},3.650043
24303,9375,48,1.0,4.745005,{'was_impossible': False},3.745005
16608,122179,27790,0.5,4.299598,{'was_impossible': False},3.799598
4144,160705,1682,0.5,4.410522,{'was_impossible': False},3.910522
23801,148278,113252,0.5,4.625091,{'was_impossible': False},4.125091
16265,93768,150,0.5,4.667673,{'was_impossible': False},4.167673


In [33]:
# Now fitting to full data (excluding movies with < 100 ratings):
# load the filtered ratings frame and convert it to a Trainset in one step,
# then refit the same model object on it.
data = Dataset.load_from_df(df, reader).build_full_trainset()

model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23687605eb0>

In [34]:
# Shape of the fitted model's item-factor matrix `qi`; the second dimension
# matches the n_factors=200 the model was configured with.
model.qi.shape

(10326, 200)

In [35]:
# Render the item-factor matrix `qi` as a DataFrame for inspection
# (one row per item, one column per latent factor).
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.355428,0.086711,0.037307,-0.008528,0.003601,0.062866,0.309012,-0.193954,0.255879,0.075461,...,0.151596,-0.052066,0.026084,-0.180659,0.058057,-0.262858,-0.022482,-0.079360,-0.180894,-0.168850
1,0.039600,0.221500,-0.121480,-0.086343,0.092774,-0.186041,0.039444,-0.327360,-0.186458,0.122287,...,0.104986,-0.148422,-0.020508,0.061155,0.036455,-0.112470,-0.079350,0.345237,-0.280915,-0.056840
2,0.035553,0.296449,-0.052560,-0.096478,0.146144,-0.215660,0.165634,-0.238519,-0.154251,0.202097,...,0.092361,-0.263528,0.009131,0.068094,0.091665,0.028335,-0.060514,0.321258,-0.307469,0.042559
3,-0.052684,0.124052,-0.233798,-0.048548,0.283924,-0.002556,0.208307,-0.004916,-0.117665,0.352862,...,-0.021665,-0.026972,-0.211693,0.036533,0.510261,-0.042089,-0.314043,-0.297400,-0.128107,0.056825
4,-0.113388,0.112877,-0.223308,0.047166,0.287559,-0.182320,0.005369,-0.031051,-0.112730,-0.010081,...,-0.085853,-0.155829,-0.317962,-0.020747,-0.125405,0.170290,-0.004097,-0.137421,0.105730,0.033495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10321,0.064283,-0.066679,0.019670,-0.074904,-0.111898,0.032758,0.007110,-0.237712,0.003386,0.104792,...,0.075875,0.037610,0.044629,0.022485,0.072397,0.073447,-0.038311,0.105090,-0.219770,-0.064506
10322,-0.028149,-0.237368,-0.127044,0.112757,-0.053829,0.175344,0.077252,-0.021320,-0.135519,0.055556,...,0.073472,-0.068247,0.036520,0.144946,0.175616,0.174793,-0.046194,0.096464,0.095503,0.009346
10323,0.019206,0.045777,0.037523,-0.003447,0.081681,-0.028402,-0.083033,0.000999,-0.038110,0.123682,...,0.041664,0.070854,-0.183184,0.075146,-0.126006,-0.009568,0.081285,-0.017439,-0.058702,0.149930
10324,0.155823,-0.050094,-0.100647,0.069713,-0.116233,0.165296,0.029190,0.042868,-0.198549,-0.038005,...,0.076845,-0.066523,-0.074670,0.330238,0.094218,0.220379,0.030029,0.103854,-0.176127,-0.235026


In [36]:
# Dumping to file
# A context manager guarantees the file handle is flushed and closed as soon
# as the dump finishes, instead of leaving it open until garbage collection.
with open("svd_model_final.sav", "wb") as model_file:
    pickle.dump(model, model_file)