In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD
import pickle, gzip, pickletools


In [2]:
# Load the ratings table and discard the timestamp column, which no later cell reads.
df = pd.read_csv("ratings.csv").drop(columns=["timestamp"])
df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [3]:
# Work on a copy without the rating value; the remaining columns are enough for counting.
df1 = df.drop(columns=["rating"])


In [4]:
# Count rows per movieId; the surviving "userId" column then holds the number of ratings per movie.
df1 = df1.groupby("movieId").count()

In [5]:
# Display the per-movie counts (index: movieId, column: userId).
df1

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
1,57309
2,24228
3,11804
4,2523
5,11714
...,...
209157,1
209159,1
209163,1
209169,1


In [6]:
# Order the movies from the highest count to the lowest.
df1 = df1.sort_values(by="userId", ascending=False)

In [7]:
# Display the counts after sorting in descending order.
df1

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
356,81491
318,81482
296,79672
593,74127
2571,72674
...,...
167308,1
167326,1
167336,1
167338,1


In [8]:
# Keep only the movies whose count is strictly below 100.
df2 = df1[df1["userId"].lt(100)]

In [9]:
# Display the movies that fall under the 100-count threshold.
df2

Unnamed: 0_level_0,userId
movieId,Unnamed: 1_level_1
51044,99
2487,99
58783,99
32383,99
197,99
...,...
167308,1
167326,1
167336,1
167338,1


In [10]:
# The index of df2 is movieId, so this is the list of low-count movie ids.
obscure = df2.index.tolist()

In [11]:
# obscure

In [12]:
# Remove every rating that belongs to a movie listed in `obscure`.
# One boolean mask is computed and reused: it selects the labels of the rows to
# remove and, negated, the rows to keep. This avoids building a filtered copy of
# the frame just to read its index and avoids a label-based drop afterwards.
obscure_mask = df["movieId"].isin(obscure)
index_names = df.index[obscure_mask]  # labels of the removed rows (same value as before)
df = df.loc[~obscure_mask]

In [13]:
# Display the ratings that remain after removing the low-count movies.
df

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [14]:
# Hold out 0.1% of the ratings as the test set and train on the remaining 99.9%.
# NOTE: this is a two-way train/test split; no separate dev set is created here.
# A fixed random_state makes the row assignment repeatable across kernel restarts.
trainset, testset = train_test_split(df, train_size=0.999, random_state=42)

print("Training set size: ", trainset.shape)


print("Test set size: ", testset.shape)

Training set size:  (24418936, 3)
Test set size:  (24444, 3)


In [15]:
# The Reader carries the rating scale (0.5 to 5.0) used when loading the frames.
reader = Reader(rating_scale=(0.5, 5.0))

# Wrap each split in a surprise Dataset using the shared reader.
test_data = Dataset.load_from_df(testset, reader)
train_data = Dataset.load_from_df(trainset, reader)

In [16]:
# Rebind test_data to the object returned by build_full_trainset().
# NOTE(review): the name now refers to a different object than the one loaded
# above, so this cell is presumably not safe to re-run on its own — confirm.
test_data = test_data.build_full_trainset()

In [17]:
# Rebind test_data once more, to the result of build_testset(); this is the
# value later passed to model.test().
test_data = test_data.build_testset()

In [18]:
# Configure an SVD model with 200 latent factors, trained for 30 epochs.
# NOTE(review): no random_state is passed, so fitted values presumably vary between runs — confirm.
model = SVD(n_factors=200, n_epochs=30)

In [19]:
# param_grid = {"n_epochs": [5, 10, 20], "lr_all": [0.002, 0.005], "reg_all": [0.002, 0.005], "n_factors":[50, 100, 200]}

# gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=10, n_jobs=5, joblib_verbose=5)
# gs.fit(train_data)

In [20]:
# print(gs.best_score["rmse"])

In [21]:
# print(gs.best_params["rmse"])

In [22]:
# Disabled: would replace `model` with the grid search's best estimator (the search itself is commented out above).
# model = gs.best_estimator["rmse"]

# Rebind train_data to the result of build_full_trainset() and fit the model on it.
# NOTE(review): because train_data is rebound, re-running this cell alone presumably fails — confirm.
train_data = train_data.build_full_trainset()
# Last expression of the cell: the value returned by fit() is displayed as the cell output.
model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x205b99579a0>

In [23]:
# Predict the held-out ratings with the fitted model, then print and return the RMSE.
pred_test = model.test(test_data)
accuracy.rmse(pred_test, verbose=True)

RMSE: 0.7577


0.7577109171647574

In [24]:
# Tabulate the test predictions and append the absolute error of each estimate.
prediction_columns = ['uid', 'iid', 'rui', 'est', 'details']
df_predictions = pd.DataFrame(pred_test, columns=prediction_columns).assign(
    err=lambda frame: (frame['est'] - frame['rui']).abs()
)

df_predictions.head()

Unnamed: 0,uid,iid,rui,est,details,err
0,6130,2803,3.0,3.088564,{'was_impossible': False},0.088564
1,6130,487,3.0,2.183106,{'was_impossible': False},0.816894
2,6130,1244,4.0,4.198336,{'was_impossible': False},0.198336
3,130011,8799,0.5,2.423675,{'was_impossible': False},1.923675
4,81473,266,3.0,2.888985,{'was_impossible': False},0.111015


In [25]:
# The ten predictions with the smallest absolute error.
best_predictions = df_predictions.sort_values(by='err').head(10)

In [26]:
# Display the ten lowest-error predictions.
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
21207,128762,92259,5.0,5.0,{'was_impossible': False},0.0
1794,159610,1704,5.0,5.0,{'was_impossible': False},0.0
1796,27525,5610,0.5,0.5,{'was_impossible': False},0.0
20504,159865,1148,5.0,5.0,{'was_impossible': False},0.0
7681,54670,68157,5.0,5.0,{'was_impossible': False},0.0
17304,70956,1197,5.0,5.0,{'was_impossible': False},0.0
14858,57058,1196,5.0,5.0,{'was_impossible': False},0.0
17187,103814,5952,5.0,5.0,{'was_impossible': False},0.0
2850,139999,1230,5.0,5.0,{'was_impossible': False},0.0
17353,151115,7153,5.0,5.0,{'was_impossible': False},0.0


In [27]:
# The ten predictions with the largest absolute error (ascending sort, so they come last).
worst_predictions = df_predictions.sort_values(by='err').tail(10)

In [28]:
# Display the ten highest-error predictions.
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
2635,24152,329,1.0,4.526567,{'was_impossible': False},3.526567
11551,9428,5617,1.0,4.55811,{'was_impossible': False},3.55811
22742,29703,66371,1.0,4.572905,{'was_impossible': False},3.572905
13292,6005,162606,0.5,4.107195,{'was_impossible': False},3.607195
23199,142852,2599,1.0,4.633018,{'was_impossible': False},3.633018
19775,54098,168252,0.5,4.154958,{'was_impossible': False},3.654958
23843,100457,142509,0.5,4.214077,{'was_impossible': False},3.714077
20175,113753,1285,0.5,4.321723,{'was_impossible': False},3.821723
3144,92865,3347,0.5,4.475555,{'was_impossible': False},3.975555
11585,113989,6539,0.5,4.611345,{'was_impossible': False},4.111345


In [29]:
# Refit the same model object on the whole filtered frame
# (movies with fewer than 100 ratings were removed earlier).
data = Dataset.load_from_df(df, reader).build_full_trainset()

model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x205b99579a0>

In [30]:
# Shape of the fitted model's `qi` array; the second dimension matches n_factors=200.
# NOTE(review): presumably one row per item seen during fitting — confirm against the surprise docs.
model.qi.shape

(10326, 200)

In [31]:
# Render the `qi` array as a DataFrame for a quick visual check of the factor values.
pd.DataFrame(model.qi)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.092911,-0.064177,-0.191643,-0.143336,0.027168,0.227176,-0.156938,-0.126235,0.079970,0.158107,...,0.239507,-0.144732,0.182517,-0.250486,-0.080231,0.295197,-0.178140,-0.043730,0.141406,0.228242
1,0.155719,0.006572,-0.173806,-0.467705,-0.133925,0.009027,-0.237722,-0.072540,0.173588,0.418212,...,-0.003281,-0.072620,-0.095393,0.005335,0.128289,-0.211230,-0.029933,-0.110155,0.161082,-0.165910
2,0.067867,0.131585,-0.181855,-0.392417,0.012519,-0.008601,-0.323412,-0.029370,0.169489,0.536339,...,0.056961,-0.070830,-0.079441,0.031269,0.104580,-0.201259,-0.080997,-0.068325,0.208108,-0.214221
3,0.086705,-0.200623,0.127798,0.201543,-0.274319,0.247829,0.321979,0.157721,0.093206,0.170906,...,0.257186,-0.097709,-0.034014,-0.176692,-0.267322,0.130085,0.046935,0.098134,0.227614,-0.113436
4,0.027162,-0.053153,0.248671,0.014023,-0.156786,0.019172,0.106355,-0.079683,-0.054304,-0.078932,...,0.089938,0.025405,0.116515,0.179420,-0.136900,-0.059179,-0.133654,0.209362,-0.167199,-0.310286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10321,-0.133377,0.123795,-0.094611,-0.209561,0.238845,0.047246,-0.049117,0.008699,-0.048034,-0.014302,...,-0.053197,-0.043657,0.157117,-0.021411,0.121109,-0.043007,0.111063,-0.035564,-0.068555,0.128844
10322,0.033434,-0.034879,-0.033346,0.104356,0.040119,-0.019659,-0.191097,-0.002892,0.068158,-0.041645,...,-0.139047,0.035186,-0.029097,0.023629,0.058487,0.021235,0.069953,-0.182462,0.000644,-0.037520
10323,-0.153988,-0.071133,0.038423,-0.011384,0.052243,-0.013778,-0.069157,-0.108385,-0.064169,0.221172,...,-0.202419,-0.087412,0.105667,0.204611,0.071265,-0.029253,0.129750,0.099976,0.043406,0.036265
10324,-0.020823,-0.291100,0.133457,-0.095066,0.036794,-0.247820,0.030986,0.069741,-0.037520,-0.258988,...,-0.065199,-0.067630,0.085485,0.049197,0.316060,-0.151204,0.154312,-0.139390,0.041974,0.331698


In [32]:
# Persist the fitted model: pickle it, strip redundant opcodes, and gzip the bytes.
# NOTE(review): despite the ".h5" suffix the file is a gzip-compressed pickle, not HDF5;
# it must be read back with gzip + pickle, and only from a trusted source.
with gzip.open("svd_model_200.h5", "wb") as f:
    optimized_pickle = pickletools.optimize(pickle.dumps(model))
    f.write(optimized_pickle)