In [1]:
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.co_clustering import CoClustering
from surprise.prediction_algorithms.matrix_factorization import SVDpp, NMF
from surprise.prediction_algorithms.slope_one import SlopeOne
%matplotlib inline
import pandas as pd
from surprise import Reader, Dataset, SVD, BaselineOnly, KNNWithZScore, KNNWithMeans, KNNBasic, KNNBaseline, \
    NormalPredictor
from surprise.model_selection import GridSearchCV

import warnings; warnings.simplefilter('ignore')

In [2]:
# Read the ratings table from disk and preview its first five rows.
ratings = pd.read_csv('data/ratings_small.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
# Reader() assumes a 1-5 rating scale by default, but this dataset contains
# half-star ratings (e.g. 2.5, 3.5), so take the scale from the observed
# ratings. Predictions are then clipped to the range the data actually uses.
rating_scale = (ratings['rating'].min(), ratings['rating'].max())
reader = Reader(rating_scale=rating_scale)

# Surprise expects the columns in (user, item, rating) order.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


In [4]:
%%time
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(),
    KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8979  0.9038  0.9011  0.8898  0.8970  0.8979  0.0047  
MAE (testset)     0.6919  0.6974  0.6934  0.6844  0.6916  0.6918  0.0042  
Fit time          4.88    6.22    5.43    5.61    4.80    5.39    0.52    
Test time         0.12    0.13    0.18    0.20    0.14    0.15    0.03    
Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8916  0.8853  0.8789  0.8957  0.8882  0.8879  0.0057  
MAE (testset)     0.6830  0.6798  0.6749  0.6896  0.6826  0.6820  0.0048  
Fit time          481.07  472.32  461.86  459.64  488.04  472.58  10.90   
Test time         8.58    7.48    7.20    7.86    7.80    7.78    0.47    
Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (

In [5]:
# Show the benchmark results indexed by algorithm, lowest test RMSE first.
(
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values(by='test_rmse')
)

Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVDpp,0.887918,0.68199,472.583791,7.783179
BaselineOnly,0.892657,0.68965,0.259078,0.144393
KNNBaseline,0.897219,0.687397,0.504314,1.865113
SVD,0.897924,0.691752,5.389318,0.15471
KNNWithZScore,0.917748,0.698225,0.362909,1.793558
KNNWithMeans,0.918104,0.703168,0.286784,1.63407
SlopeOne,0.927468,0.709958,4.14983,5.341648
NMF,0.947619,0.728915,6.643606,0.171304
CoClustering,0.964438,0.747635,3.204766,0.195114
KNNBasic,0.968976,0.744604,0.267285,1.529478


In [6]:
%%time
bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=10, verbose=False)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Wall time: 4.95 s


{'test_rmse': array([0.88865909, 0.89699849, 0.88852134, 0.87520392, 0.89113395,
        0.87528139, 0.87692002, 0.88275517, 0.88231951, 0.89299806]),
 'fit_time': (0.26410436630249023,
  0.2948901653289795,
  0.2942817211151123,
  0.28585362434387207,
  0.2858448028564453,
  0.29285454750061035,
  0.2968323230743408,
  0.29787278175354004,
  0.28786230087280273,
  0.2877511978149414),
 'test_time': (0.040979623794555664,
  0.03897237777709961,
  0.11295437812805176,
  0.042975664138793945,
  0.040976524353027344,
  0.04499173164367676,
  0.03997921943664551,
  0.03697967529296875,
  0.038980722427368164,
  0.03950858116149902)}

In [7]:
%%time
param_grid = {
    "n_epochs": [5,10, 20, 30, 50, 80, 100],
    "lr_all": [0.001,0.002, 0.005, 0.009],
    "reg_all": [0.001, 0.02, 0.09, 0.5]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], refit=True,n_jobs=-1, cv=10,joblib_verbose=True)

gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 21.7min
[Parallel(n_jobs=-1)]: Done 1120 out of 1120 | elapsed: 56.7min finished


Wall time: 57min 15s


In [8]:
# Tabulate the grid-search cross-validation results: one row per parameter combination.
results_df = pd.DataFrame(gs.cv_results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,split5_test_rmse,split6_test_rmse,split7_test_rmse,split8_test_rmse,split9_test_rmse,...,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.963482,0.942229,0.951265,0.964312,0.945221,0.958385,0.956445,0.949648,0.956811,0.953483,...,0.004801,106,3.050577,0.119198,0.158708,0.004810,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0....",5,0.001,0.001
1,0.960335,0.941914,0.953888,0.964267,0.943684,0.955906,0.956693,0.947078,0.955582,0.952544,...,0.004814,104,3.024528,0.149129,0.156874,0.025786,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.02}",5,0.001,0.020
2,0.962476,0.940424,0.953007,0.964841,0.944222,0.957116,0.955323,0.948746,0.955813,0.953619,...,0.004790,105,3.023793,0.135014,0.156278,0.025007,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.09}",5,0.001,0.090
3,0.969583,0.949284,0.959032,0.971650,0.952350,0.965652,0.961179,0.956273,0.962091,0.961362,...,0.004353,107,2.877192,0.192811,0.131497,0.010385,"{'n_epochs': 5, 'lr_all': 0.001, 'reg_all': 0.5}",5,0.001,0.500
4,0.942629,0.920296,0.930950,0.944367,0.923618,0.936332,0.933964,0.929631,0.935851,0.933943,...,0.005210,96,2.746465,0.139106,0.139002,0.020783,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0....",5,0.002,0.001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,0.908757,0.889215,0.900027,0.912107,0.890983,0.905082,0.903491,0.895869,0.905099,0.901598,...,0.005035,49,55.195721,0.780145,0.140088,0.009166,"{'n_epochs': 100, 'lr_all': 0.005, 'reg_all': ...",100,0.005,0.500
108,0.996109,0.983711,0.988611,1.000708,0.986698,0.996554,0.988462,0.980532,0.987293,0.989977,...,0.004149,111,54.916156,0.326055,0.139320,0.013979,"{'n_epochs': 100, 'lr_all': 0.009, 'reg_all': ...",100,0.009,0.001
109,0.903148,0.893624,0.903736,0.901988,0.886325,0.898372,0.901547,0.890416,0.902328,0.903099,...,0.004706,28,55.434909,0.360161,0.138720,0.009972,"{'n_epochs': 100, 'lr_all': 0.009, 'reg_all': ...",100,0.009,0.020
110,0.877314,0.861069,0.869496,0.876335,0.858480,0.870616,0.873667,0.861636,0.870500,0.864669,...,0.004597,3,55.824836,0.522340,0.143617,0.011741,"{'n_epochs': 100, 'lr_all': 0.009, 'reg_all': ...",100,0.009,0.090


In [9]:
# Keep the parameter set that achieved the best RMSE for the final model.
training_parameters = gs.best_params["rmse"]

# Report the best score for each measure alongside the chosen parameters.
best_score = gs.best_score
print("BEST RMSE: \t", best_score["rmse"])
print("BEST MAE: \t", best_score["mae"])
print("BEST params: \t", training_parameters)

BEST RMSE: 	 0.8677772476305222
BEST MAE: 	 0.6656959141592061
BEST params: 	 {'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.09}


In [10]:
# Display the parameter set selected from the grid search (best by RMSE).
training_parameters

{'n_epochs': 100, 'lr_all': 0.005, 'reg_all': 0.09}

In [11]:
%%time
trainset = data.build_full_trainset()
msvd = SVD(n_epochs = training_parameters['n_epochs'], lr_all = training_parameters['lr_all'], reg_all = training_parameters['reg_all'], verbose=True)
msvd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1f292088550>

Check a sample prediction

In [12]:
# List every rating recorded for user 1.
ratings.loc[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [13]:
# Predict user 1's rating for movie 3671; r_ui is the reference rating to compare against.
msvd.predict(uid=1, iid=3671, r_ui=3)


Prediction(uid=1, iid=3671, r_ui=3, est=3.1523608242169554, details={'was_impossible': False})