In [15]:
from surprise import accuracy, Dataset, Reader, SVD, SVDpp, NMF
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
import pandas as pd




In [17]:
# Load the built-in "ml-100k" ratings dataset shipped with Surprise.
data = Dataset.load_builtin(name="ml-100k")

# GridSearchCV SVD

In [18]:
# Candidate hyper-parameters for SVD: 2 x 2 x 2 = 8 combinations.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
}

# Exhaustive search; each combination is scored with 3-fold cross-validation
# on both RMSE and MAE.
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Report the lowest mean RMSE, then the parameter combination that achieved it.
# NOTE(review): in the recorded run the winner sits on the boundary of the grid
# for every parameter -- consider widening the ranges.
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.9634906452210815
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [21]:
# Tabulate the grid-search trials (per-split scores, ranks, timings, params).
results_df = pd.DataFrame(gs.cv_results)

# Take the SVD instance associated with the best RMSE and train it on every
# available rating.
# NOTE(review): `algo` is rebound to a default SVD() in a later cell, so this
# tuned model is not the one evaluated on the hold-out split.
algo = gs.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

results_df.head()

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.989323,1.001932,0.999756,0.997003,0.005503,7,0.800608,0.80932,0.807749,0.805892,0.003791,7,0.500965,0.017017,0.542809,0.188977,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,0.995796,1.008109,1.005614,1.003173,0.005315,8,0.809412,0.818157,0.816146,0.814572,0.00374,8,0.438419,0.035798,0.500206,0.038487,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.965481,0.978188,0.976711,0.97346,0.005674,3,0.776563,0.785055,0.783886,0.781835,0.003758,2,0.43971,0.037637,0.444526,0.040655,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.974642,0.986934,0.985094,0.982224,0.005413,5,0.787649,0.795814,0.794688,0.792717,0.003613,5,0.517233,0.162448,0.473536,0.076238,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.969639,0.982688,0.981094,0.977807,0.005812,4,0.780337,0.789504,0.788259,0.786033,0.00406,4,0.915468,0.068451,0.434872,0.058091,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4


In [5]:
# Hold out a random 25% of the ratings as the test set; the remaining 75% form
# the training set used by the single-split evaluations further down.
# A fixed random_state makes the partition identical on every run, so each
# model is always scored on the same held-out ratings after a kernel restart.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# GridSearchCV SVD++

In [22]:
# Candidate hyper-parameters for SVD++: 2 x 2 x 2 = 8 combinations.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
}

# Exhaustive search; each combination is scored with 3-fold cross-validation
# on both RMSE and MAE.
gs_pp = GridSearchCV(SVDpp, param_grid, measures=["rmse", "mae"], cv=3)
gs_pp.fit(data)

# Report the lowest mean RMSE, then the parameter combination that achieved it.
print(gs_pp.best_score["rmse"])
print(gs_pp.best_params["rmse"])

0.9636619947212749
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [23]:
# Tabulate the SVD++ grid-search trials (per-split scores, ranks, timings, params).
results_gs_pp = pd.DataFrame(gs_pp.cv_results)

# Take the SVD++ instance associated with the best RMSE and train it on every
# available rating.
algo_gs_pp = gs_pp.best_estimator["rmse"]
algo_gs_pp.fit(data.build_full_trainset())

results_gs_pp.head()

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,0.996008,0.995451,0.997613,0.996358,0.000916,7,0.804069,0.805416,0.807242,0.805576,0.0013,7,3.108607,0.30201,6.792455,0.718068,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,1.002745,1.001535,1.00413,1.002804,0.00106,8,0.813155,0.813967,0.816497,0.814539,0.001423,8,2.634049,0.068624,7.342469,0.293712,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,0.973336,0.972218,0.975243,0.973599,0.001249,2,0.780449,0.781651,0.783495,0.781865,0.001253,2,3.11947,0.445093,7.473377,0.435385,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,0.982504,0.980882,0.984161,0.982516,0.001339,5,0.791761,0.792273,0.794834,0.792956,0.001345,5,3.01582,0.178086,8.722431,1.00105,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,0.977423,0.976415,0.97905,0.97763,0.001086,4,0.784404,0.785566,0.787217,0.785729,0.001154,4,6.117775,0.291059,7.421879,0.759565,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4


# GridSearchCV NMF

In [26]:
# Sweep the number of latent factors from 1 to 10 for a biased NMF; the epoch
# count and both bias regularisation terms are held at a single value.
param_grid = {
    'n_factors': list(range(1, 11)),
    'n_epochs': [100],
    'biased': [True],
    'reg_bu': [0.1],
    'reg_bi': [0.1],
}

# Each of the 10 candidates is scored with 3-fold cross-validation.
gs_nmf = GridSearchCV(NMF, param_grid, measures=["rmse", "mae"], cv=3)
gs_nmf.fit(data)

# Report the lowest mean RMSE, then the parameter combination that achieved it.
print(gs_nmf.best_score["rmse"])
print(gs_nmf.best_params["rmse"])

0.9379415639710048
{'n_factors': 2, 'n_epochs': 100, 'biased': True, 'reg_bu': 0.1, 'reg_bi': 0.1}


In [27]:
# Tabulate the NMF grid-search trials (per-split scores, ranks, timings, params).
results_nmf = pd.DataFrame(gs_nmf.cv_results)

# Take the NMF instance associated with the best RMSE and train it on every
# available rating.
# NOTE(review): `algo_nmf` is rebound to a default NMF() in a later cell, so
# this tuned model is not the one evaluated on the hold-out split.
algo_nmf = gs_nmf.best_estimator["rmse"]
algo_nmf.fit(data.build_full_trainset())

results_nmf.head()

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_biased,param_reg_bu,param_reg_bi
0,0.935725,0.940408,0.941088,0.939074,0.002384,3,0.740412,0.743073,0.744422,0.742636,...,6.206709,0.420034,0.721528,0.254195,"{'n_factors': 1, 'n_epochs': 100, 'biased': Tr...",1,100,True,0.1,0.1
1,0.935038,0.940056,0.93873,0.937942,0.002123,1,0.739419,0.742671,0.742474,0.741521,...,6.207668,0.121773,0.527097,0.147573,"{'n_factors': 2, 'n_epochs': 100, 'biased': Tr...",2,100,True,0.1,0.1
2,0.93504,0.944455,0.937812,0.939102,0.00395,4,0.737685,0.745946,0.74089,0.741507,...,6.035842,0.059377,0.541258,0.121989,"{'n_factors': 3, 'n_epochs': 100, 'biased': Tr...",3,100,True,0.1,0.1
3,0.938009,0.941268,0.941592,0.940289,0.001618,9,0.740117,0.742337,0.744592,0.742349,...,6.210532,0.113246,0.562703,0.17328,"{'n_factors': 4, 'n_epochs': 100, 'biased': Tr...",4,100,True,0.1,0.1
4,0.935698,0.944406,0.938449,0.939517,0.003634,6,0.737622,0.745245,0.741839,0.741568,...,6.420352,0.318132,0.629465,0.154848,"{'n_factors': 5, 'n_epochs': 100, 'biased': Tr...",5,100,True,0.1,0.1


# SVD algorithm

In [6]:
# Baseline: SVD constructed with no arguments (library defaults).
algo = SVD()

# 5-fold cross-validation (verbose=True prints the per-fold table), then
# average every reported metric and timing across the folds.
SVD_result = pd.DataFrame(
    cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
).mean(axis=0)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9342  0.9407  0.9331  0.9373  0.9395  0.9370  0.0029  
MAE (testset)     0.7374  0.7405  0.7362  0.7370  0.7421  0.7386  0.0023  
Fit time          1.57    1.69    1.54    1.55    1.55    1.58    0.06    
Test time         0.18    0.18    0.30    0.20    0.23    0.22    0.05    


In [7]:
# Fit SVD on the training split, then predict every rating in the test split.
predictions = algo.fit(trainset).test(testset)

# RMSE of those predictions; accuracy.rmse also prints the value.
SVD_rmse = accuracy.rmse(predictions, verbose=True)

RMSE: 0.9436


# SVD++ algorithm

In [8]:
# Baseline: SVD++ constructed with no arguments (library defaults).
algo_pp = SVDpp()

# 5-fold cross-validation (verbose=True prints the per-fold table), then
# average every reported metric and timing across the folds.
SVDpp_result = pd.DataFrame(
    cross_validate(algo_pp, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
).mean(axis=0)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9229  0.9244  0.9218  0.9133  0.9185  0.9202  0.0040  
MAE (testset)     0.7224  0.7251  0.7228  0.7185  0.7214  0.7221  0.0021  
Fit time          10.26   9.78    9.17    9.18    9.08    9.49    0.46    
Test time         3.56    3.22    3.17    3.38    3.20    3.31    0.15    


In [9]:
# Fit SVD++ on the training split, then predict every rating in the test split.
predictions_pp = algo_pp.fit(trainset).test(testset)

# RMSE of those predictions; accuracy.rmse also prints the value.
SVD_pp_rmse = accuracy.rmse(predictions_pp, verbose=True)

RMSE: 0.9274


# NMF algorithm

In [10]:
# Baseline: NMF constructed with no arguments (library defaults).
algo_nmf = NMF()

# 5-fold cross-validation (verbose=True prints the per-fold table), then
# average every reported metric and timing across the folds.
NMF_result = pd.DataFrame(
    cross_validate(algo_nmf, data, measures=["RMSE", "MAE"], cv=5, verbose=True)
).mean(axis=0)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9629  0.9607  0.9698  0.9617  0.9571  0.9624  0.0042  
MAE (testset)     0.7569  0.7546  0.7603  0.7552  0.7533  0.7561  0.0024  
Fit time          3.74    3.30    3.10    3.24    4.03    3.48    0.35    
Test time         0.35    0.17    0.32    0.16    0.17    0.23    0.08    


In [11]:
# Fit NMF on the training split, then predict every rating in the test split.
predictions_nmf = algo_nmf.fit(trainset).test(testset)

# RMSE of those predictions; accuracy.rmse also prints the value.
NMF_rmse = accuracy.rmse(predictions_nmf, verbose=True)

RMSE: 0.9738


# Surprise results

In [12]:
# Side-by-side summary of the 5-fold cross-validation averages: one column per
# algorithm, one row per metric/timing. The frame is built in a single
# constructor call instead of filling an empty frame column by column.
surprise_results = pd.DataFrame(
    {'SVD': SVD_result, 'SVDpp': SVDpp_result, 'NMF': NMF_result}
)
surprise_results

Unnamed: 0,SVD,SVDpp,NMF
test_rmse,0.936965,0.920184,0.962417
test_mae,0.738632,0.722052,0.756068
fit_time,1.577313,9.494595,3.48276
test_time,0.218775,3.30589,0.232422


In [29]:
# Hold-out RMSE of each default-parameter model, one line per algorithm.
print('RMSE:')
for label, score in (('SVD:', SVD_rmse), ('SVDpp:', SVD_pp_rmse), ('NMF:', NMF_rmse)):
    print(label, score)


RMSE:
SVD: 0.9436303697877875
SVDpp: 0.9274014325427109
NMF: 0.9738416935330341


In [30]:
# Best mean cross-validated RMSE found by each grid search, one line per algorithm.
print('Best GridSearchCV RMSE score:')
for label, search in (('SVD:', gs), ('SVDpp:', gs_pp), ('NMF:', gs_nmf)):
    print(label, search.best_score["rmse"])

Best GridSearchCV RMSE score:
SVD: 0.9634906452210815
SVDpp: 0.9636619947212749
NMF: 0.9379415639710048
