In [None]:
#!pip install scikit-surprise
import pandas as pd
from surprise import accuracy, Dataset, SVD, SVDpp, NMF, KNNBasic
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [4]:
data = Dataset.load_builtin("ml-100k")

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [5]:
# SVD -------------------------------------------------------------------------------
algo = SVD()

cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9342  0.9355  0.9275  0.9428  0.9436  0.9367  0.0060  
MAE (testset)     0.7354  0.7390  0.7311  0.7424  0.7441  0.7384  0.0047  
Fit time          1.34    1.55    1.44    1.37    1.39    1.42    0.07    
Test time         0.25    0.12    0.12    0.20    0.13    0.16    0.05    


{'test_rmse': array([0.93415339, 0.93554388, 0.9274801 , 0.94284361, 0.94355131]),
 'test_mae': array([0.73540192, 0.7389933 , 0.73109374, 0.74237035, 0.74411533]),
 'fit_time': (1.3441052436828613,
  1.5471611022949219,
  1.4436888694763184,
  1.369858980178833,
  1.388780117034912),
 'test_time': (0.24746155738830566,
  0.12340593338012695,
  0.11972427368164062,
  0.19513726234436035,
  0.13400745391845703)}

In [6]:
trainset, testset = train_test_split(data, test_size=0.25)
algo.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 0.9361


0.936109944732446

In [14]:
param_grid = {'n_epochs': [5, 50], 'lr_all': [0.002, 0.005],'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(f"best RMSE score - {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"best RMSE param - {gs.best_params['rmse']}")

best RMSE score - 0.9559716211555815
best RMSE param - {'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.4}


In [8]:
# SVDpp -----------------------------------------------------------------------------
svdpp = SVDpp()
cross_validate(svdpp, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9226  0.9253  0.9219  0.9194  0.9070  0.9192  0.0064  
MAE (testset)     0.7223  0.7258  0.7213  0.7220  0.7144  0.7212  0.0037  
Fit time          26.89   26.85   26.99   26.91   29.13   27.36   0.89    
Test time         4.22    5.37    4.37    4.84    4.10    4.58    0.47    


{'test_rmse': array([0.9226028 , 0.9253447 , 0.92186819, 0.91942397, 0.90696834]),
 'test_mae': array([0.72225582, 0.72578819, 0.72134955, 0.72204789, 0.7144013 ]),
 'fit_time': (26.89334750175476,
  26.852745294570923,
  26.99207043647766,
  26.914382934570312,
  29.128947973251343),
 'test_time': (4.22034478187561,
  5.365454196929932,
  4.372200965881348,
  4.836227893829346,
  4.097743511199951)}

In [9]:
trainset, testset = train_test_split(data, test_size=0.25)
svdpp.fit(trainset)
predictions = algo.test(testset)

accuracy.rmse(predictions)

RMSE: 0.7534


0.7533676670280625

In [15]:
param_grid = {'n_epochs': [5, 50], 'lr_all': [0.002, 0.005],'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(f"best RMSE score - {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"best RMSE param - {gs.best_params['rmse']}")

best RMSE score - 0.9560714790576093
best RMSE param - {'n_epochs': 50, 'lr_all': 0.005, 'reg_all': 0.4}


In [11]:
# NMF -------------------------------------------------------------------------------
nmf = NMF()
cross_validate(nmf, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9539  0.9593  0.9701  0.9766  0.9619  0.9643  0.0080  
MAE (testset)     0.7509  0.7552  0.7614  0.7681  0.7560  0.7583  0.0059  
Fit time          2.07    1.98    1.98    1.99    2.53    2.11    0.21    
Test time         0.22    0.10    0.10    0.21    0.17    0.16    0.05    


{'test_rmse': array([0.95387291, 0.95929892, 0.9700668 , 0.97655325, 0.96186338]),
 'test_mae': array([0.75094174, 0.75518315, 0.76144103, 0.76806621, 0.75599673]),
 'fit_time': (2.065770387649536,
  1.9828710556030273,
  1.976987600326538,
  1.993039846420288,
  2.529616117477417),
 'test_time': (0.21992921829223633,
  0.09945225715637207,
  0.09922266006469727,
  0.2114109992980957,
  0.17011785507202148)}

In [12]:
trainset, testset = train_test_split(data, test_size=0.25)
svdpp.fit(trainset)
predictions = nmf.test(testset)

accuracy.rmse(predictions)

RMSE: 0.8364


0.8363984049652594

In [18]:
param_grid = {'n_factors': [5, 10], 'n_epochs': [5, 50], 'reg_pu': [0.002, 0.005], 'reg_qi': [0.06, 0.1]}
gs = GridSearchCV(NMF, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(f"best RMSE score - {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"best RMSE param - {gs.best_params['rmse']}")

best RMSE score - 1.0310833772951211
best RMSE param - {'n_factors': 10, 'n_epochs': 50, 'reg_pu': 0.005, 'reg_qi': 0.1}


In [None]:
# KNNBasic --------------------------------------------------------------------
knn = KNNBasic()
cross_validate(knn, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9793  0.9747  0.9769  0.9799  0.9791  0.9780  0.0019  
MAE (testset)     0.7719  0.7712  0.7723  0.7751  0.7732  0.7728  0.0014  
Fit time          0.37    0.36    0.39    0.32    0.34    0.36    0.02    
Test time         3.05    3.59    4.39    3.07    3.09    3.44    0.52    


{'test_rmse': array([0.97930196, 0.9746804 , 0.97690548, 0.97993145, 0.97908271]),
 'test_mae': array([0.7718809 , 0.77123995, 0.77233602, 0.77513338, 0.77323264]),
 'fit_time': (0.37160801887512207,
  0.35668396949768066,
  0.39034438133239746,
  0.3226499557495117,
  0.3432793617248535),
 'test_time': (3.0484306812286377,
  3.5888543128967285,
  4.389488697052002,
  3.070319890975952,
  3.093123435974121)}

In [None]:
trainset, testset = train_test_split(data, test_size=0.25)
svdpp.fit(trainset)
predictions = knn.test(testset)

accuracy.rmse(predictions)

RMSE: 0.8095


0.8094532587777185

In [19]:
param_grid = {'n_epochs': [5, 50], 'lr_all': [0.002, 0.005],'reg_all': [0.4, 0.6]}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(f"best RMSE score - {gs.best_score['rmse']}")

# combination of parameters that gave the best RMSE score
print(f"best RMSE param - {gs.best_params['rmse']}")

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi