In [1]:
from surprise import Dataset,Reader 
from surprise import SVD,KNNBaseline
from surprise.model_selection import KFold, cross_validate
from surprise.model_selection import GridSearchCV,RandomizedSearchCV
import numpy as np
import pandas as pd
import time
import os

In [2]:
#load dataset1
file_path = os.path.expanduser('./ratings_1.csv')
print("Loading Dataset1...")
reader = Reader(line_format='user item rating', sep=',', rating_scale=[1, 5], skip_lines=1)
data1 = Dataset.load_from_file(file_path, reader=reader)
print("Done.")

Loading Dataset1...
Done.


In [3]:
#load dataset2
file_path = os.path.expanduser('./ratings_2.csv')
print("Loading Dataset2...")
reader2 = Reader(line_format='user item rating', sep=',', rating_scale=[1, 10], skip_lines=1)
data2 = Dataset.load_from_file(file_path, reader=reader2)
print("Done.")

Loading Dataset2...
Done.


## Performing a Random-Search-Cross-Validation process for tuning the hyper-parameter of the KNNBaseline algorithm

In [79]:
def tuning_KNNBaseline(data): 
    param_grid = { 
                "k":[5,10,20,25], #[20,40,50,60], 
                "min_k":[1,5,7,9,13],#[1,3,5,8,11,13,15,18,21],
                "sim_options":{
                               'name': ["cosine","pearson_baseline"],  
                               'user_based': [True, False],  
                               'min_support': [1,5,10,12]#[3,5,8,11,13]
                                },
                "bsl_options":{
                            'method': ['sgd'],
                            'learning_rate':[0.001,0.007,0.1],#[0.002,0.005,0.01],
                            'n_epochs':[20,30,40],#[50,100,150],
                            'reg': [0.01,0.03,0.06]#[0.01,0.02,0.05]
                            }
                }
    start=time.time()
    rcv = RandomizedSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=5,n_jobs=4) 
    rcv.fit(data)
    ex_time= round(time.time()-start,2)
    best_score = rcv.best_score['rmse']
    best_param = rcv.best_params['rmse']
    return (ex_time,best_score,best_param)

#### Tuning dataset1

In [278]:
randcv_dt1 = tuning_KNNBaseline(data1)
time1 = randcv_dt1[0]
print("Execution time for dataset1: ", round(time1,2),'s')
print()
best_score1 = randcv_dt1[1]
print("Best score for dataset1: ",best_score1)
print()
best_param1 = randcv_dt1[2]
print(best_param1)

Execution time for dataset1:  234.04 s

Best score for dataset1:  0.8950040775399983

{'k': 25, 'min_k': 13, 'sim_options': {'name': 'pearson_baseline', 'user_based': True, 'min_support': 10}, 'bsl_options': {'method': 'sgd', 'learning_rate': 0.007, 'n_epochs': 30, 'reg': 0.01}}


### Average-RMSE associated to the two best estimators you tuned for dataset1

In [23]:
#DATASET1 - KNNBASELINE
kf = KFold(n_splits=5, random_state=0)
current_algo=KNNBaseline(k=50, min_k=11, sim_options={'name': 'pearson_baseline', 'user_based':False,'min_support': 13}, bsl_options= {'method': 'sgd','learning_rate':0.005,'n_epochs': 50,'reg':0.05},verbose=True)
cross_validate(current_algo, data1, measures=['RMSE'], cv=kf,n_jobs=4, verbose=True)

Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8864  0.8900  0.8847  0.8913  0.8921  0.8889  0.0029  
Fit time          12.02   12.01   11.92   12.13   10.54   11.72   0.60    
Test time         21.89   21.96   22.32   22.16   15.07   20.68   2.81    


{'test_rmse': array([0.88635856, 0.8899913 , 0.88470392, 0.8913291 , 0.89212135]),
 'fit_time': (12.015504360198975,
  12.005348682403564,
  11.920779466629028,
  12.129212141036987,
  10.53655743598938),
 'test_time': (21.89350152015686,
  21.958795070648193,
  22.317030906677246,
  22.157458543777466,
  15.07177209854126)}

#### Tuning dataset2

In [80]:
randcv_dt2 = tuning_KNNBaseline(data2)
time2 = randcv_dt2[0]
print("Execution time for dataset2: ", round(time2,2), 's')
print()
best_score2 = randcv_dt2[1]
print("Best score for dataset2: ",best_score2)
print()
best_param2 = randcv_dt2[2]
print(best_param2)

Execution time for dataset2:  10.08 s

Best score for dataset2:  1.8623538518570164

{'k': 25, 'min_k': 9, 'sim_options': {'name': 'pearson_baseline', 'user_based': True, 'min_support': 10}, 'bsl_options': {'method': 'sgd', 'learning_rate': 0.001, 'n_epochs': 40, 'reg': 0.01}}


### Average-RMSE associated to the two best estimators you tuned for dataset2

In [82]:
#DATASET 2 - KNNBASELINE
kf = KFold(n_splits=5, random_state=0)
current_algo=KNNBaseline(k=25, min_k=9, sim_options={'name': 'pearson_baseline', 'user_based':True,'min_support':5}, bsl_options= {'method': 'sgd','learning_rate':0.001,'n_epochs': 30,'reg':0.01},verbose=True)
cross_validate(current_algo, data2, measures=['RMSE'], cv=kf,n_jobs=4, verbose=True)

Evaluating RMSE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8191  1.8738  1.8427  1.8429  1.8457  1.8448  0.0174  
Fit time          0.51    0.47    0.56    0.52    0.33    0.48    0.08    
Test time         0.94    0.92    0.86    0.93    0.47    0.82    0.18    


{'test_rmse': array([1.81913605, 1.87377336, 1.84269099, 1.84289467, 1.84572359]),
 'fit_time': (0.5133578777313232,
  0.4686112403869629,
  0.5618352890014648,
  0.5236213207244873,
  0.32998061180114746),
 'test_time': (0.9425251483917236,
  0.9194095134735107,
  0.8574347496032715,
  0.9269015789031982,
  0.4675314426422119)}

## Tuning the hyper parameter of the SVD algorithm using a Grid-Search-Cross-Validation approach

In [275]:
def tuning_SVD(data):    
    param_grid = {"n_factors":[80,100,120,150],#[25,50,100,150],
                  "lr_all": [0.005,0.008,0.1,0.5],#[0.005,0.01,0.5,1],
                  "init_mean":[0.01,0.02,0.06,0.08],#[0.10,0.30,0.50,0.70],
                  "reg_all":[0.06,0.1,0.4,0.5]#[0.01,0.05,0.07,0.1]
                 }

    start=time.time()
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5,n_jobs=4) 
    gs.fit(data)
    ex_time= round(time.time()-start,2)
    best_score = gs.best_score['rmse']
    best_param = gs.best_params['rmse']
    return (ex_time,best_score,best_param)

### DATASET1

In [279]:
grid_cv1 = tuning_SVD(data1)
time1 = grid_cv1[0]
print("Execution time for dataset1: ", round(time1,2),'s')
print()
best_score1 = grid_cv1[1]
print("Best score for dataset1: ",best_score1)
print()
best_param1 = grid_cv1[2]
print(best_param1)

Execution time for dataset1:  5060.02 s

Best score for dataset1:  0.8882010233117079

{'n_factors': 150, 'lr_all': 0.008, 'init_mean': 0.08, 'reg_all': 0.06}


#### Average-RMSE associated to the two best estimators you tuned for dataset1

In [14]:
current_algo= SVD(n_factors=100,lr_all=0.01,init_mean= 0.1,reg_all=0.07)
cross_validate(current_algo, data1, measures=['RMSE'],cv=kf, n_jobs=4,verbose=True )

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8837  0.8850  0.8827  0.8888  0.8869  0.8854  0.0022  
Fit time          20.97   20.73   20.79   21.27   12.07   19.17   3.55    
Test time         1.17    1.19    1.22    1.12    0.40    1.02    0.31    


{'test_rmse': array([0.88369718, 0.88502727, 0.88269828, 0.8887968 , 0.88694707]),
 'fit_time': (20.969177722930908,
  20.73214864730835,
  20.785199403762817,
  21.27463412284851,
  12.07303762435913),
 'test_time': (1.165158748626709,
  1.1925158500671387,
  1.2241318225860596,
  1.1227946281433105,
  0.39577412605285645)}

### DATASET2

In [276]:
grid_cv2= tuning_SVD(data2)
time2 = grid_cv2[0]
print("Execution time for dataset2: ", round(time2,2),'s')
print()
best_score2 = grid_cv2[1]
print("Best score for dataset2: ",best_score2)
print()
best_param2 = grid_cv2[2]
print(best_param2)

Execution time for dataset2:  406.97 s

Best score for dataset2:  1.8442059490805824

{'n_factors': 150, 'lr_all': 0.008, 'init_mean': 0.02, 'reg_all': 0.1}


#### Average-RMSE associated to the two best estimators you tuned for dataset2

In [277]:
current_algo= SVD(n_factors=150, lr_all=0.008,init_mean=0.02,reg_all=0.1)
cross_validate(current_algo, data2, measures=['RMSE'], cv=kf, n_jobs=4, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.8355  1.8674  1.8520  1.8540  1.8561  1.8530  0.0103  
Fit time          1.41    1.46    1.51    1.60    0.92    1.38    0.24    
Test time         0.03    0.03    0.03    0.04    0.02    0.03    0.01    


{'test_rmse': array([1.83550514, 1.86743343, 1.85196471, 1.85395224, 1.8560896 ]),
 'fit_time': (1.4106719493865967,
  1.4581594467163086,
  1.5050315856933594,
  1.598637342453003,
  0.920992374420166),
 'test_time': (0.031245946884155273,
  0.03124523162841797,
  0.03124833106994629,
  0.04499650001525879,
  0.015651702880859375)}