Table of contents:
1. Settings
    1. data settings
    2. specify hyperparameter grid
2. Data preparation
3. Benchmarking experiments. Hyperpara tuning + prediction
    1. KNN
    2. NMF
    3. SlopeOne
    4. SVD
4. Summary of results


In [43]:
import pandas as pd
import random as rnd
from myFunctions import prep
from myFunctions_benchmark import read_data,rename_to_surprise_notation
from myFunctions_benchmark import spearman_surprise, kendall_surprise, RMSE_surprise, MAE_surprise
from surprise import Reader, Dataset
from surprise import SVD, KNNBasic, SlopeOne, NMF
from surprise.model_selection import GridSearchCV
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

np.random.seed(0)
rnd.seed(0)

# 1. Settings

Dataset selection

In [44]:
#Select input data:
toy_data = False
dataset_1 = False #not publicly available
dataset_2 = False #not publicly available
dataset_3 = True

Specify grid search for hyperpara tuning

In [45]:
param_grid_svd = {
    "n_factors":[5,10,15,20],
    "n_epochs":[50,100,200],
    "reg_pu":[0.01,0.02,0.05],
    "reg_qi":[0.01,0.02,0.05]
}

param_grid_nmf = {
    "n_factors":[5,10,15,20],
    "n_epochs":[50,100,200],
    "reg_pu":[0.01,0.02,0.05],
    "reg_qi":[0.01,0.02,0.05]
}

param_grid_knn = {
    "k":[1,5,10,20],
    "min_k'":[1,2,5],
    "sim_options": {'name':['cosine','MSD','pearson','pearson_baseline'],
                    'user_based':[True,False]}
}


# 2. Data preparation

In [46]:
datasets = [toy_data, dataset_1, dataset_2, dataset_3]
df = read_data(datasets)

#start time experiment:
start_time_experiment = datetime.now()

In [47]:
df,df_train_val,df_train,df_val,df_test = prep(df,test_size=0.25,out_of_time=1,one_in_train_val=1)

df_train_val=rename_to_surprise_notation(df_train_val)
df_train=rename_to_surprise_notation(df_train)
df_val=rename_to_surprise_notation(df_val)
df_test=rename_to_surprise_notation(df_test)


In [48]:
columns_to_convert = {'userId':'string','movieId':'string','rating':'float','timestamp':'string'}
df_train_val = df_train_val.astype(columns_to_convert)
df_train = df_train.astype(columns_to_convert)
df_val = df_val.astype(columns_to_convert)
df_test = df_test.astype(columns_to_convert)


In [49]:
df_train_val, df_train, df_val, df_test = [df.astype({'userId':'string','movieId':'string','rating':'float','timestamp':'string'}) for df in [df_train_val, df_train, df_val, df_test]]

In [50]:
#Create reader object
reader = Reader()

In [51]:
# Load the train_val, train, val, and test dataset.
data_train_val = Dataset.load_from_df(df_train_val[['userId', 'movieId', 'rating']], reader)
data_train = Dataset.load_from_df(df_train[['userId', 'movieId', 'rating']], reader)
data_val = Dataset.load_from_df(df_val[['userId', 'movieId', 'rating']], reader)
data_test = Dataset.load_from_df(df_test[['userId', 'movieId', 'rating']], reader)

raw_ratings_train_val = data_train_val.raw_ratings
raw_ratings_train = data_train.raw_ratings
raw_ratings_val = data_val.raw_ratings
raw_ratings_test = data_test.raw_ratings

In [52]:
data_train_val.raw_ratings = raw_ratings_train_val
data_train.raw_ratings = raw_ratings_train
data_val.raw_ratings = raw_ratings_val
data_test.raw_ratings = raw_ratings_test

trainvalset = data_train_val.build_full_trainset()
trainset = data_train.build_full_trainset()
valset = data_val.build_full_trainset()
testset = data_test.construct_testset(raw_ratings_test)

# 3. Benchmarking algorithms: KNN, NMF, SVD, SlopeOne

## 3.1 KNNBasic algorithm

In [53]:
gs_knn = GridSearchCV(KNNBasic, param_grid_knn, measures=["rmse", "mae"], cv=3)
gs_knn.fit(data_train_val)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similar

In [54]:
# best RMSE score
print(gs_knn.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs_knn.best_params["rmse"])

# We can now use the algorithm that yields the best rmse:
algo_knn = gs_knn.best_estimator["rmse"]
algo_knn.fit(trainvalset)

1.009479900240483
{'k': 5, "min_k'": 1, 'sim_options': {'name': 'pearson', 'user_based': True}}
Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1f523dba2f0>

In [55]:
pred_knn = algo_knn.test(testset)
print("Results on testset with knn: ")
#accuracy.rmse(pred_knn)
print('RMSE: ' + str(RMSE_surprise(pred_knn)))
print('MAE: ' + str(MAE_surprise(pred_knn)))
#accuracy.fcp(pred_knn)
print('Spearman: ' + str(spearman_surprise(pred_knn)))
print('Kendall: ' + str(kendall_surprise(pred_knn)))
print("Selected hyperparameters:" +str((gs_knn.best_params["rmse"])))

Results on testset with knn: 
RMSE: 0.1952
MAE: 0.1616
Spearman: 0.0202
Kendall: 0.017
Selected hyperparameters:{'k': 5, "min_k'": 1, 'sim_options': {'name': 'pearson', 'user_based': True}}


## 3.2 NMF algorithm

In [56]:
### NMF algorithm
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=["rmse", "mae"], cv=3)
gs_nmf.fit(data_train_val)

# best RMSE score
print(gs_nmf.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs_nmf.best_params["rmse"])

1.0253400344337715
{'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.02, 'reg_qi': 0.02}


In [57]:

# We can now use the algorithm that yields the best rmse:
algo_nmf = gs_nmf.best_estimator["rmse"]
algo_nmf.fit(trainvalset)
pred_nmf = algo_nmf.test(testset)
print("Results on testset with nmf: ")
print('RMSE: ' + str(RMSE_surprise(pred_nmf)))
print('MAE: ' + str(MAE_surprise(pred_nmf)))
print('Spearman: ' + str(spearman_surprise(pred_nmf)))
print('Kendall: ' + str(kendall_surprise(pred_nmf)))
print("Selected hyperparameters:" + str((gs_nmf.best_params["rmse"])))

Results on testset with nmf: 
RMSE: 0.1921
MAE: 0.1565
Spearman: 0.186
Kendall: 0.149
Selected hyperparameters:{'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.02, 'reg_qi': 0.02}


## 3.3 SlopeOne algorithm

In [58]:
algo_slope = SlopeOne()
algo_slope.fit(trainvalset)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x1f55e2b7730>

In [59]:
# We can now use the algorithm that yields the best rmse:
#algo_slope = gs_slope.best_estimator["rmse"]
#algo_slope.fit(trainvalset)
pred_slope = algo_slope.test(testset)
print("Results on testset with SlopeOne: ")
print('RMSE: ' + str(RMSE_surprise(pred_slope)))
print('MAE: ' + str(MAE_surprise(pred_slope)))
print('Spearman: ' + str(spearman_surprise(pred_slope)))
print('Kendall: ' + str(kendall_surprise(pred_slope)))
#print("Selected hyperparameters:" + str((gs_slope.best_params["rmse"])))

Results on testset with SlopeOne: 
RMSE: 0.2007
MAE: 0.1621
Spearman: 0.1672
Kendall: 0.1474


## 3.4 SVD algorithm

In [60]:
### NMF algorithm
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=["rmse", "mae"], cv=3)
gs_svd.fit(data_train_val)

# best RMSE score
print(gs_svd.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs_svd.best_params["rmse"])

0.9205502695916028
{'n_factors': 5, 'n_epochs': 50, 'reg_pu': 0.05, 'reg_qi': 0.02}


In [61]:
# We can now use the algorithm that yields the best rmse:
algo_svd = gs_svd.best_estimator["rmse"]
algo_svd.fit(trainvalset)
pred_svd = algo_svd.test(testset)
print("Results on testset with SVD: ")
print('RMSE: ' + str(RMSE_surprise(pred_svd)))
print('MAE: ' + str(MAE_surprise(pred_svd)))
print('Spearman: ' + str(spearman_surprise(pred_svd)))
print('Kendall: ' + str(kendall_surprise(pred_svd)))
print("Selected hyperparameters:" + str((gs_svd.best_params["rmse"])))

Results on testset with SVD: 
RMSE: 0.1801
MAE: 0.1469
Spearman: 0.1989
Kendall: 0.1592
Selected hyperparameters:{'n_factors': 5, 'n_epochs': 50, 'reg_pu': 0.05, 'reg_qi': 0.02}


### 4. Summary of results

In [62]:
models = {'svd': gs_svd, 'knn': gs_knn, 'nmf': gs_nmf, 'slope': algo_slope}
predictions = {'svd': pred_svd, 'knn': pred_knn, 'nmf': pred_nmf, 'slope': pred_slope}
metrics = {'mae': MAE_surprise, 'rmse': RMSE_surprise, 'spearman': spearman_surprise, 'kendall': kendall_surprise}

for model_name, model in models.items():
    print("\nResults on testset with {}: ".format(model_name))
    if model_name != 'slope':   #slopeOne does not have hyperparameters to tune.
        print('Best params:', model.best_params["rmse"])
    for metric_name, metric in metrics.items():
        print(metric_name.upper() + ':', metric(predictions[model_name]))


Results on testset with svd: 
Best params: {'n_factors': 5, 'n_epochs': 50, 'reg_pu': 0.05, 'reg_qi': 0.02}
MAE: 0.1469
RMSE: 0.1801
SPEARMAN: 0.1989
KENDALL: 0.1592

Results on testset with knn: 
Best params: {'k': 5, "min_k'": 1, 'sim_options': {'name': 'pearson', 'user_based': True}}
MAE: 0.1616
RMSE: 0.1952
SPEARMAN: 0.0202
KENDALL: 0.017

Results on testset with nmf: 
Best params: {'n_factors': 20, 'n_epochs': 100, 'reg_pu': 0.02, 'reg_qi': 0.02}
MAE: 0.1565
RMSE: 0.1921
SPEARMAN: 0.186
KENDALL: 0.149

Results on testset with slope: 
MAE: 0.1621
RMSE: 0.2007
SPEARMAN: 0.1672
KENDALL: 0.1474


# 5. Write results to .txt file

In [63]:
now = datetime.now()
duration = now - start_time_experiment
file_name = "Results/results_benchmark_{}.txt".format(now.strftime("%Y-%m-%d %H-%M-%S"))

# Open a new text file in write mode
with open(file_name, "w") as file:
    file.write("Results for benchmarking methods. \nStart time experiment:" + str(start_time_experiment) + "\nend time experiment:" + str(now) + "\ntime elapsed:" + str(duration))
    for model_name, model in models.items():
        file.write("\n\nResults on testset with {}: ".format(model_name))
        if model_name != 'slope':
            file.write('\nBest params:'+ str(model.best_params["rmse"]))
        for metric_name, metric in metrics.items():
            file.write("\n"+str(metric_name.upper()) + ':'+ str(metric(predictions[model_name])))
