# Benchmark Evaluation Module
Author: Shiyi Wang

In [23]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
#from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [24]:
# Load the preprocessed ratings table.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that were produced by this project's own preprocessing step.
data = pd.read_pickle('../data/processed_data.pkl')
# Keep only the first 100,000 rows (presumably to keep the benchmark runtime
# manageable -- confirm). Positional slicing is used instead of dropping by
# index label so that exactly the leading rows survive even if index labels
# are not unique.
data = data.iloc[:100000]

In [25]:
# Ratings are declared to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
# Rebind `data` from the pandas DataFrame to a Surprise Dataset built from the
# (user, item, rating) columns, passed in that order.
data = Dataset.load_from_df(
    df=data[['user_id', 'recipe_id', 'rating']],
    reader=reader,
)


#### NormalPredictor

* The NormalPredictor algorithm predicts a random rating drawn from the distribution of the training set, which is assumed to be normal. It is one of the most basic algorithms and serves as a naive reference point for the other models.

#### BaselineOnly

* The BaselineOnly algorithm predicts the baseline estimate for a given user and item.

### k-NN algorithms

#### KNNBasic

* KNNBasic is a basic collaborative filtering algorithm.

#### KNNWithMeans

* KNNWithMeans is a basic collaborative filtering algorithm, taking into account the mean ratings of each user.

#### KNNWithZScore

* KNNWithZScore is a basic collaborative filtering algorithm, taking into account the z-score normalization of each user.

#### KNNBaseline

* KNNBaseline is a basic collaborative filtering algorithm taking into account a baseline rating.

### Matrix Factorization-based algorithms

#### SVD

* The SVD algorithm, when baselines are not used, is equivalent to Probabilistic Matrix Factorization (http://papers.nips.cc/paper/3208-probabilistic-matrix-factorization.pdf)

#### SVDpp

* The SVDpp algorithm is an extension of SVD that takes into account implicit ratings.

### Slope One

* Slope One is a straightforward implementation of the SlopeOne algorithm. (https://arxiv.org/abs/cs/0702144)

### Co-clustering

* Co-clustering is a collaborative filtering algorithm based on co-clustering (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.113.6458&rep=rep1&type=pdf)


We use RMSE (root mean squared error) as our accuracy metric for the predictions; a lower value means more accurate predictions.

In [26]:
# Collects one pandas Series per successfully evaluated algorithm, holding the
# mean of each cross-validation measure plus an 'Algorithm' label.
benchmark = []

# Algorithms to benchmark with 3-fold cross-validation.
algorithms = [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBasic(k=10)]
# Full candidate list (not used in this run):
#algorithms = [SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    try:
        # Perform cross validation (add 'MAE' to `measures` to also report MAE).
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

        # Average every reported measure across the folds.
        summary = pd.DataFrame.from_dict(results).mean(axis=0)
        # Label the row with the algorithm's class name. pd.concat is used
        # because Series.append was removed in pandas 2.0.
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        row = pd.concat([summary, label])
    except Exception as exc:
        # Best effort: one failing algorithm must not abort the whole benchmark,
        # but the failure is reported instead of being silently discarded.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt
        # a long-running benchmark.
        print("Failed: ", str(algorithm), "->", repr(exc), "\n\n")
    else:
        benchmark.append(row)
        print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fac6b3f8880>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7fac6b3f84c0>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7fac6b3f8cd0>, <surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x7fac6b3f88b0>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x7fac6b3f8670>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fac6b3f8880>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fac6b3f8880> 


Starting:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7fac6b3f84c0>
Done:  <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7fac6b3f84c0> 


Starting:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7fac6b3f8cd0>
Done:  <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7fac6b3f8cd0> 


Starting:  <surprise.

In [27]:
# Display the raw list of per-algorithm result Series collected by the benchmark loop.
benchmark

[test_rmse    1.223165
 fit_time     4.761671
 test_time    0.432596
 Algorithm         SVD
 dtype: object,
 test_rmse     1.225148
 fit_time     27.876124
 test_time     1.717175
 Algorithm        SVDpp
 dtype: object,
 test_rmse    1.314832
 fit_time     7.274538
 test_time    1.246718
 Algorithm    SlopeOne
 dtype: object,
 test_rmse           1.568506
 fit_time            0.124535
 test_time            0.30532
 Algorithm    NormalPredictor
 dtype: object,
 test_rmse     1.279595
 fit_time     59.794599
 test_time     3.059934
 Algorithm     KNNBasic
 dtype: object]

In [28]:
# Combine the per-algorithm Series into one table, indexed by algorithm name
# and ordered by ascending test RMSE.
surprise_results = (
    pd.DataFrame(benchmark)
    .set_index('Algorithm')
    .sort_values('test_rmse')
)

In [29]:
# Display the benchmark table, sorted by ascending test RMSE.
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.223165,4.761671,0.432596
SVDpp,1.225148,27.876124,1.717175
KNNBasic,1.279595,59.794599,3.059934
SlopeOne,1.314832,7.274538,1.246718
NormalPredictor,1.568506,0.124535,0.30532


In [30]:
# Write the benchmark table to CSV; the target path is relative to the
# notebook's working directory.
results_csv_path = "../evaluations/benchmark_results.csv"
surprise_results.to_csv(results_csv_path)