## Collaborative Filtering Using Surprise

In [1]:
import os
import random
import sys

import numpy as np
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import evaluate

In [2]:
# Read the ratings file and relabel the 'beer' column as 'item' in one chained step.
df = (
    pd.read_csv("user_beer_rating_facorized.csv", header=0)
    .rename(columns={'beer': 'item'})
)

In [3]:
# Inspect the dtype of each column after the rename.
df.dtypes

user        int64
item        int64
rating    float64
dtype: object

**Load the data from the pandas DataFrame into a Surprise `Dataset`.**

In [4]:
# Seed both RNGs before splitting: fold assignment and the models' initial
# factors are drawn at random, so unseeded runs report different metrics
# each time the notebook is executed.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# load_from_df expects the columns in this order: user id, item id, rating.
data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader=reader)
# Partition the ratings into 5 folds for cross-validation.
data.split(n_folds=5)

**Create baseline SVD and SVD++ models and evaluate each with 5-fold cross-validation, reporting the root-mean-square error (RMSE) and mean absolute error (MAE).**

In [5]:
# Baseline: SVD constructed with no arguments (library defaults),
# scored with RMSE and MAE on the folds of `data`.
algo = SVD()
evaluate(algo, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.3698
MAE:  0.2643
------------
Fold 2
RMSE: 0.3701
MAE:  0.2627
------------
Fold 3
RMSE: 0.3787
MAE:  0.2665
------------
Fold 4
RMSE: 0.3704
MAE:  0.2642
------------
Fold 5
RMSE: 0.3744
MAE:  0.2655
------------
------------
Mean RMSE: 0.3727
Mean MAE : 0.2647
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.26428840883007015,
                             0.26272849683456079,
                             0.26646979548643906,
                             0.26424793253284551,
                             0.26553477426913369],
                            'rmse': [0.36982354950697655,
                             0.37012655126779703,
                             0.37872689946118532,
                             0.37035246234109054,
                             0.37438312050184619]})

In [6]:
from surprise import SVDpp
# Same evaluation as for SVD, this time with a default-constructed SVD++ model.
algo1 = SVDpp()
evaluate(algo1, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm SVDpp.

------------
Fold 1
RMSE: 0.3676
MAE:  0.2622
------------
Fold 2
RMSE: 0.3673
MAE:  0.2601
------------
Fold 3
RMSE: 0.3761
MAE:  0.2640
------------
Fold 4
RMSE: 0.3677
MAE:  0.2616
------------
Fold 5
RMSE: 0.3720
MAE:  0.2630
------------
------------
Mean RMSE: 0.3701
Mean MAE : 0.2622
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.26221734883579956,
                             0.2600918473169252,
                             0.2639877237471891,
                             0.26158469529747819,
                             0.26295382366801567],
                            'rmse': [0.36763823532881901,
                             0.36731697676598074,
                             0.37612183132847776,
                             0.36770036172706438,
                             0.37195648505757578]})

**Now build a training set from the full dataset so the models can be fitted for prediction.**

In [7]:
# Build a single trainset from all ratings in `data` (no held-out fold).
trainset = data.build_full_trainset()

In [8]:
# Fit the SVD model on the full trainset.
algo.train(trainset)

In [9]:
# Example user/item pair and a rating for it, for spot-checking predictions.
# NOTE(review): `actual_rating` is presumably the observed rating for this pair
# in the dataset -- confirm against the source data.
user = 1465
item = 31
actual_rating = 4.1

In [10]:
# Predict the rating for the example pair using the variables defined for it
# (same values as the former literals 31 and 4.1); passing the known rating
# makes the printed result show r_ui next to the estimate.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(algo.predict(user, item, actual_rating))

user: 1465       item: 31         r_ui = 4.10   est = 4.18   {u'was_impossible': False}


**Using the SVD++ algorithm for prediction**

In [11]:
# Fit the SVD++ model on the same full trainset.
algo1.train(trainset)

In [12]:
# Predict the same example pair with SVD++, using the named variables
# (same values as the former literals 31 and 4.1) so both models are
# queried with identical inputs.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(algo1.predict(user, item, actual_rating))

user: 1465       item: 31         r_ui = 4.10   est = 4.22   {u'was_impossible': False}


**Hyperparameter tuning with grid search**

In [13]:
# Candidate values per hyperparameter: 2 x 2 x 2 = 8 combinations in total.
param_grid = {
    'n_epochs': [1, 5],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

In [14]:
from surprise import GridSearch

In [15]:
# Set up a grid search over param_grid for SVD, scoring each combination by RMSE and FCP.
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])

[{'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 1}, {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 5}, {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 1}, {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 5}, {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 1}, {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}, {'lr_all': 0.005, 'reg_all': 0.6, 'n_epochs': 1}, {'lr_all': 0.005, 'reg_all': 0.6, 'n_epochs': 5}]


In [16]:
# Evaluate every parameter combination on `data`.
grid_search.evaluate(data)

------------
Parameters combination 1 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 1}
------------
Mean RMSE: 0.4129
Mean FCP : 0.6008
------------
------------
Parameters combination 2 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 5}
------------
Mean RMSE: 0.3902
Mean FCP : 0.6579
------------
------------
Parameters combination 3 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 1}
------------
Mean RMSE: 0.4132
Mean FCP : 0.6078
------------
------------
Parameters combination 4 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 5}
------------
Mean RMSE: 0.3927
Mean FCP : 0.6608
------------
------------
Parameters combination 5 of 8
params:  {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 1}
------------
Mean RMSE: 0.4004
Mean FCP : 0.6364
------------
------------
Parameters combination 6 of 8
params:  {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}
------------
Mean RMSE: 0.3827
Mean FCP : 0.6656
------------
------------
Parameter

In [17]:
# Best RMSE found by the SVD grid search.
print(grid_search.best_score['RMSE'])

0.382682855402


In [18]:
# Parameter combination that achieved the best RMSE.
print(grid_search.best_params['RMSE'])

{'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}


In [19]:
# Collect the per-combination grid-search results into a DataFrame.
results_df = pd.DataFrame.from_dict(grid_search.cv_results)
# End the cell on the frame itself so Jupyter renders it as a table;
# print() wraps the wide frame across several plain-text blocks.
results_df

        FCP      RMSE  lr_all  \
0  0.600811  0.412888   0.002   
1  0.657917  0.390198   0.002   
2  0.607762  0.413239   0.002   
3  0.660822  0.392739   0.002   
4  0.636413  0.400443   0.005   
5  0.665603  0.382683   0.005   
6  0.642358  0.401658   0.005   
7  0.665657  0.385915   0.005   

                                              params  reg_all  \
0  {u'lr_all': 0.002, u'reg_all': 0.4, u'n_epochs...      0.4   
1  {u'lr_all': 0.002, u'reg_all': 0.4, u'n_epochs...      0.4   
2  {u'lr_all': 0.002, u'reg_all': 0.6, u'n_epochs...      0.6   
3  {u'lr_all': 0.002, u'reg_all': 0.6, u'n_epochs...      0.6   
4  {u'lr_all': 0.005, u'reg_all': 0.4, u'n_epochs...      0.4   
5  {u'lr_all': 0.005, u'reg_all': 0.4, u'n_epochs...      0.4   
6  {u'lr_all': 0.005, u'reg_all': 0.6, u'n_epochs...      0.6   
7  {u'lr_all': 0.005, u'reg_all': 0.6, u'n_epochs...      0.6   

                                              scores  
0     {u'FCP': 0.60081117343, u'RMSE': 0.4128883147}  
1   {u

In [20]:
# Same grid search as for SVD, this time over the SVD++ algorithm.
grid_search1 = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'])

[{'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 1}, {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 5}, {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 1}, {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 5}, {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 1}, {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}, {'lr_all': 0.005, 'reg_all': 0.6, 'n_epochs': 1}, {'lr_all': 0.005, 'reg_all': 0.6, 'n_epochs': 5}]


In [21]:
# Evaluate every SVD++ parameter combination on `data`.
grid_search1.evaluate(data)

------------
Parameters combination 1 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 1}
------------
Mean RMSE: 0.4099
Mean FCP : 0.6281
------------
------------
Parameters combination 2 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.4, 'n_epochs': 5}
------------
Mean RMSE: 0.3897
Mean FCP : 0.6615
------------
------------
Parameters combination 3 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 1}
------------
Mean RMSE: 0.4110
Mean FCP : 0.6319
------------
------------
Parameters combination 4 of 8
params:  {'lr_all': 0.002, 'reg_all': 0.6, 'n_epochs': 5}
------------
Mean RMSE: 0.3924
Mean FCP : 0.6622
------------
------------
Parameters combination 5 of 8
params:  {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 1}
------------
Mean RMSE: 0.3993
Mean FCP : 0.6475
------------
------------
Parameters combination 6 of 8
params:  {'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}
------------
Mean RMSE: 0.3825
Mean FCP : 0.6657
------------
------------
Parameter

In [25]:
# Best RMSE found by the SVD++ grid search, followed by the parameters that produced it.
print(grid_search1.best_score['RMSE'])
print(grid_search1.best_params['RMSE'])

0.382518396706
{'lr_all': 0.005, 'reg_all': 0.4, 'n_epochs': 5}
