# Restaurant Recommendation System

## Base models for MVP (minimum viable product)

In [94]:
#Import standard libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [95]:
#Import surprise libraries
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, baseline_only
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise import accuracy, dataset
from surprise.dataset import DatasetAutoFolds

In [96]:
pd.set_option('display.max_columns', None)

In [97]:
!ls

Rec Sys Modelling.ipynb


In [98]:
#Let's read in our base review dataset
df = pd.read_csv('../notebooks/new_collaborative_limit.csv')

In [99]:
print(df.shape)
df.head()

(12552, 10)


Unnamed: 0.1,Unnamed: 0,business_ref,business_id,name,city,categories,review_id,user_id,stars,user_ref
0,1900,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,b31UZTy2TvnFtkfygJG40Q,bcxcQhp0sKYd9eUnEVUzPA,5,246
1,1901,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,jYxWLyWrWy8dJFQs9DEuEg,RFxjYeLW_aYLdVW3PBwFNg,4,146572
2,1903,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,GGWxoYbx_h2x7a46m0MYRA,BhYROfCjIJsKUk22_IVHig,3,211139
3,1904,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,PslbThtGZ_yOWZxAFc3GVg,J_qpI2jCkwv7vPNz_9JeqA,4,260271
4,1907,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,oRYhx_qYK5slteB5nyEAiQ,NMelfYHO9mncdmZLIABLgQ,5,491834


In [100]:
df.columns

Index(['Unnamed: 0', 'business_ref', 'business_id', 'name', 'city',
       'categories', 'review_id', 'user_id', 'stars', 'user_ref'],
      dtype='object')

In [101]:
#drop out the columns we habve no use for. 
# ratings = df.drop(columns = ['Unnamed: 0', 'name', 'city', 'categories', 'review_id', ])
ratings = df[['user_ref', 'business_ref', 'stars']]
ratings.head()

Unnamed: 0,user_ref,business_ref,stars
0,246,68,5
1,146572,68,4
2,211139,68,3
3,260271,68,4
4,491834,68,5


In [102]:
class DataSet(dataset.DatasetAutoFolds):
    #Creates data set that can be accessed by Surprise including folds for cross validation
    
    def __init__(self, df, reader):
        
        
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in zip(df['user_ref'], 
                                                                         df['business_ref'], df['stars'])]
        self.reader=reader

In [103]:
#instantiate an instance of Reader to enable surprise libraries to usethe data
reader = Reader(rating_scale=(0.0, 5.0))

In [104]:
#Load in our data to a surprise Dataset
data = DataSet(ratings, reader)

### Split data into Train Validation and Split sets

In [105]:
#Extrace raw ratings from the dataset
raw_ratings = data.raw_ratings

In [106]:
#perform data shuffle
random.shuffle(raw_ratings) 

In [107]:
#We'll use a 80/20 train test ratio and 80/20 train validate ration. Train:Validate:Test - 64:16:20
test_threshold = int(.8 * len(raw_ratings))

train_raw_ratings = raw_ratings[:test_threshold] #create the train set
test_raw_ratings = raw_ratings[test_threshold:] #creat the test set

In [108]:
val_threshold = int(.8 * len(train_raw_ratings))

val_raw_ratings = train_raw_ratings[val_threshold:] #create the validation set
train_raw_ratings = train_raw_ratings[:val_threshold] #re_assign the training set
                            

In [109]:
#Now we make the training set the data
data.raw_ratings = train_raw_ratings

## Memory based collaborative filtering
### We start with KNN Basic

In [111]:
#We'll bring in some basic text stylin to help with output clarity. 
# start = "\033[1m"
# end = "\033[0;0m"

In [112]:
def Knn_Basic(data, user, item):
    '''
    Function to run different similarity metrics across KNNBasic method. 
    
    '''
    frame = []
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBasic {} comparison: ". format(i, user))
            results = cross_validate(KNNBasic(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNBasic {} comparison: ". format(i, item))
            results = cross_validate(KNNBasic(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            
    return None


In [113]:
Knn_Basic(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBasic user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0099  0.9980  0.9986  0.9633  0.9543  0.9848  0.0219  
MAE (testset)     0.7570  0.7589  0.7665  0.7371  0.7235  0.7486  0.0159  
RMSE (trainset)   0.7542  0.7545  0.7520  0.7678  0.7685  0.7594  0.0072  
MAE (trainset)    0.5675  0.5661  0.5610  0.5747  0.5740  0.5687  0.0051  
Fit time          0.03    0.03    0.03    0.03    0.02    0.02    0.00    
Test time         0.05    0.04    0.04    0.03    0.03    0.04    0.00    



Evaluation of cosine similarity for KNNBasic item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0182  1.0209  1.0164  1.0259  1.0110  1.0185  0.0049  
MAE (testset)     0.7823  0.7839  0.7755  0.7883  0.7802  0.7820  0.

### KNNBaseline

In [114]:
def Knn_Baseline(data, user, item):
    '''
    Function to run different similarity metrics across KNNBaseline method. 
    
    '''
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBaseline {} comparison: ". format(i, user))
            cross_validate(KNNBaseline(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNBaseline {} comparison: ". format(i, item))
            cross_validate(KNNBaseline(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
    return None

In [115]:
Knn_Baseline(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBaseline user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9447  0.9467  0.9719  0.9020  0.9626  0.9456  0.0240  
MAE (testset)     0.7293  0.7228  0.7581  0.6953  0.7363  0.7284  0.0204  
RMSE (trainset)   0.7134  0.7205  0.7190  0.7289  0.7186  0.7201  0.0050  
MAE (trainset)    0.5353  0.5478  0.5384  0.5507  0.5427  0.5430  0.0057  
Fit time          0.03    0.03    0.03    0.03    0.03    0.03    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    



Evaluation of cosine similarity for KNNBaseline item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9666  0.9433  1.0057  0.9755  0.9312  0.9645  0.0260  
MAE (testset)     0.7342  0.7268  0.7748  0.7473  0.7111

### KNNWithMeans

In [116]:
def Knn_With_Means(data, user, item):
    '''
    Function to run different similarity metrics across KNNBaseline method. 
    
    '''
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBWithMeans {} comparison: ". format(i, user))
            cross_validate(KNNWithMeans(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNWithMeans {} comparison: ". format(i, item))
            cross_validate(KNNWithMeans(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
    return None

In [117]:
Knn_With_Means(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBWithMeans user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9807  0.9486  0.9480  0.9714  1.0071  0.9712  0.0220  
MAE (testset)     0.7530  0.7250  0.7292  0.7518  0.7734  0.7465  0.0177  
RMSE (trainset)   0.7125  0.7176  0.7150  0.7092  0.7075  0.7123  0.0037  
MAE (trainset)    0.5382  0.5415  0.5369  0.5350  0.5316  0.5367  0.0033  
Fit time          0.02    0.03    0.02    0.02    0.02    0.02    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    



Evaluation of cosine similarity for KNNWithMeans item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9922  1.0027  0.9593  0.9908  0.9905  0.9871  0.0146  
MAE (testset)     0.7582  0.7645  0.7453  0.7533  0

From the initial memory based model runs we can see that there is a preponderence of overfit training RMSE results and also less than spectacular test RMSE results. 

The best mean test RMSE is 0.9514 for KNN Baseline with cosine similarity. 

## Model Based Collaborative Filtering

### Singular Value Decomposition (SVD) with GridSearchCV

In [118]:
param_grid = {'n_factors': [10, 50, 250], 'n_epochs':[5, 10, 15], 'lr_all': [0.002, 0.005, 0.001], 
              'reg_all':[0.005, 0.01, 0.05]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)

surprise.model_selection.search.GridSearchCV

In [119]:
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   49.1s finished


In [120]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.9004111385150948}
{'rmse': {'n_factors': 10, 'n_epochs': 15, 'lr_all': 0.005, 'reg_all': 0.005}}


In [121]:
#Let's tune the n_factors, n_epochs and reg_all hyperparamters as the best fit is at the edges of the ranges
param_grid = {'n_factors': [10, 50, 250], 'n_epochs':[12, 15, 17], 'lr_all': [0.002, 0.005, 0.001], 
              'reg_all':[0.005, 0.01, 0.05]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)

surprise.model_selection.search.GridSearchCV

In [122]:
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   54.3s finished


In [123]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8946134541218924}
{'rmse': {'n_factors': 10, 'n_epochs': 17, 'lr_all': 0.005, 'reg_all': 0.01}}


In [124]:
#Let's keep tuning as the RMSE is still reducing. 
param_grid = {'n_factors': [5, 10, 15], 'n_epochs':[15, 17, 20], 'lr_all': [0.002, 0.005, 0.001], 
              'reg_all':[0.01, 0.05, 0.1]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   41.8s finished


In [125]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8966033265431452}
{'rmse': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}}


In [126]:
#And again we have achieved iterative improvement so we continue to tune. 
param_grid = {'n_factors': [2, 5, 8], 'n_epochs':[15, 20, 25], 'lr_all': [0.002, 0.005, 0.001], 
              'reg_all':[0.05, 0.08, 0.11]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   42.4s finished


In [127]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8958953883253387}
{'rmse': {'n_factors': 8, 'n_epochs': 25, 'lr_all': 0.005, 'reg_all': 0.11}}


In [128]:
print('Our mean train RMSE achieved with SVD is: ', grid_search.cv_results['mean_train_rmse'].mean())
print('Our mean test RMSE achieved with SVD is: ', grid_search.cv_results['mean_test_rmse'].mean()) 

Our mean train RMSE achieved with SVD is:  0.8520153091236329
Our mean test RMSE achieved with SVD is:  0.9145258211620347


### Matrix Factorisation with Alternating Least Squares or Stochastic Gradient Descent

In [129]:
#We would like to tune hyperparamters with GridSearchCV but let's see if we can automate the tuning.
#We can tune items regularization(default=10), user regularization(def=15) and the numbe rof iterations(def=10).
#We'll use the default values as the midpoint of our tuning range. 
epochs = [3, 8, 10, 12, 18]
reg_u = [8, 10, 15, 20, 23]
reg_i = [3, 8, 10, 12, 18]

params = [[i, j, k] for i in epochs
        for j in reg_u
        for k in reg_i]
print('Possible hyperparameter permutations: ', len(params))

Possible hyperparameter permutations:  125


In [130]:
bsl_options_scores = {}


for bsl_perm in params:
    bsl_options = {'method': 'als', 
                  'n_epochs': bsl_perm[0],
                  'reg_u': bsl_perm[1],
                  'reg_i': bsl_perm[2]}
    
    algo = baseline_only.BaselineOnly(bsl_options=bsl_options)
    a = cross_validate(algo, data, measures=['RMSE'], cv=5, return_train_measures=True, verbose=False);
    
    bsl_perm_2 = (str(bsl_perm[0]) + ' ' + str(bsl_perm[1]) + ' ' + str(bsl_perm[2]))
    bsl_options_scores[bsl_perm_2] = {'mean_train_rmse': a['train_rmse'].mean(), 'mean_test_rmse': a['test_rmse'].mean()}

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [131]:
sorted([[k, v] for k, v in bsl_options_scores.items()], key = lambda x: x[1]['mean_train_rmse'])[:10]

[['12 8 3',
  {'mean_train_rmse': 0.765000700628215,
   'mean_test_rmse': 0.8920856604456269}],
 ['8 8 3',
  {'mean_train_rmse': 0.765243974786446,
   'mean_test_rmse': 0.8897639619646764}],
 ['10 8 3',
  {'mean_train_rmse': 0.7652690015975401,
   'mean_test_rmse': 0.8898425696324189}],
 ['18 8 3',
  {'mean_train_rmse': 0.7656088546563178,
   'mean_test_rmse': 0.8874418397323242}],
 ['3 8 3',
  {'mean_train_rmse': 0.7656091296958087,
   'mean_test_rmse': 0.8888094285718273}],
 ['18 10 3',
  {'mean_train_rmse': 0.7673972699435068,
   'mean_test_rmse': 0.8922374522570922}],
 ['12 10 3',
  {'mean_train_rmse': 0.7677787974534376,
   'mean_test_rmse': 0.890206761960212}],
 ['10 10 3',
  {'mean_train_rmse': 0.7679249744083433,
   'mean_test_rmse': 0.8890025237756142}],
 ['8 10 3',
  {'mean_train_rmse': 0.7680248373411636,
   'mean_test_rmse': 0.8893573064303245}],
 ['3 10 3',
  {'mean_train_rmse': 0.7680640980316991,
   'mean_test_rmse': 0.8896932385590425}]]

The optimimum hyperparameters are located on the edge of the ranges. We should retune to see if we can improve the rmse. 

In [132]:
#On the previous run we had an optimum result at 18, 8, 3. Lets reduce the range for reg_u and reg_i to see if there
#is any room for improvement. 
epochs = [15, 18, 20, 25]
reg_u = [3, 5, 8, 12, 15]
reg_i = [2, 3, 4, 5]

params = [[i, j, k] for i in epochs
        for j in reg_u
        for k in reg_i]
print('Possible hyperparameter permutations: ', len(params))


bsl_options_scores = {}


for bsl_perm in params:
    bsl_options = {'method': 'als', 
                  'n_epochs': bsl_perm[0],
                  'reg_u': bsl_perm[1],
                  'reg_i': bsl_perm[2]}
    
    algo = baseline_only.BaselineOnly(bsl_options=bsl_options)
    a = cross_validate(algo, data, measures=['RMSE'], cv=5, return_train_measures=True, verbose=False);
    
    bsl_perm_2 = (str(bsl_perm[0]) + ' ' + str(bsl_perm[1]) + ' ' + str(bsl_perm[2]))
    bsl_options_scores[bsl_perm_2] = {'mean_train_rmse': a['train_rmse'].mean(), 'mean_test_rmse': a['test_rmse'].mean()}

Possible hyperparameter permutations:  80
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als.

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [133]:
sorted([[k, v] for k, v in bsl_options_scores.items()], key = lambda x: x[1]['mean_train_rmse'])[:10]

[['18 3 2',
  {'mean_train_rmse': 0.7426245770395846,
   'mean_test_rmse': 0.8994048244475124}],
 ['25 3 2',
  {'mean_train_rmse': 0.743137762884701,
   'mean_test_rmse': 0.8961589972681743}],
 ['20 3 2',
  {'mean_train_rmse': 0.7433919825531489,
   'mean_test_rmse': 0.8953963850820719}],
 ['15 3 2',
  {'mean_train_rmse': 0.7442551662273251,
   'mean_test_rmse': 0.8884144183680489}],
 ['18 5 2',
  {'mean_train_rmse': 0.7455381016295549,
   'mean_test_rmse': 0.901069825782886}],
 ['25 5 2',
  {'mean_train_rmse': 0.7470877955423078,
   'mean_test_rmse': 0.8899295717276564}],
 ['15 5 2',
  {'mean_train_rmse': 0.7471315641127286,
   'mean_test_rmse': 0.8892700366910307}],
 ['20 5 2',
  {'mean_train_rmse': 0.747243647172992,
   'mean_test_rmse': 0.8890149109232034}],
 ['25 8 2',
  {'mean_train_rmse': 0.7508400742895486,
   'mean_test_rmse': 0.8952656383637653}],
 ['18 8 2',
  {'mean_train_rmse': 0.7508538522610693,
   'mean_test_rmse': 0.8943305777093901}]]

The attempt to improve the hyperparameters did not yield any improved results. in fact training data is slightly more overfit and the test RMSE is slightly worse. 
So we can progress using the preferred ALS model based filter as our current optimum model. 

In [134]:
#Let's run a cross validation based on the preferred ALS hyperparamters. 
best_bsl_option = {'method': 'als', 
                   'n_epochs': 12,
                   'reg_u': 8, 
                   'reg_iu': 3}

best_algo = baseline_only.BaselineOnly(bsl_options=best_bsl_option)
best_cv = cross_validate(best_algo, data, measures=['rmse'], cv=5, verbose=True, return_train_measures=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8735  0.8950  0.9147  0.9109  0.9103  0.9009  0.0152  
RMSE (trainset)   0.8234  0.8191  0.8141  0.8154  0.8148  0.8174  0.0035  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.01    0.05    0.01    0.01    0.01    0.01    0.02    


## Model Predictions and Evaluation

In [135]:
#we now retrain on the whole training set ie training and validation together!
trainset = data.build_full_trainset()
best_algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1a1e2c52b0>

In [136]:
# testset = data.construct_testset(test_raw_ratings) #testset is now the test set sample as created at top of page
# predictions = best_algo.test(testset)
# print('Test set accuracy is: ', end=' ')
# accuracy.rmse(predictions)

In [137]:
valset = data.construct_testset(val_raw_ratings)
predictions = best_algo.test(valset)
print('Validation set accuracy is: {}' .format(accuracy.rmse(predictions)))

RMSE: 0.8713
Validation set accuracy is: 0.8713428499778241


Using our original dataframe we can now undertake a quick comparison to see how our model would predict a small sample of from these know values. 

In [138]:
ratings.head(10)

Unnamed: 0,user_ref,business_ref,stars
0,246,68,5
1,146572,68,4
2,211139,68,3
3,260271,68,4
4,491834,68,5
5,1168645,68,4
6,35710,380,5
7,77088,380,4
8,438436,380,5
9,590862,380,4


In [139]:
ratings.shape

(12552, 3)

Now we can call our model and see what we would predict each user would rate for the given businesses. 

In [140]:
for i in range(10):
    prediction = round(best_algo.predict(uid = ratings.user_ref[i], iid = ratings.business_ref[i])[3], 2)
    print(ratings.user_ref[i], '  ', ratings.business_ref[i], '   ', ratings.stars[i], '   ', 
         prediction, '   ', round(abs(ratings.stars[i]-prediction), 2))

246    68     5     4.06     0.94
146572    68     4     4.38     0.38
211139    68     3     4.06     1.06
260271    68     4     3.9     0.1
491834    68     5     3.85     1.15
1168645    68     4     3.89     0.11
35710    380     5     4.15     0.85
77088    380     4     4.08     0.08
438436    380     5     4.18     0.82
590862    380     4     3.94     0.06


### Create table to show accuracy of current collaborative model

In [189]:

pred_df = ratings.copy().head(10)
d = pd.DataFrame()

for i in range(10):
    prediction = round(best_algo.predict(uid = ratings.user_ref[i], iid = ratings.business_ref[i])[3], 2)
    difference = round(abs(ratings.stars[i]-prediction), 2)
#     print(ratings.user_ref[i], '  ', ratings.business_ref[i], '   ', ratings.stars[i], '   ', 
#          prediction, '   ', difference)
    
    temp = pd.DataFrame(
        {
            'Prediction': prediction,
            'Difference': difference
        
        }, 
        index=[0]
    
    
    )
    
    d = pd.concat([d, temp], axis=0)
    
    
    
d = d.reset_index().drop(columns =['index'])

pred_acc = pd.concat([pred_df, d], axis=1)
pred_acc


Unnamed: 0,user_ref,business_ref,stars,Prediction,Difference
0,246,68,5,4.06,0.94
1,146572,68,4,4.38,0.38
2,211139,68,3,4.06,1.06
3,260271,68,4,3.9,0.1
4,491834,68,5,3.85,1.15
5,1168645,68,4,3.89,0.11
6,35710,380,5,4.15,0.85
7,77088,380,4,4.08,0.08
8,438436,380,5,4.18,0.82
9,590862,380,4,3.94,0.06


As expected there is some variance in our predictions and we are ~0.6 of a point out on average across this small sample. On the scale we're working with that's an average of ~15% error on average but in worst case predictions we are ~27% out on the real world value. There is clearly some room for improvement here and we shall investigate further in the next phase of the project. 

We'll progress to make a simple recommender based on this model. 
The following recommender may be a little crude for the purposes of recommending restaurants however it is a good kicking off point. It works by asking a new user to rate previous restaurants they may have visited and then offers them a basket of new restaurants that the may wish to visit.

Future iterations of this model will likely ask a new user to rate how they feel about certain aspects of a restaurant that they would like to visits such as 'music', 'outdoor-seating', 'fish', 'chinese', 'cosy' etc etc. This approach together with taking the best rated / most popular restaurants will be one of the potential improvements that could be made. 

In [204]:
#We need to import a separate dataset to help with the next section. the dataframe should have three columns including
#busniness_id, name, categories

In [294]:
restaurants = pd.read_csv('../notebooks/new_restaurants.csv')
# print(restaurants.columns)
restaurants.head(2)

Unnamed: 0.1,Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,business_ref
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Cannonmills,"""5 Canonmills""",Edinburgh,MLN,EH3 5HA,55.962444,-3.197662,4.5,16,1,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,68
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Grassmarket,"""13 George IV Bridge, Old Town""",Edinburgh,EDH,EH1 1EE,55.94796,-3.192143,4.0,13,1,Restaurants;Food;Sandwiches;Coffee & Tea;Delis,380


In [92]:
restaurants = restaurants[['business_ref', 'name', 'categories']].replace({';':', '}, regex=True)
restaurants.head(2)

NameError: name 'restaurants' is not defined

In [91]:
def restaurant_rater(df, num, category=None):
    userID = 2011
    ratings_info = []
    
    while num > 0:
        if category:
            restaurant = df[df['categories'].str.contains(category)].sample(1)
        else:
            restaurant = df.sample(1)
        print(restaurant)
        rating = input("how do you rate this restaurant on a scale of 1-5, press n if you don't know:\n")
        
        if rating == 'n':
            continue
        else: 
            rating_one_restaurant = {'user_ref': userID, 'business_ref':restaurant['business_ref'].values[0],
                                     'stars': rating}
            ratings_info.append(rating_one_restaurant)
            num-=1
    return ratings_info

user_rating = restaurant_rater(restaurants, 5, 'Italian')

NameError: name 'restaurants' is not defined

In [388]:
new_ratings_df = ratings.append(user_rating, ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df, reader)

In [389]:
svd_ = SVD(n_factors=5, reg_all=0.02)
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a259bb0f0>

In [368]:
# best_algo.fit(new_data.build_full_trainset())

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1a26031358>

In [390]:
list_of_restaurants = []
for rest in ratings['business_ref'].unique():
    list_of_restaurants.append((rest, svd_.predict(2011,rest)[3]))

In [391]:
ranked_recommended = sorted(list_of_restaurants, key=lambda x:x[1], reverse=True)
ranked_recommended

[(49357, 4.203694894319014),
 (166894, 4.153924938579336),
 (99404, 4.112568421657916),
 (85492, 4.0380421932826644),
 (10769, 4.036536907644167),
 (70513, 4.010498039640349),
 (157250, 4.005352084194149),
 (150652, 3.9946109039620517),
 (134710, 3.9876993328230803),
 (67590, 3.9876334471362695),
 (124483, 3.9796146345877426),
 (59942, 3.9739480547963755),
 (80247, 3.972363761311794),
 (55998, 3.9239160806812774),
 (111120, 3.92057796668854),
 (110814, 3.9077764730381164),
 (123726, 3.882416318119744),
 (38944, 3.873841736163679),
 (53353, 3.871100031764953),
 (153344, 3.8638473882702797),
 (22594, 3.8593772028448674),
 (43045, 3.858154856283659),
 (51728, 3.8440845741687744),
 (11395, 3.8430812990390777),
 (4906, 3.8425698881315435),
 (54029, 3.8394134042564128),
 (155464, 3.831085318743863),
 (12323, 3.8244215424483556),
 (21457, 3.823350787732551),
 (156330, 3.8170779354019246),
 (66305, 3.8105248630314277),
 (151660, 3.8104110801500033),
 (128169, 3.794990920810198),
 (86130, 3.794

And now we want to be able to return the actual restaurant names not just the id's from the ratings df. We'll write a function to extract this information. 

In [392]:
# return the top n recommendations 
def restaurant_recommender(user_ratings, restaurant_name_df, n):
    for index, rec in enumerate(user_ratings):
        name = restaurant_name_df.loc[restaurant_name_df['business_ref'] == int(rec[0])]['name']
        print('Recommendation # ', index+1, ': ', name, '\n')
        n-=1
        if n == 0:
            break
restaurant_recommender(ranked_recommended, restaurants, 10)

Recommendation #  1 :  450    "Martin Wishart"
Name: name, dtype: object 

Recommendation #  2 :  1532    "Hanedan"
Name: name, dtype: object 

Recommendation #  3 :  895    "The Scotch Malt Whisky Society"
Name: name, dtype: object 

Recommendation #  4 :  773    "Field"
Name: name, dtype: object 

Recommendation #  5 :  95    "Royal Mile"
Name: name, dtype: object 

Recommendation #  6 :  653    "Hotel Chocolat Cafe"
Name: name, dtype: object 

Recommendation #  7 :  1448    "Patisserie Madeleine"
Name: name, dtype: object 

Recommendation #  8 :  1378    "Kismot"
Name: name, dtype: object 

Recommendation #  9 :  1253    "Noor Indian Takeaway"
Name: name, dtype: object 

Recommendation #  10 :  625    "Kebab Mahal"
Name: name, dtype: object 



## Content Based Models
### Get most similar restaurants based on categorical and review information

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

In [52]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel

In [25]:
df = pd.read_csv('../notebooks/content.csv')
print(df.shape)
df.head()

(1605, 6)


Unnamed: 0.1,Unnamed: 0,business_id,name,city,categories,text
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,"Breakfast & Brunch, Diners, Restaurants, Cafes...",When Blythe told me he'd checked out a new spo...
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Edinburgh,"Restaurants, Food, Sandwiches, Coffee & Tea, D...",Global Deli is a great find if you're feeling ...
2,397,Di5ApLgoQpcv5Aew82fI_A,"""The Rendezvous""",Edinburgh,"Restaurants, Cantonese, Chinese",Not been to this restaurant for about 2 years ...
3,420,OvbLKXkJCg8ZMHX9L5faIA,"""Bread Meats Bread""",Edinburgh,"Burgers, Restaurants",I know people rave about this place so I'm sur...
4,446,T2jfXhvQPk9wLdt1OVV-Kg,"""Rose Street Brewery""",Edinburgh,"Pubs, Whiskey Bars, Nightlife, Breakfast & Bru...",One of many spots on Rose St. A good variety o...


For our minimum viable product we will utilise the categorical data to find similarities between restaurants. 

In [51]:
df['categories'] = df['categories'].str.lower()
df['categories'] = df['categories'].apply(lambda x: x.strip())
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,city,categories,text
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,"breakfast & brunch, diners, restaurants, cafes...",When Blythe told me he'd checked out a new spo...
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Edinburgh,"restaurants, food, sandwiches, coffee & tea, d...",Global Deli is a great find if you're feeling ...
2,397,Di5ApLgoQpcv5Aew82fI_A,"""The Rendezvous""",Edinburgh,"restaurants, cantonese, chinese",Not been to this restaurant for about 2 years ...
3,420,OvbLKXkJCg8ZMHX9L5faIA,"""Bread Meats Bread""",Edinburgh,"burgers, restaurants",I know people rave about this place so I'm sur...
4,446,T2jfXhvQPk9wLdt1OVV-Kg,"""Rose Street Brewery""",Edinburgh,"pubs, whiskey bars, nightlife, breakfast & bru...",One of many spots on Rose St. A good variety o...


We know from the restauranty EDA that there are some high frequency category words which we don't want for modelling purposes. This is because they don't tell us anything specific about the establishment.\
The words we'll remove on this initial run are 'Restaurant' & 'Food'. We have to assume that the establishments in question are 'restaurants' as that's a condition of entry into the dataset and 'food' as that's a condition of being a restaurant! 

In [39]:
#create required stop words list
stopwords = nltk.corpus.stopwords.words('english')
add_stopwords = ['restaurants', 'food']
stopwords.extend(add_stopwords)

In [40]:
df.isna().sum() #double check no NANS

Unnamed: 0     0
business_id    0
name           0
city           0
categories     0
text           0
dtype: int64

In [47]:
tfv = TfidfVectorizer(min_df=3, max_df=1600, max_features=None, strip_accents='unicode', 
                    analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), stop_words=stopwords)

In [48]:
tfv_matrix = tfv.fit_transform(df['categories'])

In [50]:
print(tfv_matrix.shape)
tfv_matrix

(1605, 396)


<1605x396 sparse matrix of type '<class 'numpy.float64'>'
	with 7457 stored elements in Compressed Sparse Row format>

In [56]:
lin = linear_kernel(tfv_matrix, tfv_matrix)

In [57]:
lin[0]

array([1.        , 0.        , 0.        , ..., 0.        , 0.        ,
       0.12050794])

In [80]:
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

In [81]:
indices[300:320]

name
Antojito Cantina                   300
Lazeez Tandoori                    301
Crazy Ivans                        302
Dean Gallery Café                  303
Joseph Pearce                      304
Cafe Cassis                        305
Serrano Manchego                   306
Pierinos Take Away Food Shops      307
Broughton Delicatessen and Café    308
Lazy Lohans                        309
On Tap Medina                      310
Zazou Cruises                      311
Spirit of Thai                     312
Michelles Place                    313
Frankie  Bennys                    314
Jackson Restaurant                 315
Zizzi                              316
Buckstone Pub  Kitchen             317
Dolphin Fish Bar                   318
Usquabae                           319
dtype: int64

In [82]:
df.name = df.name.str.replace('[^\w\s]', '')
df.name.loc[302]

'Crazy Ivans'

In [83]:
indices['Crazy Ivans']

302

In [84]:
def recommender(name, lin=lin):
    """
    This function takes a restaurant title as an argument and 
    returns the n=5 most similar resstaurants based on the content
    of the restaurant categories and the limnear similarity scores.  
    
    """
    
    #First we get the index corresponding to the argument / original title
    index = indices[name]
    
    #Then we fetch all the sigmoid scores for the pairwise comparisons with the argument title
    lin_scores = list(enumerate(lin[index]))
    
    #Now we sort the scores so the top recommended movies are at the top
    sorted_lin_scores = sorted(lin_scores, key=lambda x: x[1], reverse=True)
    
    #Slice the top off to give us a list equal to n=5
    top_lin_scores = sorted_lin_scores[1:6]
    
    #Identify the movie indices corresponding to the above list
    restaurant_index = [i[0] for i in top_lin_scores]
    
    #Now find the title names at these idices and return them as our recommendations!!
    return df['name'].iloc[restaurant_index]

In [90]:
recommender('Buckstone Pub  Kitchen')

344                MPs Bistro
481        Forage and Chatter
595         Turquoise Thistle
644                Stac Polly
651    A Room in the West End
Name: name, dtype: object