# Restaurant Recommendation System

## Base models for MVP (minimum viable product)

In [1]:
#Import standard libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

In [2]:
#Import surprise libraries
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, baseline_only
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
from surprise import accuracy, dataset
from surprise.dataset import DatasetAutoFolds

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
!ls

Rec Sys Modelling.ipynb


In [5]:
#Let's read in our base review dataset
df = pd.read_csv('../data/new_collaborative_limit.csv')

In [6]:
print(df.shape)
df.head()

(12552, 10)


Unnamed: 0.1,Unnamed: 0,business_ref,business_id,name,city,categories,review_id,user_id,stars,user_ref
0,1900,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,b31UZTy2TvnFtkfygJG40Q,bcxcQhp0sKYd9eUnEVUzPA,5,246
1,1901,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,jYxWLyWrWy8dJFQs9DEuEg,RFxjYeLW_aYLdVW3PBwFNg,4,146572
2,1903,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,GGWxoYbx_h2x7a46m0MYRA,BhYROfCjIJsKUk22_IVHig,3,211139
3,1904,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,PslbThtGZ_yOWZxAFc3GVg,J_qpI2jCkwv7vPNz_9JeqA,4,260271
4,1907,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,oRYhx_qYK5slteB5nyEAiQ,NMelfYHO9mncdmZLIABLgQ,5,491834


In [7]:
df.columns

Index(['Unnamed: 0', 'business_ref', 'business_id', 'name', 'city',
       'categories', 'review_id', 'user_id', 'stars', 'user_ref'],
      dtype='object')

In [8]:
#drop out the columns we habve no use for. 
# ratings = df.drop(columns = ['Unnamed: 0', 'name', 'city', 'categories', 'review_id', ])
ratings = df[['user_ref', 'business_ref', 'stars']]
ratings.head()

Unnamed: 0,user_ref,business_ref,stars
0,246,68,5
1,146572,68,4
2,211139,68,3
3,260271,68,4
4,491834,68,5


In [9]:
class DataSet(dataset.DatasetAutoFolds):
    #Creates data set that can be accessed by Surprise including folds for cross validation
    
    def __init__(self, df, reader):
        
        
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in zip(df['user_ref'], 
                                                                         df['business_ref'], df['stars'])]
        self.reader=reader

In [10]:
#instantiate an instance of Reader to enable surprise libraries to usethe data
reader = Reader(rating_scale=(0.0, 5.0))

In [11]:
#Load in our data to a surprise Dataset
data = DataSet(ratings, reader)

### Split data into Train Validation and Split sets

In [12]:
#Extrace raw ratings from the dataset
raw_ratings = data.raw_ratings

In [13]:
#perform data shuffle
random.shuffle(raw_ratings) 

In [14]:
#We'll use a 80/20 train test ratio and 80/20 train validate ration. Train:Validate:Test - 64:16:20
test_threshold = int(.8 * len(raw_ratings))

train_raw_ratings = raw_ratings[:test_threshold] #create the train set
test_raw_ratings = raw_ratings[test_threshold:] #creat the test set

In [15]:
val_threshold = int(.8 * len(train_raw_ratings))

val_raw_ratings = train_raw_ratings[val_threshold:] #create the validation set
train_raw_ratings = train_raw_ratings[:val_threshold] #re_assign the training set
                            

In [16]:
#Now we make the training set the data
data.raw_ratings = train_raw_ratings

## Memory based collaborative filtering
### We start with KNN Basic

In [17]:
#We'll bring in some basic text stylin to help with output clarity. 
# start = "\033[1m"
# end = "\033[0;0m"

In [18]:
def Knn_Basic(data, user, item):
    '''
    Function to run different similarity metrics across KNNBasic method. 
    
    '''
    frame = []
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBasic {} comparison: ". format(i, user))
            results = cross_validate(KNNBasic(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNBasic {} comparison: ". format(i, item))
            results = cross_validate(KNNBasic(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            
    return None


In [19]:
Knn_Basic(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBasic user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9668  0.9758  0.9864  0.9767  0.9933  0.9798  0.0092  
MAE (testset)     0.7378  0.7486  0.7554  0.7499  0.7531  0.7490  0.0061  
RMSE (trainset)   0.7532  0.7578  0.7451  0.7534  0.7481  0.7515  0.0045  
MAE (trainset)    0.5620  0.5666  0.5577  0.5647  0.5583  0.5618  0.0035  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    



Evaluation of cosine similarity for KNNBasic item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0118  1.0017  1.0122  0.9969  1.0250  1.0095  0.0097  
MAE (testset)     0.7670  0.7629  0.7784  0.7705  0.7918  0.7741  0.

### KNNBaseline

In [20]:
def Knn_Baseline(data, user, item):
    '''
    Function to run different similarity metrics across KNNBaseline method. 
    
    '''
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBaseline {} comparison: ". format(i, user))
            cross_validate(KNNBaseline(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNBaseline {} comparison: ". format(i, item))
            cross_validate(KNNBaseline(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
    return None

In [21]:
Knn_Baseline(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBaseline user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9571  0.9574  0.9379  0.9334  0.9610  0.9494  0.0114  
MAE (testset)     0.7334  0.7458  0.7217  0.7172  0.7437  0.7324  0.0114  
RMSE (trainset)   0.7084  0.7091  0.7147  0.7130  0.7128  0.7116  0.0024  
MAE (trainset)    0.5331  0.5315  0.5398  0.5350  0.5380  0.5355  0.0031  
Fit time          0.03    0.03    0.04    0.03    0.03    0.03    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    



Evaluation of cosine similarity for KNNBaseline item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9581  0.9223  0.9462  0.9859  0.9349  0.9495  0.0217  
MAE (testset)     0.7424  0.7049  0.7307  0.7553  0.7242

### KNNWithMeans

In [22]:
def Knn_With_Means(data, user, item):
    '''
    Function to run different similarity metrics across KNNBaseline method. 
    
    '''
    similarity_met = ['cosine', 'msd', 'pearson']
    user_item= [True, False]
    for i in similarity_met:
            #user-user similarities
            print("Evaluation of {} similarity for KNNBWithMeans {} comparison: ". format(i, user))
            cross_validate(KNNWithMeans(sim_options={'name': i, 'user_based': True}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
            
            #item-item similarities
            print("Evaluation of {} similarity for KNNWithMeans {} comparison: ". format(i, item))
            cross_validate(KNNWithMeans(sim_options={'name': i, 'user_based': False}), 
                           data=data, cv=5, return_train_measures=True, n_jobs=-1, verbose = True)
            print('\n\n')
    return None

In [23]:
Knn_With_Means(data, 'user-user', 'item-item')

Evaluation of cosine similarity for KNNBWithMeans user-user comparison: 
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9843  0.9867  0.9396  0.9618  0.9722  0.9689  0.0171  
MAE (testset)     0.7559  0.7544  0.7171  0.7456  0.7497  0.7445  0.0142  
RMSE (trainset)   0.7060  0.7024  0.7093  0.7081  0.7073  0.7066  0.0024  
MAE (trainset)    0.5302  0.5282  0.5334  0.5316  0.5304  0.5308  0.0017  
Fit time          0.02    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.03    0.03    0.03    0.03    0.03    0.03    0.00    



Evaluation of cosine similarity for KNNWithMeans item-item comparison: 
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9895  1.0051  0.9920  0.9627  1.0009  0.9900  0.0148  
MAE (testset)     0.7485  0.7622  0.7560  0.7422  0

From the initial memory based model runs we can see that there is a preponderence of overfit training RMSE results and also less than spectacular test RMSE results. 

The best mean test RMSE is 0.9514 for KNN Baseline with cosine similarity. 

## Model Based Collaborative Filtering

### Singular Value Decomposition (SVD) with GridSearchCV

#### Grid Search Run 1

In [24]:
param_grid = {'n_factors': [10, 50, 250], 'n_epochs':[5, 10, 15], 'lr_all': [0.002, 0.005, 0.01], 
              'reg_all':[0.005, 0.01, 0.05]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)

surprise.model_selection.search.GridSearchCV

In [25]:
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   48.7s finished


In [26]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8844288082753238}
{'rmse': {'n_factors': 10, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.05}}


#### Grid Search Run 2

In [27]:
#Let's tune the n_factors, n_epochs and reg_all hyperparamters as the best fit is at the edges of the ranges
param_grid = {'n_factors': [10, 50, 250], 'n_epochs':[12, 15, 17], 'lr_all': [0.002, 0.005, 0.01], 
              'reg_all':[0.005, 0.01, 0.05]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)

surprise.model_selection.search.GridSearchCV

In [28]:
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   53.2s finished


In [29]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8847694759225533}
{'rmse': {'n_factors': 10, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.05}}


#### Grid Search Run 3

In [30]:
#Let's keep tuning as the RMSE is still reducing. 
param_grid = {'n_factors': [5, 10, 15], 'n_epochs':[15, 17, 20], 'lr_all': [0.002, 0.005, 0.01], 
              'reg_all':[0.01, 0.05, 0.1]}

grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(grid_search)
grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   41.2s finished


In [31]:
print(grid_search.best_score)
print(grid_search.best_params)

{'rmse': 0.8809104121329755}
{'rmse': {'n_factors': 5, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.1}}


#### Grid Search Run 4

In [32]:
#And again we have achieved iterative improvement so we continue to tune. 
param_grid = {'n_factors': [8, 10, 12], 'n_epochs':[20, 25, 30], 'lr_all': [0.005, 0.01, 0.03], 
              'reg_all':[0.08, 0.11, 0.13]}

opt_grid_search = GridSearchCV(SVD, param_grid=param_grid, measures=['rmse'],
                           cv=5, n_jobs=-1, return_train_measures= True, joblib_verbose=5)
        
type(opt_grid_search)
opt_grid_search.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   42.5s finished


In [33]:
print(opt_grid_search.best_score)
print(opt_grid_search.best_params)

{'rmse': 0.8804283383656022}
{'rmse': {'n_factors': 10, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.13}}


In [34]:
print('Our mean train RMSE achieved with SVD is: ', opt_grid_search.cv_results['mean_train_rmse'].mean())
print('Our mean test RMSE achieved with SVD is: ', opt_grid_search.cv_results['mean_test_rmse'].mean()) 

Our mean train RMSE achieved with SVD is:  0.6630173633656051
Our mean test RMSE achieved with SVD is:  0.8894711117051208


Using the optimum output from run 4 we can see that the SVD model is overfitting and we report that there is instability in the model as on repeated runs we achieve small but potentially significant differences in the mean RMSE for both test and training sets. 

### Matrix Factorisation with Alternating Least Squares or Stochastic Gradient Descent

In [35]:
#We would like to tune hyperparamters with GridSearchCV but let's see if we can automate the tuning.
#We can tune items regularization(default=10), user regularization(def=15) and the numbe rof iterations(def=10).
#We'll use the default values as the midpoint of our tuning range. 
epochs = [3, 8, 10, 12, 18]
reg_u = [8, 10, 15, 20, 23]
reg_i = [3, 8, 10, 12, 18]

params = [[i, j, k] for i in epochs
        for j in reg_u
        for k in reg_i]
print('Possible hyperparameter permutations: ', len(params))

Possible hyperparameter permutations:  125


In [36]:
bsl_options_scores = {}


for bsl_perm in params:
    bsl_options = {'method': 'als', 
                  'n_epochs': bsl_perm[0],
                  'reg_u': bsl_perm[1],
                  'reg_i': bsl_perm[2]}
    
    algo = baseline_only.BaselineOnly(bsl_options=bsl_options)
    a = cross_validate(algo, data, measures=['RMSE'], cv=5, return_train_measures=True, verbose=False);
    
    bsl_perm_2 = (str(bsl_perm[0]) + ' ' + str(bsl_perm[1]) + ' ' + str(bsl_perm[2]))
    bsl_options_scores[bsl_perm_2] = {'mean_train_rmse': a['train_rmse'].mean(), 'mean_test_rmse': a['test_rmse'].mean()}

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [37]:
sorted([[k, v] for k, v in bsl_options_scores.items()], key = lambda x: x[1]['mean_train_rmse'])[:10]

[['8 8 3',
  {'mean_train_rmse': 0.7565085456222658,
   'mean_test_rmse': 0.8809107030146871}],
 ['18 8 3',
  {'mean_train_rmse': 0.7568331664105118,
   'mean_test_rmse': 0.879115094437417}],
 ['12 8 3',
  {'mean_train_rmse': 0.7568790762577661,
   'mean_test_rmse': 0.8783716602724713}],
 ['10 8 3',
  {'mean_train_rmse': 0.7569712834330165,
   'mean_test_rmse': 0.8784688544339649}],
 ['3 8 3',
  {'mean_train_rmse': 0.7570432448662135,
   'mean_test_rmse': 0.8789214626263903}],
 ['18 10 3',
  {'mean_train_rmse': 0.7587792756004599,
   'mean_test_rmse': 0.8816826666136623}],
 ['10 10 3',
  {'mean_train_rmse': 0.7589286840078798,
   'mean_test_rmse': 0.8811137227543533}],
 ['12 10 3',
  {'mean_train_rmse': 0.7592696733375579,
   'mean_test_rmse': 0.8796485848232892}],
 ['8 10 3',
  {'mean_train_rmse': 0.75930163904315, 'mean_test_rmse': 0.8795673002434944}],
 ['3 10 3',
  {'mean_train_rmse': 0.7594474284556995,
   'mean_test_rmse': 0.8789923337590114}]]

The optimimum hyperparameters are located on the edge of the ranges. We should retune to see if we can improve the rmse. 

In [38]:
#On the previous run we had an optimum result at 18, 8, 3. Lets reduce the range for reg_u and reg_i to see if there
#is any room for improvement. 
epochs = [15, 18, 20, 25]
reg_u = [3, 5, 8, 12, 15]
reg_i = [2, 3, 4, 5]

params = [[i, j, k] for i in epochs
        for j in reg_u
        for k in reg_i]
print('Possible hyperparameter permutations: ', len(params))


bsl_options_scores = {}


for bsl_perm in params:
    bsl_options = {'method': 'als', 
                  'n_epochs': bsl_perm[0],
                  'reg_u': bsl_perm[1],
                  'reg_i': bsl_perm[2]}
    
    algo = baseline_only.BaselineOnly(bsl_options=bsl_options)
    a = cross_validate(algo, data, measures=['RMSE'], cv=5, return_train_measures=True, verbose=False);
    
    bsl_perm_2 = (str(bsl_perm[0]) + ' ' + str(bsl_perm[1]) + ' ' + str(bsl_perm[2]))
    bsl_options_scores[bsl_perm_2] = {'mean_train_rmse': a['train_rmse'].mean(), 'mean_test_rmse': a['test_rmse'].mean()}

Possible hyperparameter permutations:  80
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als.

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [39]:
sorted([[k, v] for k, v in bsl_options_scores.items()], key = lambda x: x[1]['mean_train_rmse'])[:10]

[['15 3 2',
  {'mean_train_rmse': 0.7346743978013822,
   'mean_test_rmse': 0.887624566023487}],
 ['25 3 2',
  {'mean_train_rmse': 0.734952060810619,
   'mean_test_rmse': 0.8851111652394252}],
 ['18 3 2',
  {'mean_train_rmse': 0.7351219351587964,
   'mean_test_rmse': 0.8851577077794598}],
 ['20 3 2',
  {'mean_train_rmse': 0.735233825472174,
   'mean_test_rmse': 0.8824767192739555}],
 ['25 5 2',
  {'mean_train_rmse': 0.7381778248159103,
   'mean_test_rmse': 0.8825719051940573}],
 ['20 5 2',
  {'mean_train_rmse': 0.7382587775642508,
   'mean_test_rmse': 0.8827727847750341}],
 ['15 5 2',
  {'mean_train_rmse': 0.7383803927995208,
   'mean_test_rmse': 0.8817165869468809}],
 ['18 5 2',
  {'mean_train_rmse': 0.7384165546271196,
   'mean_test_rmse': 0.8821348687122809}],
 ['15 8 2',
  {'mean_train_rmse': 0.7426733452579339,
   'mean_test_rmse': 0.8808194209822862}],
 ['20 8 2',
  {'mean_train_rmse': 0.7427204734449324,
   'mean_test_rmse': 0.8809803201560745}]]

The attempt to improve the hyperparameters did not yield any improved results. in fact training data is slightly more overfit and the test RMSE is slightly worse. 
So we can progress using the preferred ALS model based filter as our current optimum model. 

In [40]:
#Let's run a cross validation based on the preferred ALS hyperparamters. 
best_bsl_option = {'method': 'als', 
                   'n_epochs': 15,
                   'reg_u': 8, 
                   'reg_iu': 2}

best_algo = baseline_only.BaselineOnly(bsl_options=best_bsl_option)
best_cv = cross_validate(best_algo, data, measures=['rmse'], cv=5, verbose=True, return_train_measures=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9029  0.9034  0.8868  0.8687  0.8826  0.8889  0.0131  
RMSE (trainset)   0.8067  0.8049  0.8102  0.8146  0.8112  0.8095  0.0034  
Fit time          0.02    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


## Model Predictions and Evaluation

In [41]:
#we now retrain on the whole training set ie training and validation together!
trainset = data.build_full_trainset()
best_algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1a22da8dd8>

In [42]:
# testset = data.construct_testset(test_raw_ratings) #testset is now the test set sample as created at top of page
# predictions = best_algo.test(testset)
# print('Test set accuracy is: ', end=' ')
# accuracy.rmse(predictions)

In [43]:
valset = data.construct_testset(val_raw_ratings)
predictions = best_algo.test(valset)
print('Validation set accuracy is: {}' .format(accuracy.rmse(predictions)))

RMSE: 0.8976
Validation set accuracy is: 0.8975752751252586


Using our original dataframe we can now undertake a quick comparison to see how our model would predict a small sample of from these know values. 

In [44]:
ratings.head(10)

Unnamed: 0,user_ref,business_ref,stars
0,246,68,5
1,146572,68,4
2,211139,68,3
3,260271,68,4
4,491834,68,5
5,1168645,68,4
6,35710,380,5
7,77088,380,4
8,438436,380,5
9,590862,380,4


In [45]:
ratings.shape

(12552, 3)

Now we can call our model and see what we would predict each user would rate for the given businesses. 

In [46]:
for i in range(10):
    prediction = round(best_algo.predict(uid = ratings.user_ref[i], iid = ratings.business_ref[i])[3], 2)
    print(ratings.user_ref[i], '  ', ratings.business_ref[i], '   ', ratings.stars[i], '   ', 
         prediction, '   ', round(abs(ratings.stars[i]-prediction), 2))

246    68     5     3.84     1.16
146572    68     4     4.18     0.18
211139    68     3     3.89     0.89
260271    68     4     3.74     0.26
491834    68     5     3.57     1.43
1168645    68     4     3.67     0.33
35710    380     5     4.1     0.9
77088    380     4     4.07     0.07
438436    380     5     4.21     0.79
590862    380     4     3.86     0.14


### Create table to show accuracy of current collaborative model

In [47]:

pred_df = ratings.copy().head(10)
d = pd.DataFrame()

for i in range(10):
    prediction = round(best_algo.predict(uid = ratings.user_ref[i], iid = ratings.business_ref[i])[3], 2)
    difference = round(abs(ratings.stars[i]-prediction), 2)
#     print(ratings.user_ref[i], '  ', ratings.business_ref[i], '   ', ratings.stars[i], '   ', 
#          prediction, '   ', difference)
    
    temp = pd.DataFrame(
        {
            'Prediction': prediction,
            'Difference': difference
        
        }, 
        index=[0]
    
    
    )
    
    d = pd.concat([d, temp], axis=0)
    
    
    
d = d.reset_index().drop(columns =['index'])

pred_acc = pd.concat([pred_df, d], axis=1)
pred_acc


Unnamed: 0,user_ref,business_ref,stars,Prediction,Difference
0,246,68,5,3.84,1.16
1,146572,68,4,4.18,0.18
2,211139,68,3,3.89,0.89
3,260271,68,4,3.74,0.26
4,491834,68,5,3.57,1.43
5,1168645,68,4,3.67,0.33
6,35710,380,5,4.1,0.9
7,77088,380,4,4.07,0.07
8,438436,380,5,4.21,0.79
9,590862,380,4,3.86,0.14


As expected there is some variance in our predictions and we are ~0.6 of a point out on average across this small sample. On the scale we're working with that's an average of ~15% error on average but in worst case predictions we are ~27% out on the real world value. There is clearly some room for improvement here and we shall investigate further in the next phase of the project. 

## Build our base Recommenders

We'll progress to make a simple recommender based on this model. 
The following recommender may be a little crude for the purposes of recommending restaurants however it is a good kicking off point. It works by asking a new user to rate previous restaurants they may have visited and then offers them a basket of new restaurants that the may wish to visit.

Future iterations of this model will likely ask a new user to rate how they feel about certain aspects of a restaurant that they would like to visits such as 'music', 'outdoor-seating', 'fish', 'chinese', 'cosy' etc etc. This approach together with taking the best rated / most popular restaurants will be one of the potential improvements that could be made. 

In [48]:
#We need to import a separate dataset to help with the next section. the dataframe should have three columns including
#busniness_id, name, categories

In [49]:
restaurants = pd.read_csv('../data/new_restaurants.csv')
# print(restaurants.columns)
restaurants.head(2)

Unnamed: 0.1,Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,business_ref
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Cannonmills,"""5 Canonmills""",Edinburgh,MLN,EH3 5HA,55.962444,-3.197662,4.5,16,1,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,68
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Grassmarket,"""13 George IV Bridge, Old Town""",Edinburgh,EDH,EH1 1EE,55.94796,-3.192143,4.0,13,1,Restaurants;Food;Sandwiches;Coffee & Tea;Delis,380


In [50]:
restaurants.name.loc[1532] #our restaurant names have extra punctuation that needs removing. 

'"Hanedan"'

In [51]:
restaurants.name = restaurants.name.str.replace('[^\w\s]', '')

In [52]:
restaurants.name.loc[1532]

'Hanedan'

In [53]:
rests = restaurants[['business_ref', 'name', 'categories']].replace({';':', '}, regex=True)
rests.head(2)

Unnamed: 0,business_ref,name,categories
0,68,The Bluebird Cafe,"Breakfast & Brunch, Diners, Restaurants, Cafes..."
1,380,Global Deli,"Restaurants, Food, Sandwiches, Coffee & Tea, D..."


In [54]:
def restaurant_rater(df, num, category=None):
    userID = 2011
    ratings_info = []
    
    while num > 0:
        if category:
            restaurant = df[df['categories'].str.contains(category)].sample(1)
        else:
            restaurant = df.sample(1)
        print(restaurant)
        rating = input("how do you rate this restaurant on a scale of 1-5, press n if you don't know:\n")
        
        if rating == 'n':
            continue
        else: 
            rating_one_restaurant = {'user_ref': userID, 'business_ref':restaurant['business_ref'].values[0],
                                     'stars': rating}
            ratings_info.append(rating_one_restaurant)
            num-=1
    return ratings_info

user_rating = restaurant_rater(rests, 5, 'Italian')

     business_ref           name            categories
880         98319  Tony Macaroni  Italian, Restaurants
how do you rate this restaurant on a scale of 1-5, press n if you don't know:
3
     business_ref           name                   categories
538         58081  Pizza Express  Restaurants, Italian, Pizza
how do you rate this restaurant on a scale of 1-5, press n if you don't know:
2
      business_ref       name                                  categories
1598        174041  Pizza Hut  Restaurants, Pizza, Chicken Wings, Italian
how do you rate this restaurant on a scale of 1-5, press n if you don't know:
2
     business_ref             name                      categories
896         99417  The Castle Cafe  Italian, Restaurants, Scottish
how do you rate this restaurant on a scale of 1-5, press n if you don't know:
4
      business_ref            name            categories
1494        162231  Antonios Pizza  Restaurants, Italian
how do you rate this restaurant on a scale of 1-5,

In [55]:
new_ratings_df = ratings.append(user_rating, ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df, reader)

In [88]:
# svd_ = SVD(n_factors=5, reg_all=0.02)
# svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a1da43710>

In [56]:
best_algo.fit(new_data.build_full_trainset())

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1a22da8dd8>

In [57]:
list_of_restaurants = []
for rest in ratings['business_ref'].unique():
    list_of_restaurants.append((rest, best_algo.predict(2011,rest)[3]))

In [58]:
ranked_recommended = sorted(list_of_restaurants, key=lambda x:x[1], reverse=True)
ranked_recommended[:10]

[(49357, 4.135841204466745),
 (85492, 4.108243208390912),
 (166894, 4.1059199729646085),
 (99404, 4.084869516136566),
 (10769, 4.0409355647297),
 (157250, 4.022618660147958),
 (70513, 4.021609647691692),
 (124483, 4.015579103869256),
 (134710, 4.00556951944482),
 (59942, 3.9914605530472445)]

And now we want to be able to return the actual restaurant names not just the id's from the ratings df. We'll write a function to extract this information. 

In [59]:
# return the top n recommendations 
def restaurant_recommender(user_ratings, restaurant_name_df, n):
    '''
    function returns the top recommended restaurants
    '''
    for index, rec in enumerate(user_ratings):
        name = restaurant_name_df.loc[restaurant_name_df['business_ref'] == int(rec[0])]['name']
        print('Recommendation # ', index+1, ': ', name, '\n')
        n-=1
        if n == 0:
            break
restaurant_recommender(ranked_recommended, rests, 10)

Recommendation #  1 :  450    Martin Wishart
Name: name, dtype: object 

Recommendation #  2 :  773    Field
Name: name, dtype: object 

Recommendation #  3 :  1532    Hanedan
Name: name, dtype: object 

Recommendation #  4 :  895    The Scotch Malt Whisky Society
Name: name, dtype: object 

Recommendation #  5 :  95    Royal Mile
Name: name, dtype: object 

Recommendation #  6 :  1448    Patisserie Madeleine
Name: name, dtype: object 

Recommendation #  7 :  653    Hotel Chocolat Cafe
Name: name, dtype: object 

Recommendation #  8 :  1151    Oink
Name: name, dtype: object 

Recommendation #  9 :  1253    Noor Indian Takeaway
Name: name, dtype: object 

Recommendation #  10 :  563    Blackwoods Bar  Grill
Name: name, dtype: object 



After a number of runs we are consistently being returned the same or similar list of recommendation. We suspect that only asking for five ratings is not having enough impact in the sparse matrix and we are simply being recommended the highest rated and or most popular restaurants in the area. Let's test this theory....

In [61]:
#We're getting similar recommendations for every run - let's look at the individual restaurants
restaurants[restaurants['name'].str.match('Hanedan')]

Unnamed: 0.1,Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,business_ref
1532,166894,WAMesuyxdmL3SRigsxlXng,Hanedan,Newington,"""41-42 West Preston Street""",Edinburgh,EDH,EH8 9PY,55.938891,-3.181186,5.0,57,1,Turkish;Restaurants,166894


So Hanedan has an average 5 stars and has a high review count in relation to the whole set. This implies our theory might be relevant so let's check all the restaurants in the top ranked set.

In [62]:
#extract the business refs in ranked_recommended
ranked_refs = [x[0] for x in ranked_recommended[:10]]
ranked_refs

[49357, 85492, 166894, 99404, 10769, 157250, 70513, 124483, 134710, 59942]

In [63]:
#Return the names associated with these refs into a list
rec_list = []
for index, row in rests.iterrows():
    if row['business_ref'] in ranked_refs:
#         print(row['name'])
        rec_list.append(row['name'])
rec_list

['Royal Mile',
 'Martin Wishart',
 'Blackwoods Bar  Grill',
 'Hotel Chocolat Cafe',
 'Field',
 'The Scotch Malt Whisky Society',
 'Oink',
 'Noor Indian Takeaway',
 'Patisserie Madeleine',
 'Hanedan']

In [64]:
for i in rec_list:
    for index, row in restaurants.iterrows():
        if row['name'] == i:
            print(row['name'], 'has', row['stars'], 'stars and', row['review_count'], 'reviews')
            

Royal Mile has 4.5 stars and 99 reviews
Martin Wishart has 5.0 stars and 24 reviews
Blackwoods Bar  Grill has 4.5 stars and 12 reviews
Hotel Chocolat Cafe has 4.5 stars and 45 reviews
Field has 4.5 stars and 41 reviews
The Scotch Malt Whisky Society has 5.0 stars and 26 reviews
The Scotch Malt Whisky Society has 4.5 stars and 26 reviews
Oink has 4.5 stars and 32 reviews
Oink has 4.5 stars and 238 reviews
Noor Indian Takeaway has 4.5 stars and 40 reviews
Patisserie Madeleine has 5.0 stars and 21 reviews
Hanedan has 5.0 stars and 57 reviews


We can see as suspected that all the recommended restaurants have a very high average rating and also high relative review counts. When recommendations are only based on a small user review set our collaborative recommender will normally just return the best restaurants in Edinburgh. This is currently operating as a basic cold start recommender. 

## Content Based Models
### Get most similar restaurants based on categorical and review information

In [65]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re

In [66]:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel

In [67]:
df = pd.read_csv('../data/content.csv')
print(df.shape)
df.head()

(1605, 6)


Unnamed: 0.1,Unnamed: 0,business_id,name,city,categories,text
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,"Breakfast & Brunch, Diners, Restaurants, Cafes...",When Blythe told me he'd checked out a new spo...
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Edinburgh,"Restaurants, Food, Sandwiches, Coffee & Tea, D...",Global Deli is a great find if you're feeling ...
2,397,Di5ApLgoQpcv5Aew82fI_A,"""The Rendezvous""",Edinburgh,"Restaurants, Cantonese, Chinese",Not been to this restaurant for about 2 years ...
3,420,OvbLKXkJCg8ZMHX9L5faIA,"""Bread Meats Bread""",Edinburgh,"Burgers, Restaurants",I know people rave about this place so I'm sur...
4,446,T2jfXhvQPk9wLdt1OVV-Kg,"""Rose Street Brewery""",Edinburgh,"Pubs, Whiskey Bars, Nightlife, Breakfast & Bru...",One of many spots on Rose St. A good variety o...


For our minimum viable product we will utilise the categorical data to find similarities between restaurants. 

In [68]:
df['categories'] = df['categories'].str.lower()
df['categories'] = df['categories'].apply(lambda x: x.strip())
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,city,categories,text
0,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,"breakfast & brunch, diners, restaurants, cafes...",When Blythe told me he'd checked out a new spo...
1,380,inaACfObL1NBNJmBG11iuQ,"""Global Deli""",Edinburgh,"restaurants, food, sandwiches, coffee & tea, d...",Global Deli is a great find if you're feeling ...
2,397,Di5ApLgoQpcv5Aew82fI_A,"""The Rendezvous""",Edinburgh,"restaurants, cantonese, chinese",Not been to this restaurant for about 2 years ...
3,420,OvbLKXkJCg8ZMHX9L5faIA,"""Bread Meats Bread""",Edinburgh,"burgers, restaurants",I know people rave about this place so I'm sur...
4,446,T2jfXhvQPk9wLdt1OVV-Kg,"""Rose Street Brewery""",Edinburgh,"pubs, whiskey bars, nightlife, breakfast & bru...",One of many spots on Rose St. A good variety o...


In [69]:
df.name.loc[302] #too much punctuation in the string

'"Crazy Ivans"'

In [70]:
df.name = df.name.str.replace('[^\w\s]', '') #Remove problematic speech marks
df.name.loc[302]

'Crazy Ivans'

We know from the restauranty EDA that there are some high frequency category words which we don't want for modelling purposes. This is because they don't tell us anything specific about the establishment.\
The words we'll remove on this initial run are 'Restaurant' & 'Food'. We have to assume that the establishments in question are 'restaurants' as that's a condition of entry into the dataset and 'food' as that's a condition of being a restaurant! 

In [71]:
#create required stop words list
stopwords = nltk.corpus.stopwords.words('english')
add_stopwords = ['restaurants', 'food']
stopwords.extend(add_stopwords)

In [72]:
df.isna().sum() #double check no NANS

Unnamed: 0     0
business_id    0
name           0
city           0
categories     0
text           0
dtype: int64

Create an instance of the TFidFVectorizer class and pass in parameters. 

In [73]:
tfv = TfidfVectorizer(min_df=3, max_df=1600, max_features=None, strip_accents='unicode', 
                    analyzer='word', token_pattern=r'\w{1,}', ngram_range=(1,2), stop_words=stopwords)

Fit and transform the instance on the restaurant's categories

In [74]:

tfv_matrix = tfv.fit_transform(df['categories'])

In [75]:
print(tfv_matrix.shape)
tfv_matrix

(1605, 396)


<1605x396 sparse matrix of type '<class 'numpy.float64'>'
	with 7457 stored elements in Compressed Sparse Row format>

Use our pairwise algorithm to compare the every element of the sparse matrix with every other element of the same sparse matrix.

In [76]:
lin = linear_kernel(tfv_matrix, tfv_matrix)

In [77]:
lin[0:5]

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.12050794],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.21763582, 0.        , 0.        , ..., 0.06295493, 0.        ,
        0.        ]])

Now we'll build a basic recommender

In [78]:
indices = pd.Series(df.index, index=df['name']).drop_duplicates()

In [79]:
indices[300:320]

name
Antojito Cantina                   300
Lazeez Tandoori                    301
Crazy Ivans                        302
Dean Gallery Café                  303
Joseph Pearce                      304
Cafe Cassis                        305
Serrano Manchego                   306
Pierinos Take Away Food Shops      307
Broughton Delicatessen and Café    308
Lazy Lohans                        309
On Tap Medina                      310
Zazou Cruises                      311
Spirit of Thai                     312
Michelles Place                    313
Frankie  Bennys                    314
Jackson Restaurant                 315
Zizzi                              316
Buckstone Pub  Kitchen             317
Dolphin Fish Bar                   318
Usquabae                           319
dtype: int64

In [80]:
indices['Lazy Lohans'] #Check the correct index is returned

309

In [81]:
def recommender(name, lin=lin):
    """
    This function takes a restaurant title as an argument and 
    returns the n=5 most similar resstaurants based on the content
    of the restaurant categories and the limnear similarity scores.  
    
    """
    
    #First we get the index corresponding to the argument / original title
    index = indices[name]
    
    #Then we fetch all the linear similarity scores for the pairwise comparisons 
    lin_scores = list(enumerate(lin[index]))
    
    #Now we sort the scores so the top recommended restaurants are at the top
    sorted_lin_scores = sorted(lin_scores, key=lambda x: x[1], reverse=True)
    
    #give us a list equal to n=5
    top_lin_scores = sorted_lin_scores[1:6]
    
    #Identify the restaurant indices corresponding to the above list
    restaurant_index = [i[0] for i in top_lin_scores]
    
    #Now find the title names at these idices and return them as our recommendations!!
    return df['name'].iloc[restaurant_index]

In [82]:
#We call the recommender function with a restaurant that we want similar hits for!
recommender('Martin Wishart') 

72        La Garrigue Bistro
205           La PTite Folie
299    Water of Leith Bistro
324           Le Mouton Noir
414              La Garrigue
Name: name, dtype: object

## Hybrid Recommender - under construction 

In [None]:
######### For info only - DELETE THIS CELL ----- DELETE THIS CELL --------
def restaurant_rater(df, num, category=None):
    userID = 2011
    ratings_info = []
    
    while num > 0:
        if category:
            restaurant = df[df['categories'].str.contains(category)].sample(1)
        else:
            restaurant = df.sample(1)
        print(restaurant)
        rating = input("how do you rate this restaurant on a scale of 1-5, press n if you don't know:\n")
        
        if rating == 'n':
            continue
        else: 
            rating_one_restaurant = {'user_ref': userID, 'business_ref':restaurant['business_ref'].values[0],
                                     'stars': rating}
            ratings_info.append(rating_one_restaurant)
            num-=1
    return ratings_info

user_rating = restaurant_rater(rests, 5, 'Italian')

In [135]:
def get_int():
    userdata = input("enter an int, or 'q' to quit:")
    if userdata=='q':
        return None
    try:
        user_num = int(userdata)
        return user_num
    except ValueError:
        print("I need an integer to contiue.")
        return(get_int())
get_int()

enter an int, or 'q' to quit:w
I need an integer to contiue.
enter an int, or 'q' to quit:g
I need an integer to contiue.
enter an int, or 'q' to quit:q


In [160]:
def hybrid_rec():
    existing = input('Are you an existing user? y/n? Hit any other key to quit!!') #Ask initial question
    
    if existing == 'y': #if input is y
        user_ref = input('What is your user reference?') #Ask what their user ref is?
        try:
            user_ref = int(user_ref) #input should be converted to integer
        except ValueError:
            print("I need an integer to continue")  #if input cannot be integer print statement and loop to start
            return hybrid_rec()
        if int(user_ref) in ratings['user_ref'].values: #is user _ref in the ratings dataset
                print('yeh you been found!')
                
                #How many reviews does this user have
                user_count = ratings.loc[ratings['user_ref'] == user_ref].shape[0] 
                print(user_count)
                if user_count > 10: #Set size of user count to progress through recommender. 
                    print('hmm you active')
                else:
                    print('you don"t have enough ratings')
            
            
            
            
        else:
            print('nah we can"t find you')
            
    return None
  
            

hybrid_rec()

Are you an existing user? y/n? Hit any other key to quit!!y
What is your user reference?246
yeh you been found!
74
hmm you active


In [131]:
ratings.head()

Unnamed: 0,user_ref,business_ref,stars
0,246,68,5
1,146572,68,4
2,211139,68,3
3,260271,68,4
4,491834,68,5


In [150]:
ratings.query('user_ref == 246').user_ref.count() 

74

In [156]:
ratings.loc[ratings['user_ref'] == 246].shape[0]

74

In [226]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [233]:
hybrid_df = df.copy()
hybrid_df.head(2)

Unnamed: 0.1,Unnamed: 0,business_ref,business_id,name,city,categories,review_id,user_id,stars,user_ref
0,1900,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,b31UZTy2TvnFtkfygJG40Q,bcxcQhp0sKYd9eUnEVUzPA,5,246
1,1901,68,F31RycVVooeIOp9jsXmg6g,"""The Bluebird Cafe""",Edinburgh,Breakfast & Brunch;Diners;Restaurants;Cafes;Br...,jYxWLyWrWy8dJFQs9DEuEg,RFxjYeLW_aYLdVW3PBwFNg,4,146572


In [238]:
indices_map = hybrid_df.set_index('user_ref')

In [239]:
#arguments, userID == user_ref, name == restaurant_name
def hybrid_recommender(userID, name):
    indices_ = pd.Series(restaurants.index, index=restaurants['name'])
    idx = indices_[name] #get index of the title
    user_id = hybrid_df.loc[name]['user_ref'] # get user id of the review??
    restaurant_id = id_map.loc[name]['business_ref'] #get business id of the review
    
    sim_scores = list(enumerate(lin[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:30]
    restaurant_idx = [i[0] for i in sim_scores]
    
    restaurant = restaurants.iloc[restaurant_idx][['name', 'review_count', 'stars', 'business_ref', 'postal_code']]
#     restaurant['pred'] = restaurant['id'].apply(lambda x: svd.predict(userID, indices_map.loc[x]['business_ref']).est)
    
    restaurant = restaurant.sort_vsalues('est', ascending=False)
    return restaurant.head(10)
    

In [None]:
### Hybrid recommender under construction! 