In [31]:
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
%matplotlib notebook

import ast

#To Ignore Warnings in Output
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load in the data
business = pd.read_csv('../data/output_csv/business_PA_Philly_clean.csv')
review = pd.read_csv('../data/output_csv/review_PA_Philly_clean.csv')

In [3]:
business.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count,categories,...,Seafood Markets,Wraps,Shaved Ice,Cupcakes,Greek,Flowers & Gifts,Home & Garden,French,Candy Stores,Chocolatiers & Shops
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245,"Sushi Bars, Restaurants, Japanese",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205,"Korean, Restaurants",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65,"Eatertainment, Arts & Entertainment, Brewpubs,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56,"Restaurants, Automotive, Delis, Gas Stations, ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
review = review.drop(columns = ['review_id','year'])
review.head()

Unnamed: 0,user_id,business_id,stars
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
1,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5
2,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5
3,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5
4,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4


In [5]:
business = business[['business_id','name']]

In [10]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype


user_u = list(sorted(review.user_id.unique()))
business_u = list(sorted(review.business_id.unique()))

cat_type_user = CategoricalDtype(categories=user_u, ordered=True)
cat_type_business = CategoricalDtype(categories=business_u, ordered=True)

row = review.user_id.astype(cat_type_user).cat.codes
col = review.business_id.astype(cat_type_business).cat.codes

data = review['stars'].tolist()

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(business_u)))


In [11]:
ratings = pd.DataFrame.sparse.from_spmatrix(sparse_matrix,index=user_u, columns=business_u)

In [12]:
ratings.fillna(0, inplace = True)

# SVD

##### With default values

In [22]:
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

# Step 1: Set up the reader class
#reader = Reader(rating_scale=(1,5))

reader = Reader(line_format='user item rating', rating_scale=(1, 5))


# Step 2: Load the dataframe. Use the merged data from above (not the pivoted data)
data = Dataset.load_from_df(review, reader)

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1720  1.1796  1.1810  1.1716  1.1752  1.1759  0.0038  
MAE (testset)     0.9226  0.9293  0.9289  0.9228  0.9257  0.9258  0.0029  
Fit time          7.14    7.35    7.49    6.99    6.86    7.16    0.23    
Test time         0.98    1.20    0.96    1.00    1.12    1.05    0.09    


{'test_rmse': array([1.17203127, 1.17964524, 1.18096469, 1.17157833, 1.17520916]),
 'test_mae': array([0.92260089, 0.92927376, 0.92894053, 0.92277226, 0.92565721]),
 'fit_time': (7.136639356613159,
  7.350068807601929,
  7.490508317947388,
  6.986602544784546,
  6.860138177871704),
 'test_time': (0.9798271656036377,
  1.199723720550537,
  0.9634115695953369,
  1.002535343170166,
  1.1178758144378662)}

##### With different parameters

In [7]:
# Build the train set
svd_data = data.build_full_trainset()


In [8]:
# Set up the model and fit the model. Note it will take a few minutes to run
svd = SVD(n_factors = 7, lr_all = 0.01, reg_all = 0.1, biased = True,verbose = True, random_state = 862)
svd.fit(svd_data)


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23331244880>

In [13]:

UID = '--2tyArRmSoyKx5r-FVG0A'
# First we need to obtain the ids of the unvisited restaurants. 
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]

# Now we will loop over the restaurants to extract the predictions
svd_rec = []
for iid in unread_ids:
    svd_rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [14]:
# Recommendations
svd_pp = []
for i in range(10):
    rec_rest_id = svd_rec.index[i]
    svd_pp.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", svd_rec.iloc[i])

my number  1  recommendation is  Dan's Fresh Meats , with a predicted rating of 4.880313404372395
my number  2  recommendation is  Taco & Ramen , with a predicted rating of 4.815003967502807
my number  3  recommendation is  El Molino Tortilleria and Restaurant , with a predicted rating of 4.811596904963732
my number  4  recommendation is  Safa Persian Teahouse , with a predicted rating of 4.788428484102862
my number  5  recommendation is  Cafe Mi Quang , with a predicted rating of 4.778626685295353
my number  6  recommendation is  Haggerty Deli , with a predicted rating of 4.778290717152441
my number  7  recommendation is  Philly Foodworks , with a predicted rating of 4.766924330272553
my number  8  recommendation is  Otolith Sustainable Seafood , with a predicted rating of 4.764872132987969
my number  9  recommendation is  Ramona Susan's Bake Shop , with a predicted rating of 4.744659622777604
my number  10  recommendation is  Veghada , with a predicted rating of 4.740254916088864


# Cross Validation

In [25]:
def rmse_vs_factors(algorithm, data):
  """
  Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  factor k in range(1, 26, 1)
  25 values 
  Arg:  i.) algorithm = Matrix factoization algorithm, e.g SVD/NMF/PMF, ii.)  data = surprise.dataset.DatasetAutoFolds
  
  """
  rmse_algorithm = []
  
  for k in range(1, 26, 1):
    algo = algorithm(n_factors = k)
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

In [26]:
rmse_svd = rmse_vs_factors(SVD,data)


In [32]:
def plot_rmse(rmse, algorithm):
    plt.figure(num=None, figsize=(11, 5), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(2,1,1)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.title("{0} Performance: RMSE Against Number of Factors".format(algorithm), size = 20 )
    plt.ylabel("Mean RMSE (cv=5)")
    
    plt.subplot(2,1,2)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.xticks(np.arange(0, 25, step=2))
    plt.xlabel("{0}(n_factor = k)".format(algorithm))
    plt.ylabel("Mean RMSE (cv=5)")
    plt.axvline(np.argmin(rmse), color = "r")
    print("Best k seems to be : ",np.argmin(rmse))

In [33]:
plot_rmse(rmse_svd,"SVD")


<IPython.core.display.Javascript object>

Best k seems to be :  1


# Fitting model with Best K

In [34]:
# Set up the model and fit the model. Note it will take a few minutes to run
svd = SVD(n_factors = 1, lr_all = 0.01, reg_all = 0.1, biased = True,verbose = True, random_state = 862)
svd.fit(svd_data)


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2334f621580>

In [35]:

UID = '--2tyArRmSoyKx5r-FVG0A'
# First we need to obtain the ids of the unvisited restaurants. 
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]

# Now we will loop over the restaurants to extract the predictions
svd_rec = []
for iid in unread_ids:
    svd_rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [36]:
# Recommendations
svd_pp = []
for i in range(10):
    rec_rest_id = svd_rec.index[i]
    svd_pp.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", svd_rec.iloc[i])

my number  1  recommendation is  Dan's Fresh Meats , with a predicted rating of 4.870891435146075
my number  2  recommendation is  Taco & Ramen , with a predicted rating of 4.819736556164243
my number  3  recommendation is  El Molino Tortilleria and Restaurant , with a predicted rating of 4.8145831390620994
my number  4  recommendation is  Safa Persian Teahouse , with a predicted rating of 4.785088114866177
my number  5  recommendation is  Otolith Sustainable Seafood , with a predicted rating of 4.785043445902891
my number  6  recommendation is  Haggerty Deli , with a predicted rating of 4.781540372695061
my number  7  recommendation is  Cafe Mi Quang , with a predicted rating of 4.777472702177596
my number  8  recommendation is  Philly Foodworks , with a predicted rating of 4.777319357421635
my number  9  recommendation is  Capriotti Bros , with a predicted rating of 4.754843685752202
my number  10  recommendation is  Alena’s Cafe , with a predicted rating of 4.752786091794193
