In [1]:
import pandas as pd
import numpy as np
import ast

#To Ignore Warnings in Output
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load in the data
business = pd.read_csv('../data/output_csv/business_PA_Philly_clean.csv')
review = pd.read_csv('../data/output_csv/review_PA_Philly_clean.csv')

In [3]:
business.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count,categories,...,Seafood Markets,Wraps,Shaved Ice,Cupcakes,Greek,Flowers & Gifts,Home & Garden,French,Candy Stores,Chocolatiers & Shops
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245,"Sushi Bars, Restaurants, Japanese",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205,"Korean, Restaurants",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65,"Eatertainment, Arts & Entertainment, Brewpubs,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56,"Restaurants, Automotive, Delis, Gas Stations, ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
review = review.drop(columns = ['review_id','year'])
review.head()

Unnamed: 0,user_id,business_id,stars
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
1,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5
2,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5
3,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5
4,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4


In [5]:
business = business[['business_id','name']]

In [None]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype


user_u = list(sorted(review.user_id.unique()))
business_u = list(sorted(review.business_id.unique()))

cat_type_user = CategoricalDtype(categories=user_u, ordered=True)
cat_type_business = CategoricalDtype(categories=business_u, ordered=True)

row = review.user_id.astype(cat_type_user).cat.codes
col = review.business_id.astype(cat_type_business).cat.codes

data = review['stars'].tolist()

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(business_u)))


In [None]:
ratings = pd.DataFrame.sparse.from_spmatrix(sparse_matrix,index=user_u, columns=business_u)

# Recommendation Engine - 2

In [None]:
ratings.fillna(0, inplace = True)

In [None]:
def matrix_factorization(R, P, Q, K, steps=10, alpha=0.0002, beta=0.02):
    '''
    Inputs:
    R     : The ratings (of dimension M x N)
    P     : an initial matrix of dimension M x K
    Q     : an initial matrix of dimension N x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimization
    alpha : the learning rate
    beta  : the regularization parameter

    Outputs:
    the final matrices P and Q
    '''

    for step in range(steps):
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i][j] > 0: # Skipping over missing ratings
                    #Calculating error
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        # calculate gradient with alpha and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001: # tolerance
            break
    print(e)
    return P, Q

In [None]:
len(rating_np[0])

In [None]:
rating_np.shape[1]

In [None]:
# Matrix Factorization

In [None]:
np.random.seed(862)

# Initializations
M = ratings.shape[0] # Number of users
N = ratings.shape[1] # Number of items
K = 3 # Number of latent features

# Initial estimate of P and Q
P = np.random.rand(M,K)
Q = np.random.rand(K,N)
rating_np = np.array(ratings)

In [None]:
P, Q = matrix_factorization(rating_np, P, Q, K)

In [None]:
predicted_rating = np.matmul(P, Q)
predicted_rating = pd.DataFrame(predicted_rating, index = ratings.index, columns = ratings.columns)
predicted_rating.head()

In [None]:
UID = '--2tyArRmSoyKx5r-FVG0A'

In [None]:
# Obtain the missing ratings
missing_ratings = predicted_rating.loc[UID][ratings.loc[UID,:]==0]

# Attach it with indices
missing_ratings = pd.Series(missing_ratings, index = ratings.columns[ratings.loc[UID,:] == 0] )

# Sort the ratings
missing_ratings.sort_values(ascending = False, inplace = True)

In [None]:
 
    
# Recommendations
mat_fact = []
for i in range(10):
    rec_rest_id = missing_ratings.index[i]
    mat_fact.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", missing_ratings.iloc[i])
    
    


In [None]:
mat_fact

# SVD

In [None]:
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

# Step 1: Set up the reader class
reader = Reader(rating_scale=(1,5))

# Step 2: Load the dataframe. Use the merged data from above (not the pivoted data)
data = Dataset.load_from_df(review, reader)

# Use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Load the libraries
from surprise import Reader
from surprise import Dataset
from surprise.prediction_algorithms.matrix_factorization import SVD


In [None]:
# Step 1: Set up the reader class
reader = Reader(rating_scale=(1,5))


In [None]:
# Step 2: Load the dataframe. Use the merged data from above (not the pivoted data)
data = Dataset.load_from_df(review[['user_id', 'business_id', 'stars']], reader)


In [None]:
# Step 3: Build the train set
svd_data = data.build_full_trainset()


In [None]:
# Use as many boxes as you need
# Set up the model and fit the model. Note it will take a few minutes to run
svd = SVD(n_factors = 7, lr_all = 0.01, reg_all = 0.1, biased = True,verbose = True, random_state = 862)
svd.fit(svd_data)


In [None]:
# Use as many boxes as you need
# I will use the predict function (because why not)
# First we need to obtain the ids of the unread books. 
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]

# Now we will loop over the books to extract the predictions
svd_rec = []
for iid in unread_ids:
    svd_rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [None]:
# Recommendations
svd_pp = []
for i in range(10):
    rec_rest_id = svd_rec.index[i]
    svd_pp.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", svd_rec.iloc[i])

In [None]:
def rmse_vs_factors(algorithm, data):
  """
  Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  factor k in range(1, 26, 1)
  25 values 
  Arg:  i.) algorithm = Matrix factoization algorithm, e.g SVD/NMF/PMF, ii.)  data = surprise.dataset.DatasetAutoFolds
  
  """
  rmse_algorithm = []
  
  for k in range(1, 26, 1):
    algo = algorithm(n_factors = k)
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

In [None]:
rmse_svd = rmse_vs_factors(SVD,data)


In [None]:
def plot_rmse(rmse, algorithm):
    plt.figure(num=None, figsize=(11, 5), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(2,1,1)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.title("{0} Performance: RMSE Against Number of Factors".format(algorithm), size = 20 )
    plt.ylabel("Mean RMSE (cv=5)")
    
    plt.subplot(2,1,2)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.xticks(np.arange(0, 25, step=2))
    plt.xlabel("{0}(n_factor = k)".format(algorithm))
    plt.ylabel("Mean RMSE (cv=5)")
    plt.axvline(np.argmin(rmse), color = "r")
    print("Best k seems to be : ",np.argmin(rmse))

In [None]:
plot_rmse(rmse_svd,"SVD")


In [29]:
review.head()

Unnamed: 0,user_id,business_id,stars
0,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5
1,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5
2,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5
3,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5
4,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4


In [None]:
review = review.drop(columns = ['review_id','year'])

# Surprise Stuff

In [6]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [7]:
min_rest_ratings = 30
filter_rests = review['business_id'].value_counts() > min_rest_ratings
filter_rests = filter_rests[filter_rests].index.tolist()

min_user_ratings = 30
filter_users = review['user_id'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

df_new = review[(review['business_id'].isin(filter_rests)) & (review['user_id'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(review.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(547794, 3)
The new data frame shape:	(114883, 3)


In [8]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_new, reader)

In [9]:

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(),SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.915957,0.161492,0.199299
SVDpp,0.919151,6.930067,3.141933
SVD,0.92013,1.077819,0.318408
KNNBaseline,0.924471,0.526574,4.760497
KNNWithMeans,0.932401,0.421876,4.159331
KNNWithZScore,0.93839,0.483447,4.39217
SlopeOne,0.938846,0.357008,1.723923
CoClustering,0.947897,2.068408,0.258467
KNNBasic,0.960393,0.365545,3.787997
NMF,0.970171,1.705531,0.226608


In [10]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')


In [11]:
surprise_results


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.915957,0.161492,0.199299
SVDpp,0.919151,6.930067,3.141933
SVD,0.92013,1.077819,0.318408
KNNBaseline,0.924471,0.526574,4.760497
KNNWithMeans,0.932401,0.421876,4.159331
KNNWithZScore,0.93839,0.483447,4.39217
SlopeOne,0.938846,0.357008,1.723923
CoClustering,0.947897,2.068408,0.258467
KNNBasic,0.960393,0.365545,3.787997
NMF,0.970171,1.705531,0.226608


In [12]:
#Base Line Only has the best test rmse
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)

Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.90672926, 0.90386435, 0.90944351, 0.91071274, 0.91640746]),
 'fit_time': (0.10354876518249512,
  0.1466827392578125,
  0.12339591979980469,
  0.1122279167175293,
  0.10933065414428711),
 'test_time': (0.06787633895874023,
  0.13570380210876465,
  0.14267635345458984,
  0.07419872283935547,
  0.1298670768737793)}

In [82]:
from sklearn.model_selection import GridSearchCV

param_grid = {'reg': [0.1, 0.2],
'learning_rate': [0.1, 0.2],
'n_epochs': [5, 10]}

algo = BaselineOnly()
gs = GridSearchCV(algo, param_grid,  scoring="accuracy",cv=3)
gs.fit(data)
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse']) 
print(gs.best_params['rmse'])
#Assigning values
t = gs.best_params
factors= t['rmse']['n_factors']
epochs = t['rmse']['n_epochs'] 
lr_value = t['rmse']['lr_all']
reg_value = t['rmse']['reg_all']

TypeError: Singleton array array(<surprise.dataset.DatasetAutoFolds object at 0x000001CFB3077700>,
      dtype=object) cannot be considered a valid collection.

In [13]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)


Estimating biases using als...
RMSE: 0.9105


0.9105073011204752

In [14]:
predictions

[Prediction(uid='6kJFLAHV-tNsBEZaRTqEWQ', iid='S8ZFYEgMejpChID8tzKo9A', r_ui=4.0, est=5, details={'was_impossible': False}),
 Prediction(uid='_b7sWPWtHFHMrIaq_4PCHQ', iid='h-cUfr_s3U7XP2DpPkoJqg', r_ui=2.0, est=3.644688951330027, details={'was_impossible': False}),
 Prediction(uid='27M0B4ENG7sI-Z2gCuqANA', iid='pXRrRf8fDv6yU3xp1E25hA', r_ui=4.0, est=4.08912816482058, details={'was_impossible': False}),
 Prediction(uid='TLNI1VHqpKDz8W7xQhIbAA', iid='Oun4NN-u5yiHIxDqtJnxgA', r_ui=4.0, est=4.207958647136959, details={'was_impossible': False}),
 Prediction(uid='h500Ce4x1x_hpm_Kzl3FJw', iid='tMtI6ECD6hwM-nFp2kXLKQ', r_ui=3.0, est=4.269467086139972, details={'was_impossible': False}),
 Prediction(uid='VbngpjySUH5A7GbxuM6S3A', iid='VSDdUwXktnvL_O0meKv7MQ', r_ui=5.0, est=4.429404490037369, details={'was_impossible': False}),
 Prediction(uid='ei3O4gS4YaEfQW2vpDdYkA', iid='uAJ_rq4ZQUgERalFBO_zlQ', r_ui=3.0, est=3.911508749835982, details={'was_impossible': False}),
 Prediction(uid='0MJ5sKX5uq7Ma

In [15]:
trainset = algo.trainset
print(algo.__class__.__name__)

BaselineOnly


In [16]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)


In [51]:
df[df.uid == UID]

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
3926,IQsF3Rc6IgCzjVV9DE8KXg,wB1Tin0OW1JRpaKM-E3ZYA,3.0,3.567675,{'was_impossible': False},30,145,0.567675
6755,IQsF3Rc6IgCzjVV9DE8KXg,05ev984NYfimRN0UiFrxaA,3.0,3.960472,{'was_impossible': False},30,170,0.960472
8501,IQsF3Rc6IgCzjVV9DE8KXg,DZglT5FP5VNDQJU3PuRIHw,2.0,3.886144,{'was_impossible': False},30,136,1.886144
8885,IQsF3Rc6IgCzjVV9DE8KXg,Mwc3n5Psw9wRaQ22vZWDYQ,3.0,4.049124,{'was_impossible': False},30,90,1.049124
10712,IQsF3Rc6IgCzjVV9DE8KXg,WP9GAuhvmUhm8MAxMqhgrQ,2.0,3.412495,{'was_impossible': False},30,39,1.412495
19695,IQsF3Rc6IgCzjVV9DE8KXg,zpKTPWoW56wF6d9qNnxM3Q,5.0,4.021739,{'was_impossible': False},30,94,0.978261
25698,IQsF3Rc6IgCzjVV9DE8KXg,gwxNttwUgmOn3vsHYe0pDQ,4.0,3.678582,{'was_impossible': False},30,72,0.321418
27270,IQsF3Rc6IgCzjVV9DE8KXg,9o55tW2eCwxRvwWVQSzj5g,2.0,3.295161,{'was_impossible': False},30,36,1.295161
28692,IQsF3Rc6IgCzjVV9DE8KXg,KHQXmUFiAD0FHvrMSakJBA,4.0,3.611555,{'was_impossible': False},30,82,0.388445


In [30]:
UID = 'aFa96pz67TwOFu4Weq5Agg'

In [18]:
from collections import defaultdict

def get_all_predictions(predictions):
    
    # First map the predictions to each user.
    top_n = defaultdict(list)    
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
    return top_n

In [19]:
all_pred = get_all_predictions(predictions)


In [40]:
n=11
for uid, user_ratings in all_pred.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    all_pred[uid] = user_ratings[:n]

In [41]:
all_pred

defaultdict(list,
            {'6kJFLAHV-tNsBEZaRTqEWQ': [('S8ZFYEgMejpChID8tzKo9A', 5),
              ('atZ_olNKXOG4rEr6mccN8g', 5),
              ('gQCKrqkalAc--4cuW_C_Ow', 4.915480248696444),
              ('j-qtdD55OLfSqfsWuQTDJg', 4.894997682336549),
              ('CYSPKiVdoPX3erovujnE9Q', 4.830498777689558),
              ('AaTpjyw-EiODgi3tR4Xr-g', 4.830233191609262),
              ('uBDXcXlLR9IuRV1N2m0SPQ', 4.781747532366227),
              ('uBDXcXlLR9IuRV1N2m0SPQ', 4.781747532366227),
              ('sL6fC0P4C-gyL4E5gacUeQ', 4.706880927417315),
              ('Mt6XTQfwk2DUD3AK29YIBg', 4.6978173271576695)],
             '_b7sWPWtHFHMrIaq_4PCHQ': [('vUrTGX_7HxqeoQ_6QCVz6g',
               4.4072915213529695),
              ('F_p-pLonAAzzOSnTLdJMtw', 4.347184294647976),
              ('i_FWONQD1ZBqrNE2b-M5Ug', 4.324606875541274),
              ('JJ6tYuw5Ms_r58DhnMtOCA', 4.314140526454412),
              ('cbr2Db20L6P50CsxsX6c2A', 4.248876058031269),
              ('pORvWLJb_ioqI

In [42]:
tmp = pd.DataFrame.from_dict(all_pred, orient='index')
tmp_transpose = tmp.transpose()    

In [43]:
tmp_transpose.loc[:,'IQsF3Rc6IgCzjVV9DE8KXg']

0     (Mwc3n5Psw9wRaQ22vZWDYQ, 4.049124202191984)
1     (zpKTPWoW56wF6d9qNnxM3Q, 4.021739380748146)
2     (05ev984NYfimRN0UiFrxaA, 3.960472313521455)
3     (DZglT5FP5VNDQJU3PuRIHw, 3.886143803866511)
4    (gwxNttwUgmOn3vsHYe0pDQ, 3.6785815320318327)
5    (KHQXmUFiAD0FHvrMSakJBA, 3.6115550694771237)
6     (wB1Tin0OW1JRpaKM-E3ZYA, 3.567675278032605)
7     (WP9GAuhvmUhm8MAxMqhgrQ, 3.412495495242629)
8     (9o55tW2eCwxRvwWVQSzj5g, 3.295160536278288)
9                                            None
Name: IQsF3Rc6IgCzjVV9DE8KXg, dtype: object

In [59]:
def get_predictions(user_id,n):
    
    all_pred = get_all_predictions(predictions)

    for uid, user_ratings in all_pred.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        all_pred[uid] = user_ratings[:n]
    tmp = pd.DataFrame.from_dict(all_pred, orient='index')
    tmp_transpose = tmp.transpose()    
    print("User ID:",user_id)
    results = tmp_transpose.loc[:,user_id]
    recommended_rest_ids=[]
    for x in range(0, n):
    #print(x)
        recommended_rest_ids.append(results[x][0])
        recommended_rest = business[business['business_id'].isin(recommended_rest_ids)]
    return recommended_rest

In [64]:
UID = 'TLNI1VHqpKDz8W7xQhIbAA'
n = 8
results = get_predictions(UID,n)
results

User ID: TLNI1VHqpKDz8W7xQhIbAA


Unnamed: 0,business_id,name
473,Oun4NN-u5yiHIxDqtJnxgA,Federal Donuts
485,S8ZFYEgMejpChID8tzKo9A,Amada
848,fEqiXG_B-fn__w0aeF3nBQ,Laurel
859,cGX-1IUwXOjkUqZbkKYcjw,Fogo de Chao
2013,trwHwsXOVV-ZmF-MtP98BA,High Street Philadelphia
2867,kZ1q0K13tFYG_ZJrVvsJHA,Sampan
3448,OAWa1WML2V1ZLJGD6V3nBQ,Middle Child
4284,2CDI713ATuxHfnB5b-sBdw,Vedge


In [48]:
recommended_rest_ids=[]
for x in range(0, n-2):
    #print(x)
    recommended_rest_ids.append(results[x][0])
recommended_rest = business[business['business_id'].isin(recommended_rest_ids)]

In [49]:
recommended_rest

Unnamed: 0,business_id,name
1036,wB1Tin0OW1JRpaKM-E3ZYA,National Mechanics
1093,KHQXmUFiAD0FHvrMSakJBA,Fare
1459,05ev984NYfimRN0UiFrxaA,Osteria
1980,9o55tW2eCwxRvwWVQSzj5g,Trio
2095,DZglT5FP5VNDQJU3PuRIHw,La Calaca Feliz
2766,Mwc3n5Psw9wRaQ22vZWDYQ,Bar Hygge
3309,WP9GAuhvmUhm8MAxMqhgrQ,Indian Kitchen Lovash
3609,gwxNttwUgmOn3vsHYe0pDQ,OCF Coffee House
3923,zpKTPWoW56wF6d9qNnxM3Q,Rybread


In [None]:
df[df.iid == '-0TffRSXXIlBYVbb5AwfTg']

In [None]:
df.head()


In [None]:
review.head()

In [None]:
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]


In [None]:
unread_ids

In [None]:
# Now we will loop over the books to extract the predictions
rev = []
for iid in unread_ids:
    rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [None]:
# Use as many boxes as you need
# I will use the predict function (because why not)
# First we need to obtain the ids of the unread books. 
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]

# Now we will loop over the books to extract the predictions
svd_rec = []
for iid in unread_ids:
    svd_rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [65]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [66]:
best_predictions


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
19697,nGBYFOdUMJSwui-i5WYtGw,JVDHxMnKjif8XdXVFWiClg,5.0,5.0,{'was_impossible': False},32,176,0.0
25735,ygQxFoytqvt6W0E2NQPJow,30cpm1uq6xcQCmN6JTrB8w,5.0,5.0,{'was_impossible': False},82,169,0.0
26171,T4Uk_zyBFvIUsBVninUqRg,0X5TTmfBPPY98Ra2CcSw0Q,5.0,5.0,{'was_impossible': False},55,63,0.0
21406,RULNu9f_GvtcOcz_gf0YGA,atZ_olNKXOG4rEr6mccN8g,5.0,5.0,{'was_impossible': False},47,208,0.0
7526,DujXxxlIKTGEPso-C-Rq5A,ytynqOUb3hjKeJfRj5Tshw,5.0,5.0,{'was_impossible': False},42,415,0.0
5978,Y0MjA5a67vfzttu7TENETQ,ozOneB4jXOD6hv5WBGj4KQ,5.0,5.0,{'was_impossible': False},83,38,0.0
12890,4PE0tDvV9Lo8VW8kDW1SFQ,6_T2xzR74JqGCTPefAD8Tw,5.0,5.0,{'was_impossible': False},29,201,0.0
15933,Y0MjA5a67vfzttu7TENETQ,dYinIkKBspHV5hSaukklFg,5.0,5.0,{'was_impossible': False},83,51,0.0
21403,ygQxFoytqvt6W0E2NQPJow,2CDI713ATuxHfnB5b-sBdw,5.0,5.0,{'was_impossible': False},82,197,0.0
28099,A4WMwbeMsE8NZ8DJfOMONg,-cEFKAznWmI0cledNOIQ7w,5.0,5.0,{'was_impossible': False},44,173,0.0


In [67]:
worst_predictions


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
24277,pljgkZSB60BmtbkM6PvsGA,MTSW4McQd7CbVtyjqoe9mw,1.0,4.461954,{'was_impossible': False},40,17,3.461954
4155,0jFiKymwqrjQD-kqLMkadg,RQAF6a0akMiot5lZZnMNNw,1.0,4.462807,{'was_impossible': False},53,155,3.462807
6871,LC0s_RIoX0MMrCYUpnzQjA,RQAF6a0akMiot5lZZnMNNw,1.0,4.482196,{'was_impossible': False},40,155,3.482196
24861,XxG5SZOPkihWeJe3r9XL8Q,mXQ5qcjeVZKBbyWrdQ2Z0A,1.0,4.483469,{'was_impossible': False},50,9,3.483469
13708,5cBooky8Y5_q5FV2zK-hPg,Sv1MEZP-mMfp8SmE0hwYEA,1.0,4.486112,{'was_impossible': False},19,263,3.486112
18461,VYuuNzTWju8D0rr8SvaZjA,wbDRmtxaKRpBOjutvV6TEA,1.0,4.487805,{'was_impossible': False},82,111,3.487805
4101,bAtvLtIOCwt72zFXiBDYlg,HngRiL6Z3DFTDnNsXZ0TQw,1.0,4.517493,{'was_impossible': False},40,79,3.517493
2474,XrWAdRK4CUUK85Ak3x-HDw,d48Xrx8MhGtdaLvhcYzNWQ,1.0,4.541701,{'was_impossible': False},50,63,3.541701
11100,DjWTA9NjPrK4y7tfMYSlnA,NAfITNb2HzObDVktNs3DmA,1.0,4.552309,{'was_impossible': False},21,3,3.552309
21909,84lcVCz48xx5Jpfi4rB2Zg,mtvT7uRey3F395STFRM1Tg,1.0,4.986722,{'was_impossible': False},36,183,3.986722


In [None]:
df[df.uid == '_7bHUi9Uuf5__HHc_Q8guQ']

In [None]:
df_new.loc[df_new['business_id'] == '3VA5BgRcVd-mAvSFJFFveQ']['stars'].describe()


In [70]:
import matplotlib.pyplot as plt
%matplotlib notebook

df_new.loc[df_new['business_id'] == 'mtvT7uRey3F395STFRM1Tg']['stars'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings restaurant mtvT7uRey3F395STFRM1Tg has received')
plt.show();


<IPython.core.display.Javascript object>