In [1]:
import pandas as pd
import numpy as np
import ast

#To Ignore Warnings in Output
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load in the data
business = pd.read_csv('../data/output_csv/business_PA_Philly_clean.csv')
review = pd.read_csv('../data/output_csv/review_PA_Philly_clean.csv')

In [3]:
business.head()

Unnamed: 0,business_id,name,address,city,postal_code,latitude,longitude,stars,review_count,categories,...,Seafood Markets,Wraps,Shaved Ice,Cupcakes,Greek,Flowers & Gifts,Home & Garden,French,Candy Stores,Chocolatiers & Shops
0,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,19107,39.955505,-75.155564,4.0,80,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,Philadelphia,19106,39.953949,-75.143226,4.0,245,"Sushi Bars, Restaurants, Japanese",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,Philadelphia,19147,39.943223,-75.162568,4.5,205,"Korean, Restaurants",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,19123,39.962582,-75.135657,3.5,65,"Eatertainment, Arts & Entertainment, Brewpubs,...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,Philadelphia,19104,39.954573,-75.194894,3.0,56,"Restaurants, Automotive, Delis, Gas Stations, ...",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
review.head()

Unnamed: 0,review_id,user_id,business_id,stars,year
0,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,2015
1,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5,2013
2,Xs8Z8lmKkosqW5mw_sVAoA,IQsF3Rc6IgCzjVV9DE8KXg,eFvzHawVJofxSnD7TgbZtg,5,2014
3,JBWZmBy69VMggxj3eYn17Q,aFa96pz67TwOFu4Weq5Agg,kq5Ghhh14r-eCxlVmlyd8w,5,2018
4,YcLXh-3UC9y6YFAI9xxzPQ,G0DHgkSsDozqUPWtlxVEMw,oBhJuukGRqPVvYBfTkhuZA,4,2015


In [5]:
business = business[['business_id','name']]

In [6]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype


user_u = list(sorted(review.user_id.unique()))
business_u = list(sorted(review.business_id.unique()))

cat_type_user = CategoricalDtype(categories=user_u, ordered=True)
cat_type_business = CategoricalDtype(categories=business_u, ordered=True)

row = review.user_id.astype(cat_type_user).cat.codes
col = review.business_id.astype(cat_type_business).cat.codes

data = review['stars'].tolist()

sparse_matrix = csr_matrix((data, (row, col)), shape=(len(user_u), len(business_u)))


In [7]:
ratings = pd.DataFrame.sparse.from_spmatrix(sparse_matrix,index=user_u, columns=business_u)

# Recommendation Engine - 2

In [8]:
ratings.fillna(0, inplace = True)

In [9]:
# Matrix Factorization

In [10]:
def matrix_factorization(R, P, Q, K, steps=3, alpha=0.0002, beta=0.02):
    '''
    Inputs:
    R     : The ratings (of dimension M x N)
    P     : an initial matrix of dimension M x K
    Q     : an initial matrix of dimension N x K
    K     : the number of latent features
    steps : the maximum number of steps to perform the optimization
    alpha : the learning rate
    beta  : the regularization parameter

    Outputs:
    the final matrices P and Q
    '''

    for step in range(steps):
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i][j] > 0: # Skipping over missing ratings
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2) )
        if e < 0.001: # tolerance
            break
    print(e)
    return P, Q

In [None]:
def matrix_factorization(R, P, Q, K, steps=3, alpha=0.0002, beta=0.02):
    '''
    R: rating matrix
    P: |U| * K (User features matrix)
    Q: |D| * K (Item features matrix)
    K: latent features
    steps: iterations
    alpha: learning rate
    beta: regularization parameter'''
    Q = Q.T

    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    # calculate error
                    eij = R[i][j] - numpy.dot(P[i,:],Q[:,j])

                    for k in range(K):
                        # calculate gradient with a and beta parameter
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        eR = numpy.dot(P,Q)

        e = 0

        for i in range(len(R)):

            for j in range(len(R[i])):

                if R[i][j] > 0:

                    e = e + pow(R[i][j] - numpy.dot(P[i,:],Q[:,j]), 2)

                    for k in range(K):

                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        # 0.001: local minimum
        if e < 0.001:

            break

    return P, Q.T

In [11]:
np.random.seed(862)

# Initializations
M = ratings.shape[0] # Number of users
N = ratings.shape[1] # Number of items
K = 3 # Number of latent features

# Initial estimate of P and Q
P = np.random.rand(M,K)
Q = np.random.rand(K,N)
rating_np = np.array(ratings)

In [12]:
P, Q = matrix_factorization(rating_np, P, Q, K)

3819239.9435558743


In [13]:
predicted_rating = np.matmul(P, Q)
predicted_rating = pd.DataFrame(predicted_rating, index = ratings.index, columns = ratings.columns)
predicted_rating.head()

Unnamed: 0,-0TffRSXXIlBYVbb5AwfTg,-1B9pP_CrRBJYPICE5WbRA,-3ArWZfDjfab8qVHf3WVtg,-3m_nXlyvdKAVNNmVirpGQ,-5Rah4ZvWsDu4oilUZxhtw,-63ytt5vkWof-M9NDGTkng,-6MEKOmFu6jckT3pruSxHg,-ATiAtTikuGuqvaW2O6tNA,-AanHawaDlzWHQjrqRRWig,-Bhoyo7LL97tgt9Hze0Saw,...,zuEdIZKAYBDfPjyFg6B34Q,zuKnCtZQKZqnvEaKVnwVVQ,zucC7rHpXPYBu7aEqj0NUw,zujdPV3HT-Y-CKE1GgkMHQ,zvvl3c1FO3O3BZdhusficA,zwTmOj4B_OVPMTMYijQiKg,zwd4dyQ5ovnjVojWfAuhMw,zxRmQ_FWVowh8rlzLCSURQ,zxY4DgtXsVHihSUpsmwamg,zz3E7kmJI2r2JseE6LAnrw
--2tyArRmSoyKx5r-FVG0A,5.314267,3.835629,1.756618,1.299018,2.721466,1.263684,1.804118,3.644182,2.124162,1.860988,...,1.697858,1.190403,1.838829,3.223896,1.81484,1.966486,2.682476,1.754576,1.564141,1.427228
--2vR0DIsmQ6WfcSzKWigw,1.556161,1.274888,0.532154,0.246572,0.97856,0.30942,0.47252,1.071161,0.603085,0.53129,...,0.551606,0.23021,0.627346,1.127644,0.405601,0.446497,0.809509,0.441081,0.49251,0.346855
--4AjktZiHowEIBCMd4CZA,3.197556,2.00591,0.887455,0.899234,1.375504,0.929004,1.153721,2.255803,1.346549,1.134022,...,0.928929,0.925634,0.951167,1.602804,1.226871,1.211047,1.574414,1.263959,1.017322,0.986978
--4_p6Z3tKadJcr9Non_Vw,1.852312,1.206513,0.527861,0.491143,0.843756,0.517316,0.654707,1.303411,0.772526,0.653744,...,0.553088,0.503155,0.575697,0.980539,0.681168,0.678246,0.918564,0.706816,0.590205,0.551738
--6GckBYtTa4hj8pT09oAg,4.58719,3.395031,1.717542,1.259265,2.307853,0.988663,1.587437,3.046273,1.779111,1.617211,...,1.473403,0.999282,1.605639,2.844103,1.661628,1.953336,2.317131,1.383518,1.162599,1.211422


In [14]:
UID = '--2tyArRmSoyKx5r-FVG0A'

In [15]:
# Obtain the missing ratings
missing_ratings = predicted_rating.loc[UID][ratings.loc[UID,:]==0]

# Attach it with indices
missing_ratings = pd.Series(missing_ratings, index = ratings.columns[ratings.loc[UID,:] == 0] )

# Sort the ratings
missing_ratings.sort_values(ascending = False, inplace = True)

In [16]:
 
    
# Recommendations
mat_fact = []
for i in range(10):
    rec_rest_id = missing_ratings.index[i]
    mat_fact.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", missing_ratings.iloc[i])
    
    


my number  1  recommendation is  Reading Terminal Market , with a predicted rating of 7.137943173193993
my number  2  recommendation is  Zahav , with a predicted rating of 6.902600715644164
my number  3  recommendation is  Barbuzzo , with a predicted rating of 6.435327891074762
my number  4  recommendation is  Parc , with a predicted rating of 6.296678759075702
my number  5  recommendation is  Dalessandro’s Steaks & Hoagies , with a predicted rating of 6.295562074519323
my number  6  recommendation is  Talula's Garden , with a predicted rating of 6.280827263971357
my number  7  recommendation is  The Dandelion , with a predicted rating of 6.234583541743093
my number  8  recommendation is  Green Eggs Café , with a predicted rating of 6.060497507054726
my number  9  recommendation is  Beiler's Bakery , with a predicted rating of 6.043984703543716
my number  10  recommendation is  Nan Zhou Hand Drawn Noodle House , with a predicted rating of 6.028433120804905


In [17]:
mat_fact

['Reading Terminal Market',
 'Zahav',
 'Barbuzzo',
 'Parc',
 'Dalessandro’s Steaks & Hoagies',
 "Talula's Garden",
 'The Dandelion',
 'Green Eggs Café',
 "Beiler's Bakery",
 'Nan Zhou Hand Drawn Noodle House']

# SVD

In [18]:
# Load the libraries
from surprise import Reader
from surprise import Dataset
from surprise.prediction_algorithms.matrix_factorization import SVD


In [19]:
# Step 1: Set up the reader class
reader = Reader(rating_scale=(1,5))


In [20]:
# Step 2: Load the dataframe. Use the merged data from above (not the pivoted data)
data = Dataset.load_from_df(review[['user_id', 'business_id', 'stars']], reader)


In [21]:
# Step 3: Build the train set
svd_data = data.build_full_trainset()


In [22]:
# Use as many boxes as you need
# Set up the model and fit the model. Note it will take a few minutes to run
svd = SVD(n_factors = 7, lr_all = 0.01, reg_all = 0.1, biased = True,verbose = True, random_state = 862)
svd.fit(svd_data)


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c3f87e4b80>

In [23]:
# Use as many boxes as you need
# I will use the predict function (because why not)
# First we need to obtain the ids of the unread books. 
unread_ids = ratings.columns[ratings.loc[UID,:] == 0]

# Now we will loop over the books to extract the predictions
svd_rec = []
for iid in unread_ids:
    svd_rec.append(svd.predict(uid=UID,iid=iid).est)

# Put the result in a pd Series and sort
svd_rec = pd.Series(svd_rec, index = unread_ids).sort_values(ascending=False)


In [24]:
# Recommendations
svd_pp = []
for i in range(10):
    rec_rest_id = svd_rec.index[i]
    svd_pp.append(business[business['business_id'] == rec_rest_id]['name'].values[0])
    print("my number ", i+1, " recommendation is ", business[business['business_id'] == rec_rest_id]['name'].values[0], 
          ", with a predicted rating of", svd_rec.iloc[i])

my number  1  recommendation is  Dan's Fresh Meats , with a predicted rating of 4.880313404372395
my number  2  recommendation is  Taco & Ramen , with a predicted rating of 4.815003967502807
my number  3  recommendation is  El Molino Tortilleria and Restaurant , with a predicted rating of 4.811596904963732
my number  4  recommendation is  Safa Persian Teahouse , with a predicted rating of 4.788428484102862
my number  5  recommendation is  Cafe Mi Quang , with a predicted rating of 4.778626685295353
my number  6  recommendation is  Haggerty Deli , with a predicted rating of 4.778290717152441
my number  7  recommendation is  Philly Foodworks , with a predicted rating of 4.766924330272553
my number  8  recommendation is  Otolith Sustainable Seafood , with a predicted rating of 4.764872132987969
my number  9  recommendation is  Ramona Susan's Bake Shop , with a predicted rating of 4.744659622777604
my number  10  recommendation is  Veghada , with a predicted rating of 4.740254916088864


range(1, 15)

In [59]:
def rmse_vs_factors(algorithm, data):
  """
  Returns: rmse_algorithm i.e. a list of mean RMSE of CV = 5 in cross_validate() for each  factor k in range(1, 26, 1)
  25 values 
  Arg:  i.) algorithm = Matrix factoization algorithm, e.g SVD/NMF/PMF, ii.)  data = surprise.dataset.DatasetAutoFolds
  
  """
  rmse_algorithm = []
  
  for k in range(1, 26, 1):
    algo = algorithm(n_factors = k)
    
    #["test_rmse"] is a numpy array with min accuracy value for each testset
    loss_fce = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)["test_rmse"].mean() 
    rmse_algorithm.append(loss_fce)
  
  return rmse_algorithm

In [60]:
rmse_svd = rmse_vs_factors(SVD,data)


In [65]:
def plot_rmse(rmse, algorithm):
    plt.figure(num=None, figsize=(11, 5), dpi=80, facecolor='w', edgecolor='k')
    plt.subplot(2,1,1)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.title("{0} Performance: RMSE Against Number of Factors".format(algorithm), size = 20 )
    plt.ylabel("Mean RMSE (cv=5)")
    
    plt.subplot(2,1,2)
    plt.plot(rmse)
    plt.xlim(0,25)
    plt.xticks(np.arange(0, 25, step=2))
    plt.xlabel("{0}(n_factor = k)".format(algorithm))
    plt.ylabel("Mean RMSE (cv=5)")
    plt.axvline(np.argmin(rmse), color = "r")
    print("Best k seems to be : ",np.argmin(rmse))

In [66]:
plot_rmse(rmse_svd,"SVD")


<IPython.core.display.Javascript object>

Best k seems to be :  7


In [25]:
review = review.drop(columns = ['review_id','year'])

In [26]:
min_rest_ratings = 30
filter_rests = review['business_id'].value_counts() > min_rest_ratings
filter_rests = filter_rests[filter_rests].index.tolist()

min_user_ratings = 30
filter_users = review['user_id'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

df_new = review[(review['business_id'].isin(filter_rests)) & (review['user_id'].isin(filter_users))]
print('The original data frame shape:\t{}'.format(review.shape))
print('The new data frame shape:\t{}'.format(df_new.shape))

The original data frame shape:	(547794, 3)
The new data frame shape:	(114883, 3)


In [27]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_new[['user_id', 'business_id', 'stars']], reader)

In [28]:
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [29]:

benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(),SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')    

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.91507,0.144198,0.214684
SVDpp,0.918248,6.338886,2.988062
SVD,0.920621,1.043645,0.278379
KNNBaseline,0.923973,0.483334,4.610471
KNNWithMeans,0.933257,0.38864,4.022234
SlopeOne,0.937409,0.347531,1.750118
KNNWithZScore,0.938239,0.482135,4.453209
CoClustering,0.947384,2.169742,0.171123
KNNBasic,0.963039,0.362204,3.756023
NMF,0.970738,1.669649,0.256641


In [30]:
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')


In [31]:
surprise_results


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.91507,0.144198,0.214684
SVDpp,0.918248,6.338886,2.988062
SVD,0.920621,1.043645,0.278379
KNNBaseline,0.923973,0.483334,4.610471
KNNWithMeans,0.933257,0.38864,4.022234
SlopeOne,0.937409,0.347531,1.750118
KNNWithZScore,0.938239,0.482135,4.453209
CoClustering,0.947384,2.169742,0.171123
KNNBasic,0.963039,0.362204,3.756023
NMF,0.970738,1.669649,0.256641


In [32]:
#Base Line Only has the best test rmse
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9083049 , 0.91419789, 0.91455555]),
 'fit_time': (0.08369231224060059, 0.09122776985168457, 0.09026527404785156),
 'test_time': (0.1757032871246338, 0.29560160636901855, 0.10879898071289062)}

In [33]:
trainset, testset = train_test_split(data, test_size=0.25)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)


Estimating biases using als...
RMSE: 0.9027


0.90271981218981

In [34]:
trainset = algo.trainset
print(algo.__class__.__name__)

BaselineOnly


In [35]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)


In [36]:
df.head()


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,y7H-4bWuX3eqC7C11DvQcA,zAHzitys-ncDZi0nVW_qsw,5.0,3.637219,{'was_impossible': False},61,19,1.362781
1,2xWGCfc2GeTEaspPPPKSOA,9w6o_eyKCSNKgMtAuv6KWg,3.0,4.051187,{'was_impossible': False},59,92,1.051187
2,XuhpPK0bZgVenkGBh7KA6w,qBwOLby0sOJd3QG1o76W2w,4.0,3.574657,{'was_impossible': False},48,94,0.425343
3,q76sFjjZr3QhUzsA8N1jmg,YN4Kk751tmdvoarGo8z7_A,5.0,4.263453,{'was_impossible': False},22,134,0.736547
4,c2cd2fJWzvNFxcjQPDvKmw,e_lqKAWIiE8wU6mk2HPI-Q,3.0,3.778016,{'was_impossible': False},46,10,0.778016


In [37]:
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [38]:
best_predictions


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
22239,6kJFLAHV-tNsBEZaRTqEWQ,Q9poenJ9SS3agpPo4Pgyyg,5.0,5.0,{'was_impossible': False},125,37,0.0
2163,lxA2xhPoGyEKdfKkR7u7yw,auwFZzfhe2pvFw43OfsAfw,5.0,5.0,{'was_impossible': False},59,28,0.0
19170,lxA2xhPoGyEKdfKkR7u7yw,_3NZVCWeyFZ1_vWqi2avjw,5.0,5.0,{'was_impossible': False},59,38,0.0
22102,6kJFLAHV-tNsBEZaRTqEWQ,Q9poenJ9SS3agpPo4Pgyyg,5.0,5.0,{'was_impossible': False},125,37,0.0
20814,fwC5lm7Wz6G4SrEWyg8B4g,rQW9iupvhk6ScPn2VPNLVQ,5.0,5.0,{'was_impossible': False},20,59,0.0
3517,T4Uk_zyBFvIUsBVninUqRg,vUrTGX_7HxqeoQ_6QCVz6g,5.0,5.0,{'was_impossible': False},60,207,0.0
9410,v0ZU3_KMW2Cb9s9vdu4OdQ,6KXJk2AWH3NxZBuCdkggVQ,5.0,5.0,{'was_impossible': False},25,26,0.0
12802,ygQxFoytqvt6W0E2NQPJow,i_FWONQD1ZBqrNE2b-M5Ug,5.0,5.0,{'was_impossible': False},64,244,0.0
8262,A4WMwbeMsE8NZ8DJfOMONg,Ipkx4Sa7ybn8C6LtTqTztw,5.0,5.0,{'was_impossible': False},47,236,0.0
3313,nGBYFOdUMJSwui-i5WYtGw,vUrTGX_7HxqeoQ_6QCVz6g,5.0,5.0,{'was_impossible': False},29,207,0.0


In [39]:
worst_predictions


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
17260,-g8Cl0OINIPcGf1gZsyjlw,-OIUunijjcq_ZzyyQhPPFQ,1.0,4.492082,{'was_impossible': False},26,103,3.492082
6517,pljgkZSB60BmtbkM6PvsGA,MTSW4McQd7CbVtyjqoe9mw,1.0,4.496004,{'was_impossible': False},43,20,3.496004
9829,0jFiKymwqrjQD-kqLMkadg,RQAF6a0akMiot5lZZnMNNw,1.0,4.515865,{'was_impossible': False},59,159,3.515865
5475,5jKScvMqxW_rzd0D8xgX1A,IWHdx0NhDKADkGOgXgOFKQ,1.0,4.525478,{'was_impossible': False},70,168,3.525478
21406,DwyevLE3RWfuBzgcJtRFyg,qISf5ojuYbD9h71NumGUQA,1.0,4.59118,{'was_impossible': False},35,278,3.59118
22988,wpMvNi8YZsBDJXf0YYzxzQ,WG23hgY8Ld9yaC2clgfD_g,1.0,4.616385,{'was_impossible': False},103,16,3.616385
27811,rnH9fiCEiYrr84zr2joLHA,oZXf53yjY7i-1tUHxSLayg,1.0,4.721836,{'was_impossible': False},28,10,3.721836
21059,EO4goAgwcbaRx3x-eQf4dg,KKivjYzOLDlbAo-iMyqWWg,1.0,4.740312,{'was_impossible': False},36,11,3.740312
12572,6kJFLAHV-tNsBEZaRTqEWQ,GHbq2lZcy0uuiZLAleeS3A,1.0,4.779,{'was_impossible': False},125,11,3.779
1272,ys47WpP2diRD1NTnRxNXIQ,dtsnh96yn64IT7sErc509g,1.0,4.790443,{'was_impossible': False},21,5,3.790443


In [40]:
df_new.loc[df_new['business_id'] == '3VA5BgRcVd-mAvSFJFFveQ']['stars'].describe()


count    9.000000
mean     4.111111
std      1.269296
min      1.000000
25%      4.000000
50%      4.000000
75%      5.000000
max      5.000000
Name: stars, dtype: float64

In [41]:
import matplotlib.pyplot as plt
%matplotlib notebook

df_new.loc[df_new['business_id'] == '3VA5BgRcVd-mAvSFJFFveQ']['stars'].hist()
plt.xlabel('rating')
plt.ylabel('Number of ratings')
plt.title('Number of ratings restaurant 3VA5BgRcVd-mAvSFJFFveQ has received')
plt.show();


<IPython.core.display.Javascript object>