# Using Pandas read_json function:

In [1]:
import pandas as pd
import numpy as np

In [2]:
# with open('yelp_dataset/yelp_academic_dataset_tip.json') as file:
#         tip = pd.read_json(file, lines = True)

In [3]:
with open('yelp_dataset/yelp_academic_dataset_review.json') as file:
        review = pd.read_json(file, lines = True)

In [4]:
with open('yelp_dataset/yelp_academic_dataset_business.json') as file:
        business = pd.read_json(file, lines = True)

In [5]:
# with open('yelp_dataset/yelp_academic_dataset_checkin.json') as file:
#         checkin = pd.read_json(file, lines = True)

In [6]:
with open('yelp_dataset/yelp_academic_dataset_user.json') as file:
        user = pd.read_json(file, lines = True)

In [6]:
# dfs = ['tip', 'review', 'business', 'checkin', 'user']

tip:  ['business_id' 'date' 'likes' 'text' 'type' 'user_id']
review:  ['business_id' 'date' 'review_id' 'stars' 'text' 'type' 'user_id' 'votes']
business:  ['attributes' 'business_id' 'categories' 'city' 'full_address' 'hours'
 'latitude' 'longitude' 'name' 'neighborhoods' 'open' 'review_count'
 'stars' 'state' 'type']
checkin:  ['business_id' 'checkin_info' 'type']
user:  ['average_stars' 'compliments' 'elite' 'fans' 'friends' 'name'
 'review_count' 'type' 'user_id' 'votes' 'yelping_since']

# Building a recommender system
First, get the relevant data: user-business stars... create matrices Y, R

** Y ** that contains #business as rows, #user as columns, values = stars

** R ** same size as Y, values = 1 if a user has rated a business **



# Need to determine businesses and users to include in the model

## Drop businesses and users with review counts less than 20

In [7]:
user = user[user.review_count > 20]

In [8]:
user.shape

(148994, 11)

In [9]:
business.shape

(85901, 15)

In [10]:
business = business[business.review_count > 20]

In [11]:
business.shape

(25680, 15)

In [12]:
business.groupby('state').size()

state
AZ     10754
BW       133
CA         3
EDH      525
FIF        1
IL       253
MLN       15
NC      1897
NV      8645
ON        85
PA      1315
QC      1125
RP         1
SC        68
TX         1
WI       858
XGL        1
dtype: int64

## Drop states with less than 20 businesses (recommend within state if #>20)

In [13]:
business = business.groupby('state').filter(lambda r: len(r) > 20)

In [14]:
business.groupby('state').size()

state
AZ     10754
BW       133
EDH      525
IL       253
NC      1897
NV      8645
ON        85
PA      1315
QC      1125
SC        68
WI       858
dtype: int64

In [15]:
business.shape

(25658, 15)

In [16]:
# business.shape

## Merge review+business+user

In [17]:
rev_biz_usr = pd.merge(pd.merge(review, business, on='business_id'), user, on='user_id')

In [18]:
rev_biz_usr.shape

(1251142, 32)

### drop users and businesses with less than 20 reviews in the review dataframe  
run multiple times until steady state

In [19]:
#rev_biz_usr.groupby('user_id').size().reset_index(name='count').sort_values(by='count', ascending=True)

In [39]:
rev_biz_usr = rev_biz_usr.groupby('business_id').filter(lambda r: len(r) >= 20)

rev_biz_usr = rev_biz_usr.groupby('user_id').filter(lambda r: len(r) >= 20)

In [40]:
rev_biz_usr.shape # (487156, 32)

(487156, 3)

### Remove not needed columns from the dataframe, to reduce memory use:

In [41]:
col_drop = rev_biz_usr.columns.difference(['business_id', 'user_id', 'stars_x'])

In [42]:
rev_biz_usr.drop(col_drop, axis=1, inplace=True)

In [43]:
rev_biz_usr.shape

(487156, 3)

### Convert dtype of stars to int32 instead of int64 (since their values are from 1-5), also to reduce memory use:

In [44]:
rev_biz_usr.stars_x = rev_biz_usr.stars_x.astype(int)

## Pivoting the table needed for the recommender system algorithm

#### Before filtering users and businesses, the following pivot table gave an error due to out of memory, not negagtive dimensions. However, after reducing dataset, the pivoted table fits in my laptop's 32GB memory (total code occupying nearly 8GB):
memory needed before reducing the dataset, assuming 8 bytes per value is:
686556 x 85539 x 8 / 1024 / 1024/ 1024 = 438 GB

In [45]:
# memory requirements for pivoting the table after reducing the dataset is still large ~ 27.5 GB
# Now the table can fit in memory ~ 1.33 GB
len(rev_biz_usr.business_id.unique()) * len(rev_biz_usr.user_id.unique()) * 8 / 1024/1024/1024

0.6321526765823364

In [46]:
# rev_biz_usr.pivot(index='business_id', columns='user_id', values='stars_x')

In [47]:
rev_biz_usr.shape

(487156, 3)

In [48]:
rev_biz_usr.memory_usage()

Index          3897248
business_id    3897248
stars_x        1948624
user_id        3897248
dtype: int64

### Note that some users rated the same business multiple times, pivot table will save the average value, which is acceptable

In [49]:
rev_biz_usr[(rev_biz_usr.business_id == '-584fn2GxYe9sLsgN2WeQA') & (rev_biz_usr.user_id == 'qqtmFGhyyHc37coT4qJxaw')]

Unnamed: 0,business_id,stars_x,user_id
649855,-584fn2GxYe9sLsgN2WeQA,5,qqtmFGhyyHc37coT4qJxaw
649856,-584fn2GxYe9sLsgN2WeQA,4,qqtmFGhyyHc37coT4qJxaw
649857,-584fn2GxYe9sLsgN2WeQA,5,qqtmFGhyyHc37coT4qJxaw


In [50]:
review[(review.business_id == '-584fn2GxYe9sLsgN2WeQA') & (review.user_id == 'qqtmFGhyyHc37coT4qJxaw')]

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
1053862,-584fn2GxYe9sLsgN2WeQA,2011-12-03,79Pag_9zldIyGxgsp8MwMQ,5,It had been many months since I ate a genuine ...,review,qqtmFGhyyHc37coT4qJxaw,"{'cool': 4, 'useful': 7, 'funny': 1}"
1053890,-584fn2GxYe9sLsgN2WeQA,2012-12-03,F0vCpCrXoomcSGSH87DHcw,4,"Unfortunately, I am going to have to take Grea...",review,qqtmFGhyyHc37coT4qJxaw,"{'cool': 5, 'useful': 5, 'funny': 4}"
1053964,-584fn2GxYe9sLsgN2WeQA,2014-08-10,B5Qowrss3JnGKWag_CEIJw,5,It had been over 18 months since my last visit...,review,qqtmFGhyyHc37coT4qJxaw,"{'cool': 2, 'useful': 2, 'funny': 1}"


## The Y Matrix for the recommender system

### Do not fill NaNs with zeros as zeros occupy more memory than NaNs, and the pivoting command will run out of memory.

In [51]:
Y = pd.pivot_table(rev_biz_usr, index='business_id', columns='user_id', values='stars_x')#, fill_value=0)

In [52]:
Y.loc['-584fn2GxYe9sLsgN2WeQA', 'qqtmFGhyyHc37coT4qJxaw']

4.666666666666667

In [53]:
(5+4+5)/3

4.666666666666667

In [54]:
Y.shape # before dropna: (8208, 10337)

(8208, 10337)

In [55]:
Y = Y.dropna(thresh=20, axis=1)

In [56]:
Y.shape

(8208, 9773)

In [57]:
Y = Y.dropna(thresh=20, axis=0)

In [58]:
Y.shape

(7630, 9773)

In [67]:
# REPEAT A FEW TIME UNTIL WE GET A STABLE SHAPE of (7360, 9234)
Y = Y.dropna(thresh=20, axis=1)
Y = Y.dropna(thresh=20, axis=0)
Y.shape

(7360, 9234)

In [68]:
#Y.drop(Y.columns[Y.count(axis=0) < 20], axis=1)

In [69]:
#Y.columns[Y.count(axis=0) < 20]

In [70]:
(Y.count(axis=1) < 20).sum()

0

In [71]:
Y.count(axis=0).min()

20

In [72]:
7360*9234

67962240

In [73]:
Y.count().sum()

423296

In [74]:
7360*9234 - Y.count().sum()

67538944

In [75]:
Y.isnull().sum().sum()

67538944

In [76]:
Y.memory_usage().sum()/1024/1024/1024

0.5064129829406738

In [77]:
# len(rev_biz_usr.business_id.unique()) , len(rev_biz_usr.user_id.unique())

# Split the data as training and test sets:

Select a random number of columns (30% of total)users, and remove some of the given ratings (-> nan)


In [651]:
mask2 = np.random.choice(5, 3, replace=False)
mask2

array([2, 0, 1])

In [652]:
df = pd.DataFrame(np.random.randint(0, 100,(5,5)), columns=range(5))
df.to

Unnamed: 0,0,1,2,3,4
0,91,48,39,5,65
1,77,82,1,41,17
2,10,56,0,94,28
3,2,11,51,84,34
4,17,84,90,79,24


In [662]:
df.iloc[mask2,1]

2    56
0    48
1    82
Name: 1, dtype: int32

In [78]:
test_split_ratio = 0.3
test_size = int(Y.shape[1] * test_split_ratio)
train_size = Y.shape[1] - test_size
# rand_test_user_mask = random.sample(range(Y.shape[1]), test_size)
rand_column_mask = np.random.choice(Y.shape[1], test_size, replace=False)

In [79]:
# Since number of reviews by each user is > 20, select 5 ratings as test per test user
Y_test = Y.iloc[:,rand_column_mask].copy()

In [80]:
# Tried to define a function to mask random values in each column and apply it to the DF.
# However, the function is applied (by design) twice to the first column, resulting in an error
# Better use a simple for loop

for col in Y_test:
    mask_size = 5
    mask = np.random.choice(Y_test[col].notnull().nonzero()[0], mask_size, replace=False)
    Y_test[col][mask] = np.nan

In [81]:
Y_test.count().sum()

112499

## Locations of test values

In [125]:
value_locations_premask = Y.iloc[:,rand_column_mask][Y.iloc[:,rand_column_mask].notnull()].stack().index.tolist()

In [124]:
value_locations_masked = Y_test[Y_test.notnull()].stack().index.tolist()

In [219]:
test_values_locations = list(set(value_locations_premask) - set(value_locations_masked))

In [240]:
# Convert to dataframe in order to be able to use during lookup operation, which requires list of rows, and list of columns
test_values_locations = pd.DataFrame.from_records(test_values_locations, columns=['business_id', 'user_id'])

In [285]:
# Get location positions:
query_rows = test_values_locations.business_id
rows = Y.index.values
sidx = np.argsort(row)
row_ids = sidx[np.searchsorted(rows,query_rows,sorter=sidx)]

In [286]:
# Get location positions:
query_cols = test_values_locations.user_id
cols = Y.columns.values
sidx = np.argsort(cols)
col_ids = sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

In [288]:
col_ids[0:5]

array([4720, 4703, 3950, 5343, 7598], dtype=int64)

In [289]:
Y.columns.get_loc(test_values_locations.user_id[4])

7598

In [271]:
Y.index.get_loc(test_values_locations.business_id[5])

3769

In [278]:
np.searchsorted(cols,query_cols,sorter=sidx)

array([6657, 3326, 3445, ..., 6669, 5411,  635], dtype=int64)

In [314]:
col_ids

array([4720, 4703, 3950, ..., 5898, 2179, 3966], dtype=int64)

## Get test values 

In [364]:
(Y.lookup(test_values_locations.business_id, test_values_locations.user_id))

array([ 3.,  5.,  5., ...,  5.,  2.,  4.])

In [375]:
Y.values[row_ids, col_ids]

array([ 3.,  5.,  5., ...,  5.,  2.,  4.])

To test the masking steps:  
Test sample size = 0.3 * Original Size = 0.3 * 9234 = 2770  
Randomly pick 5 notnan values from these columns and set them to nan  
Number of nans should increase by 2770 * 5 = 13850

In [96]:
int(Y.shape[1]*0.3) * 5

13850

In [97]:
Y.iloc[:,rand_column_mask].count().sum()

126349

In [98]:
Y_test.count().sum()

112499

In [99]:
Y.iloc[:,rand_column_mask].count().sum() - Y_test.count().sum()

13850

# Now insert the new Nans into the original Y matrix

In [125]:
Y.iloc[:,rand_column_mask] = Y_test.copy()

In [126]:
Y.count().sum()

409446

## The R Matrix for the recommender system

In [127]:
# The R matrix shold be based on the new Y matrix

In [128]:
R = Y.notnull()

In [129]:
R.sum().sum()

409446

In [108]:
R = Y.notnull()

In [131]:
R.shape

(7360, 9234)

In [130]:
R.iloc[:3,:3]

user_id,--4TkB_iDShmg41Y_QW9nw,--65q1FpAL_UQtVZ2PTGew,--VxRvXk3b8FwsSbC2Zpxw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
--5jkZ3-nUPZxUvtcbr8Uw,False,False,False
--BlvDO_RG2yElKu9XA1_g,False,False,False
--UE_y6auTgq3FXlvUMkbw,False,False,False


In [111]:
R.sum().sum()

423296

In [132]:
R.memory_usage().sum()/1024/1024 # 170 MB

64.8699951171875

### To find the mean of Y rows using the 1s in R
**For accessing reviewed items only using R**

In [114]:
R.iloc[1,:].mean()

0.0029239766081871343

In [571]:
list(R.iloc[1,1:6])

[False, False, False, False, False]

In [572]:
Y.iloc[1,list(R.iloc[1,:])].mean()

4.0

In [177]:
Y.iloc[:6,:].mean(axis=1)

business_id
--5jkZ3-nUPZxUvtcbr8Uw    4.484848
--BlvDO_RG2yElKu9XA1_g    4.000000
--UE_y6auTgq3FXlvUMkbw    3.377778
--pOlFxITWnhzc7SHSIP0A    4.064815
-0Ackw6MF82PXO9f9Jh_Kg    3.424528
-0Oh0BEtQEC9OmmzZ_H5Bg    4.540541
dtype: float64

# Set the number of features, number of businesses and users:

In [134]:
num_features = 100
num_business = Y.shape[0]
num_user = Y.shape[1]

# Define the Cost Function
Initializing the parameters ($\theta$, X)

$\theta$: Model parameter 

X: model features

$Y = X * \theta^T$

for non zero values of R

$\theta$ shape = #users * #features

X shape = #business * #features

Y shape = #business * #users

Choose number of features = 100

In [464]:
def cost(params, Y, R, num_business, num_user, num_features, lamda):
    # lamda is the regularization coefficient lambda (python keyword)
    # Convert the dataframe to ndarray, fill nans with zeros, and leave the answer array for easier linear algebra operations
#     Y_mat = np.nan_to_num(Y.as_matrix())
#     R_mat = np.nan_to_num(R.as_matrix())

    # unfold X and theta from the 1D params array
    X = np.reshape(params[:num_business*num_features], (num_business, num_features))
    theta = np.reshape(params[num_business*num_features:], (num_user, num_features))    
    
    J = 0.5*np.sum(pow((X@theta.T - Y)*R,2)) + lamda/2*(np.sum(pow(theta,2)) + np.sum(pow(X,2)))
    
    X_grad = (X@theta.T - Y)*R@theta + lamda*X
    theta_grad = (X@theta.T - Y).T * R.T@X + lamda*theta
    
    grad = np.concatenate((np.ravel(X_grad), np.ravel(theta_grad)))
    print('The cost is currently equal to.........', J)
    return J, grad
    

In [465]:
J, grad = cost(params, Y, R, num_business, num_user, num_features, 0)
J

The cost is currently equal to......... user_id
--4TkB_iDShmg41Y_QW9nw     1579.746721
--65q1FpAL_UQtVZ2PTGew     5069.748460
--VxRvXk3b8FwsSbC2Zpxw     1190.275631
-0itF0VWVBe3k2AdfUReGA     2414.086425
-2EuoueswhqEERWezJY8gw     1301.227227
-2PFmYGWB6rQhZtRh6PlPA     3641.979564
-2r-OO0dtcCQQ3BLYdHXlg     1944.016249
-3HkgwyxVPf49MXM4dT8Tw     1837.270906
-3TxtJo7vjXYM3ROugCO_Q     1839.552412
-3akdU5UTDn6dwiTCO2cAw     3074.880855
-3iCfNaLfDFNHbox8qQIdg      782.779018
-3kLmioDGUc1eOsWEufKOQ     1102.846273
-3yWpMtAHNQqJRFw1zev6w     1429.604089
-4AMRN-7RtozD9s8ZsH04g     2337.811383
-4ld1qpaBNLocSBe85jWnw    10287.792374
-66ssufl05UYwJGSXfjKVg     2876.944763
-7G86_mYaOTLEzuWoyiSdQ     1519.068509
-8BqfYouq3o_UoazAQWwNw     3133.072678
-8gRkiYaVm3zfoQ4pcg75w     2351.317342
-9G7cH_cBwwgcJFLrSynRA     2005.403361
-9ZYvyfXviqy-i6TrEp4qw     1096.448169
-9mljS0O9mwdOnGv-Oea3Q     2235.300937
-ANkfLbDf8aiBQ7vywIL6w     3807.958925
-AXrmhjaQ8PbZJFe5V9Hjw     2268.342612
-BVv1TDLLphHzgKw

user_id
--4TkB_iDShmg41Y_QW9nw     1579.746721
--65q1FpAL_UQtVZ2PTGew     5069.748460
--VxRvXk3b8FwsSbC2Zpxw     1190.275631
-0itF0VWVBe3k2AdfUReGA     2414.086425
-2EuoueswhqEERWezJY8gw     1301.227227
-2PFmYGWB6rQhZtRh6PlPA     3641.979564
-2r-OO0dtcCQQ3BLYdHXlg     1944.016249
-3HkgwyxVPf49MXM4dT8Tw     1837.270906
-3TxtJo7vjXYM3ROugCO_Q     1839.552412
-3akdU5UTDn6dwiTCO2cAw     3074.880855
-3iCfNaLfDFNHbox8qQIdg      782.779018
-3kLmioDGUc1eOsWEufKOQ     1102.846273
-3yWpMtAHNQqJRFw1zev6w     1429.604089
-4AMRN-7RtozD9s8ZsH04g     2337.811383
-4ld1qpaBNLocSBe85jWnw    10287.792374
-66ssufl05UYwJGSXfjKVg     2876.944763
-7G86_mYaOTLEzuWoyiSdQ     1519.068509
-8BqfYouq3o_UoazAQWwNw     3133.072678
-8gRkiYaVm3zfoQ4pcg75w     2351.317342
-9G7cH_cBwwgcJFLrSynRA     2005.403361
-9ZYvyfXviqy-i6TrEp4qw     1096.448169
-9mljS0O9mwdOnGv-Oea3Q     2235.300937
-ANkfLbDf8aiBQ7vywIL6w     3807.958925
-AXrmhjaQ8PbZJFe5V9Hjw     2268.342612
-BVv1TDLLphHzgKw-eAJJQ     2326.709094
-DLZvdwIXxpBKSV6Y

In [466]:
J.sum()

23654851.02342555

In [342]:
# checking some numpy matrix operations
a = np.array([[1.,2],[3,4]])
print('a=', a)

b = np.array([[5,6],[7,8]])
print('b=', b)

print(a*b)

print(a@b)

c = np.asmatrix(a)
print('c=', c)

print(c*b)

np.sum(c)

10.0

# initialize X, $\theta$

In [136]:
X = np.random.randn(num_business, num_features)
theta = np.random.randn(num_user, num_features)

params = np.concatenate((np.ravel(X), np.ravel(theta)))

In [138]:
params.shape

(1659400,)

In [139]:
X.shape, theta.shape

((7360, 100), (9234, 100))

In [210]:
# X1 = np.reshape(params[:num_business*num_features], (num_business, num_features))
# theta1 = np.reshape(params[num_business*num_features:], (num_user, num_features))

# Initialize and train the model:

- Continue with parameter initialization, mean normalization, then find predictions
- Split the dataset to evqaluate the model


In [None]:
from scipy.optimize import minimize

In [263]:
idx = np.where(R_mat[i,:] == 1)[0]
idx

array([ 670,  726,  761,  920, 1025, 1959, 2107, 2295, 2771, 2866, 2961,
       3096, 3150, 3300, 3390, 4854, 4874, 5032, 5169, 5233, 5534, 5632,
       6016, 6023, 6060, 6281, 6376, 6475, 6620, 6644, 6695, 6703, 6818,
       6833, 7036, 7154, 7217, 7282, 7353, 7726, 7754, 7999, 8074, 8743,
       8798, 8915, 8940], dtype=int64)

In [485]:
lamda = 10

X = np.random.randn(num_business, num_features)
theta = np.random.randn(num_user, num_features)
params = np.concatenate((np.ravel(X), np.ravel(theta)))

Y_mat = np.nan_to_num(Y.as_matrix())
R_mat = np.nan_to_num(R.as_matrix())

# Ymean = np.zeros((num_business, 1))  
# Ynorm = np.zeros((num_business, num_user))
Ymean = np.nanmean(Y_mat, axis=1, keepdims=True)
Ynorm = np.nan_to_num(Y.subtract(Y.mean(axis=1), axis=0).as_matrix())

# for i in range(num_business):  
#     idx = np.where(R_mat[i,:] == 1)[0]
#     Ymean[i] = Y.iloc[i,idx].mean()
#     Ynorm[i,idx] = Y.iloc[i,idx] - Ymean[i]

# Ynorm = Y - Y.mean()

In [486]:
params = np.concatenate((np.ravel(X), np.ravel(theta)))
fmin = minimize(fun=cost, x0=params, args=(Ynorm, R_mat, num_business, num_user, num_features, lamda),  
                method='CG', jac=True, options={'maxiter': 100})
X = np.matrix(np.reshape(fmin.x[:num_business * num_features], (num_business, num_features)))  
theta = np.matrix(np.reshape(fmin.x[num_business * num_features:], (num_user, num_features)))



The cost is currently equal to......... 28893827.0573
The cost is currently equal to......... 28893827.0573
The cost is currently equal to......... 28757265.6785
The cost is currently equal to......... 28217747.3649
The cost is currently equal to......... 26164889.2655
The cost is currently equal to......... 19488558.7857
The cost is currently equal to......... 9526265.1544
The cost is currently equal to......... 168949636.253
The cost is currently equal to......... 6795635.23787
The cost is currently equal to......... 8648479.80321
The cost is currently equal to......... 5733474.09161
The cost is currently equal to......... 4791139.39894
The cost is currently equal to......... 3925059.0348
The cost is currently equal to......... 3156379.7263
The cost is currently equal to......... 4081936.40704
The cost is currently equal to......... 2785793.27624
The cost is currently equal to......... 2598065.25002
The cost is currently equal to......... 2535677.70442
The cost is currently equal to.

## Get Predictions

In [487]:
predictions = X * theta.T + Ymean

In [488]:
predictions

matrix([[ 0.03219707,  0.08143545, -0.01909967, ...,  0.03826407,
          0.03001919,  0.03025337],
        [ 0.02934303, -0.10606007,  0.0897014 , ..., -0.01424757,
         -0.0428075 ,  0.00801179],
        [ 0.07108842,  0.02679396,  0.08252031, ..., -0.00527034,
          0.10147134, -0.00545042],
        ..., 
        [ 0.04187484,  0.15184913,  0.08107561, ...,  0.07901171,
          0.18237418,  0.03491847],
        [ 0.23712873, -0.10628723,  0.08764696, ...,  0.0498105 ,
          0.11735341,  0.0549538 ],
        [ 0.20585214,  0.04080691,  0.01113103, ...,  0.04272989,
          0.1060568 ,  0.08648696]])

## Evaluate Recommender System

In [489]:
# compare the Y_test to predicted values
Y_test.columns

Index(['pIuq0cbN90agqm0z5pxF5A', 'v6VJhNpS6Snke33MqKBUhQ',
       'FLdun6KWwAh-gC8VHVZGCw', 'qsdIra27jSpnmCNjJubnrQ',
       'qnb62vcj2cNakytjEPunow', '8bwVOeK3Zylt6PnKf-KNnw',
       'kkddWkOVA2FGgt4ejO1gqA', '3RsG7Fjx93YQyqwR0ONfkQ',
       'xoBrFHHEzmXYNLwMEuc0TQ', 'F5WW6egmmUrIWRI_ZNpbEA',
       ...
       'eWxE1YjGYkeuJ-ymRAdwQA', 'LZJhdOO3PrHCQ4KeIncrCg',
       'QSQeU6bDea42m6yH_pOQ8A', 'ilV4uebbkirD2AiNIouHhw',
       'RyKF9WpyxDikoF6ADsHMDA', 'GPIV41wpKGS5XNvQeBtYjQ',
       'LnZvcTBq8svbDnjGEEFZMA', '317zjwBsgGRcqP9hlhcGOw',
       'ZzXsskFHBYlHigIC0mX9lg', 'SX5b6l8Isbi7VbjK-eilIQ'],
      dtype='object', name='user_id', length=2770)

In [496]:
Y.iloc[:3,:10]

user_id,--4TkB_iDShmg41Y_QW9nw,--65q1FpAL_UQtVZ2PTGew,--VxRvXk3b8FwsSbC2Zpxw,-0itF0VWVBe3k2AdfUReGA,-2EuoueswhqEERWezJY8gw,-2PFmYGWB6rQhZtRh6PlPA,-2r-OO0dtcCQQ3BLYdHXlg,-3HkgwyxVPf49MXM4dT8Tw,-3TxtJo7vjXYM3ROugCO_Q,-3akdU5UTDn6dwiTCO2cAw
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
--5jkZ3-nUPZxUvtcbr8Uw,,,,,,,,,,
--BlvDO_RG2yElKu9XA1_g,,,,,,,,,,
--UE_y6auTgq3FXlvUMkbw,,,,,,,,,,


In [508]:
predictions[predictions[:,1].argsort(axis=0)[::-1],1]

matrix([[ 1.2509688 ],
        [ 1.21844291],
        [ 0.89994739],
        ..., 
        [-0.48679954],
        [-0.50355716],
        [-0.9966353 ]])

In [509]:
predictions[:,1].max()

1.2509687957789286

In [511]:
Y.iloc[:,1].max()

5.0

# Testing the cost function with Andrew Ng's Example

In [443]:
# Testing the cost function and implementation on a dataset with a known answer(Andrew Ng's)
def testing():
    from scipy.io import loadmat
    data = loadmat('ex8_movies.mat')
#     Y = data['Y']
#     R = data['R']
#     Y.shape, R.shape
#     users = 4  
#     movies = 5  
#     features = 3

#     params_data = loadmat('ex8_movieParams.mat')  
#     X = params_data['X']  
#     Theta = params_data['Theta']

#     X_sub = X[:movies, :features]  
#     Theta_sub = Theta[:users, :features]  
#     Y_sub = Y[:movies, :users]  
#     R_sub = R[:movies, :users]
#     params = np.concatenate((np.ravel(X_sub), np.ravel(Theta_sub)))
    

    
    movie_idx = {}  
    f = open('movie_ids.txt')  
    for line in f:  
        tokens = line.split(' ')
        tokens[-1] = tokens[-1][:-1]
        movie_idx[int(tokens[0]) - 1] = ' '.join(tokens[1:])

    ratings = np.zeros((1682, 1))

    ratings[0] = 4  
    ratings[6] = 3  
    ratings[11] = 5  
    ratings[53] = 4  
    ratings[63] = 5  
    ratings[65] = 3  
    ratings[68] = 5  
    ratings[97] = 2  
    ratings[182] = 4  
    ratings[225] = 5  
    ratings[354] = 5
    
    R = data['R']
    Y = data['Y']
    Y = np.append(Y, ratings, axis=1)  
    R = np.append(R, ratings != 0, axis=1)  
    from scipy.optimize import minimize

    movies = Y.shape[0]  
    users = Y.shape[1]  
    features = 10  
    learning_rate = 10.

    X = np.random.random(size=(movies, features))  
    Theta = np.random.random(size=(users, features))  
    params = np.concatenate((np.ravel(X), np.ravel(Theta)))

    Ymean = np.zeros((movies, 1))  
    Ynorm = np.zeros((movies, users))

    for i in range(movies):  
        idx = np.where(R[i,:] == 1)[0]
        Ymean[i] = Y[i,idx].mean()
        Ynorm[i,idx] = Y[i,idx] - Ymean[i]

    fmin = minimize(fun=cost, x0=params, args=(Ynorm, R, movies, users, features, learning_rate),  
                    method='CG', jac=True, options={'maxiter': 100, 'disp':True})
    print(fmin)
    X = np.matrix(np.reshape(fmin.x[:movies * features], (movies, features)))  
    Theta = np.matrix(np.reshape(fmin.x[movies * features:], (users, features)))
    
    predictions = X * Theta.T
    my_preds = predictions[:, -1] + Ymean
    idx = np.argsort(my_preds, axis=0)[::-1]  
    print("Top 10 movie predictions:")  
    for i in range(10):  
        j = int(idx[i])
        print('Predicted rating of {0} for movie {1}.'.format(str(float(my_preds[j])), movie_idx[j]))
    #return cost(params, pd.DataFrame(Y_sub), pd.DataFrame(R_sub), features, 1.5)

In [444]:
testing()

The cost is currently equal to......... 434829.17492
The cost is currently equal to......... 434829.17492
The cost is currently equal to......... 408754.420477
The cost is currently equal to......... 319567.708067
The cost is currently equal to......... 141671.966108
The cost is currently equal to......... 826367.504943
The cost is currently equal to......... 104206.23712
The cost is currently equal to......... 86914.1501693
The cost is currently equal to......... 67517.3621203
The cost is currently equal to......... 70794.9155707
The cost is currently equal to......... 60246.3512986
The cost is currently equal to......... 52793.4544583
The cost is currently equal to......... 50083.9248201
The cost is currently equal to......... 48203.6543201
The cost is currently equal to......... 45924.5158052
The cost is currently equal to......... 45100.3193639
The cost is currently equal to......... 44979.2361376
The cost is currently equal to......... 43544.4022095
The cost is currently equal to.

## Check some values after conversion

In [143]:
Y_mat[1,2]

nan

In [160]:
Y.first_valid_index()

'--5jkZ3-nUPZxUvtcbr8Uw'

In [170]:
Y.loc['--5jkZ3-nUPZxUvtcbr8Uw'].first_valid_index()

'0BT613A1PBxuwxUzjEb6VQ'

In [169]:
Y.loc['--5jkZ3-nUPZxUvtcbr8Uw','0BT613A1PBxuwxUzjEb6VQ']

4.0

In [167]:
Y.index.get_loc('--5jkZ3-nUPZxUvtcbr8Uw')

0

In [168]:
Y.loc['--5jkZ3-nUPZxUvtcbr8Uw'].index.get_loc('0BT613A1PBxuwxUzjEb6VQ')

162

In [187]:
Y_mat[0,227]

4.0

In [171]:
Y_mat[0,0]

NameError: name 'Y_mat' is not defined

In [171]:
Y_mat = np.nan_to_num(Y_mat)

In [178]:
np.isnan(Y_mat).sum()

0

# Using a sparse matrix:
http://stackoverflow.com/questions/31661604/efficiently-create-sparse-pivot-tables-in-pandas

In [96]:
from scipy.sparse import csr_matrix

users = list(sorted(rev_biz_usr.user_id.unique()))
businesses = list(sorted(rev_biz_usr.business_id.unique()))

stars = rev_biz_usr['stars_x'].tolist()
row = rev_biz_usr.user_id.astype('category', categories=users).cat.codes
col = rev_biz_usr.business_id.astype('category', categories=businesses).cat.codes
sparse_matrix = csr_matrix((stars, (row, col)), shape=(len(users), len(businesses)))

In [37]:
sparse_matrix

NameError: name 'sparse_matrix' is not defined

In [196]:
sparse_matrix

<144265x25629 sparse matrix of type '<class 'numpy.int32'>'
	with 1201156 stored elements in Compressed Sparse Row format>

In [197]:
len(sparse_matrix.indptr)

144266

## The R matrix can be easily obtained by first converting the csr to coo and using the ij values

```
sr_matrix.tocoo(copy=True)

    Convert this matrix to COOrdinate format.

    With copy=False, the data/indices may be shared between this matrix and the resultant coo_matrix.
```

In [199]:
coo_mat = sparse_matrix.tocoo()

In [200]:
len(coo_mat.row)

1201156

In [201]:
R = list(zip(coo_mat.row,coo_mat.col))

In [203]:
sorted(R)

[(0, 15977),
 (1, 6023),
 (1, 10013),
 (1, 12667),
 (1, 17546),
 (1, 19296),
 (1, 22532),
 (2, 24831),
 (3, 447),
 (3, 3221),
 (3, 8781),
 (3, 11898),
 (4, 652),
 (4, 4332),
 (4, 4645),
 (4, 4721),
 (4, 4984),
 (4, 5948),
 (4, 6324),
 (4, 7945),
 (4, 8192),
 (4, 9895),
 (4, 10773),
 (4, 10826),
 (4, 11546),
 (4, 12097),
 (4, 13059),
 (4, 13246),
 (4, 13502),
 (4, 13605),
 (4, 14262),
 (4, 14961),
 (4, 15180),
 (4, 15795),
 (4, 16131),
 (4, 16960),
 (4, 18093),
 (4, 18538),
 (4, 19564),
 (4, 19983),
 (4, 21848),
 (4, 23434),
 (5, 1484),
 (5, 1592),
 (5, 2213),
 (5, 2281),
 (5, 9698),
 (5, 10180),
 (5, 12386),
 (5, 14886),
 (5, 15513),
 (5, 18589),
 (5, 18843),
 (5, 22498),
 (5, 24981),
 (6, 363),
 (6, 814),
 (6, 831),
 (6, 874),
 (6, 900),
 (6, 972),
 (6, 1243),
 (6, 1571),
 (6, 1808),
 (6, 1831),
 (6, 1917),
 (6, 1931),
 (6, 2232),
 (6, 2613),
 (6, 2631),
 (6, 2694),
 (6, 3016),
 (6, 3102),
 (6, 3165),
 (6, 4110),
 (6, 4682),
 (6, 4711),
 (6, 4754),
 (6, 5099),
 (6, 5401),
 (6, 5603),


# Check if cities and distances between them may be used instead of distances between businesses

##  No: cities are small, with distances that can be 3 miles

# assume 30 km as maximum distance willing to travel, to find vicinity within state

In [40]:
business.latitude.min(), business.latitude.max()

(32.865882499999998, 56.033777299999997)

In [59]:
business.longitude.min(), business.longitude.max()

(-115.38655, 8.5492494004572208)

In [41]:
vincenty((57, -71), (57, -72)).km

60.77164272792849

In [42]:
vincenty((32, -71), (32, -72)).km

94.49280502196765

In [58]:
vincenty((45, -71), (45, -72)).km

78.8463347093115

for out dataset, businesses lie within latitudes 32 and 56 North

where 1 degree in longitude ranges between 60 to 95 km

so nearby business ( < 30) should also be within 0.5 degree of longitude (near 32N distances will be < 47km)

for latitudes, one degree difference is about 110 km, so will adopt 0.3 difference to consider nearby businesses

In [53]:
vincenty((44, -71), (45, -71)).km

111.12200826241184

In [54]:
vincenty((44, -70), (45, -70)).km

111.12200826241184

In [55]:
vincenty((55, -71), (56, -71)).km

111.33269933263972

In [56]:
vincenty((32, -71), (33, -71)).km

110.89560891986069

In [65]:
business[business.latitude > 56.033]

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
47693,{},4Yh57FsAyvDtZsq5F7fCWg,"[Hotels & Travel, Public Transportation, Trans...",Inverkeithing,Boreland Rd\nInverkeithing KY11 1NJ,{},56.033777,-3.393267,Inverkeithing Railway Station,[],True,3,3.0,FIF,business


In [101]:
business.groupby(['state']).filter(lambda r: len(r) > 20).state.unique()

array(['PA', 'NC', 'SC', 'WI', 'IL', 'AZ', 'NV', 'QC', 'ON', 'EDH', 'MLN',
       'BW'], dtype=object)

In [102]:
# ignore states with less than 20 businesses... we have 12 states with more than 20 businesses
business_state = business.groupby(['state']).filter(lambda r: len(r) >= 20)

In [103]:
business_state.groupby('state').size()

state
AZ     36500
BW      1055
EDH     3297
IL       808
MLN      161
NC      6835
NV     23591
ON       530
PA      4086
QC      5591
SC       325
WI      3066
dtype: int64

In [106]:
business_state.shape

(85845, 15)

In [111]:
business_state[business_state.review_count > 20].count()

attributes       25673
business_id      25673
categories       25673
city             25673
full_address     25673
hours            25673
latitude         25673
longitude        25673
name             25673
neighborhoods    25673
open             25673
review_count     25673
stars            25673
state            25673
type             25673
dtype: int64

In [127]:
len(business_state[business.state == 'AZ'].merge(review, how='inner', left_on='business_id', right_on='business_id').user_id.unique())

  if __name__ == '__main__':


247028

In [168]:
# To classify a business as Food:
business.attributes.iloc[3]['Good For']

{'breakfast': False,
 'brunch': False,
 'dessert': False,
 'dinner': False,
 'latenight': False,
 'lunch': True}

In [166]:
sum(business.attributes.iloc[3]['Good For'].values())

1

In [76]:
business_state.longitude.max()

8.5492494004572208

In [70]:
business_groups = business_state.groupby('state')

In [71]:
group = business_groups.get_group('EDH')

In [72]:
group.latitude.max()

56.000421000000003

In [766]:
nearby ={}
max_nearbies = 0

business_groups = business_state.groupby('state')

for name, group in business_groups:
    for i in range(0, len(group)):
        if group.latitude.iloc[i] < 80:
            near_group = group[(group.latitude < (group.latitude.iloc[i] + 1)) & (group.latitude > (group.latitude.iloc[i] - 1)) & 
                          (group.longitude < (group.longitude.iloc[i] + 1)) & (group.longitude > (group.longitude.iloc[i] - 1))]
        for j in range(0, len(near_group)):

            distance = vincenty(tuple(group.iloc[i][['latitude','longitude']]), tuple(near_group.iloc[j][['latitude','longitude']])).km
            if distance < 30:
                c += 1
                try:
                    nearby[group.business_id.iloc[i]].extend([near_group.business_id.iloc[j]])
                except KeyError as e:
                    nearby[group.business_id.iloc[i]] = [near_group.business_id.iloc[j]]
                try:
                    nearby[near_group.business_id.iloc[j]].extend([group.business_id.iloc[i]])
                except KeyError as e:
                    nearby[near_group.business_id.iloc[j]] = [group.business_id.iloc[i]]
        if max_nearbies < c:
            max_nearbies = c
    print (len(group))


KeyboardInterrupt: 

In [755]:
business_state[(abs(business_state.latitude) < (business_state.latitude[0] + 1))]

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{'Take-out': True, 'Has TV': False, 'Noise Lev...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Tuesday': {'open': '11:00', 'close': '21:00'...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business
1,"{'Outdoor Seating': False, 'Good For Groups': ...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.014660,Cool Springs Golf Center,[],False,5,2.5,PA,business
3,"{'Good For Groups': True, 'Has TV': True, 'Smo...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'open': '10:00', 'close': '19:00'...",40.408830,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business
4,"{'Parking': {'valet': False, 'validated': Fals...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Thursday': {'open': '11:00', 'close': '20:00...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business
5,"{'Good For Groups': True, 'Has TV': True, 'Smo...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106","{'Monday': {'open': '11:00', 'close': '02:00'}...",40.415486,-80.067549,Alexion's Bar & Grill,[Greentree],True,23,4.0,PA,business
6,{'Accepts Credit Cards': True},b12U9TFESStdy7CsTtcOeg,"[Auto Repair, Automotive, Tires]",Carnegie,"718A Hope Hollow Rd\nCarnegie, PA 15106","{'Monday': {'open': '07:30', 'close': '18:00'}...",40.394588,-80.084454,Flynn's Tire & Auto Service,[],True,9,2.5,PA,business
7,{'Good for Kids': True},Sktj1eHQFuVa-M4bgnEh8g,"[Active Life, Mini Golf]",Carnegie,"920 Forsythe Rd\nCarnegie\nCarnegie, PA 15106",{},40.405404,-80.076267,Forsythe Miniature Golf & Snacks,[Carnegie],False,4,4.0,PA,business
8,{},3ZVKmuK2l7uXPE6lXY4Dbg,"[Roofing, Home Services, Decks & Railing, Cont...",Carnegie,"8 Logan St\nCarnegie\nCarnegie, PA 15106",{},40.406281,-80.090390,Quaker State Construction,[Carnegie],True,3,2.5,PA,business
9,{},QoDa50dc7g62xciFygXB9w,"[Veterinarians, Pets]",Pittsburgh,"2080 Greentree Rd\nPittsburgh, PA 15220",{},40.392207,-80.069134,Greentree Animal Clinic,[],True,7,4.0,PA,business


In [None]:
len(nearby)

In [None]:
import pickle
# save the nearby dictionary as it take a long time to build
with open('nearby_businesses_dict.pickle', 'wb') as handle:
    pickle.dump(nearby, handle, protocol=pickle.HIGHEST_PROTOCOL)
# To load the dictionary un-comment the following:___
# with open('nearby_businesses_dict.pickle', 'rb') as handle:
#     b = pickle.load(handle)

In [534]:
# nearby ={}
# max_nearbies = 0
# business_groups = business_state.groupby('state')
# for group in business_groups:
    
    
    
# for i in range(0, len(business)):
#     nearby[business.business_id.iloc[i]]=[]
#     c = 0
#     for j in range(0, len(business)):
#         if i==j: continue
#         distance = vincenty(tuple(business.iloc[i][['latitude','longitude']]), tuple(business.iloc[j][['latitude','longitude']])).km
#         if distance < 30:
#             c += 1
#             nearby[business.business_id.iloc[i]].extend([business.business_id.iloc[j]])        
#     if c > max_nearbies:
#         max_nearbies = c

In [170]:
pd.__version__

'0.19.2'