In [72]:
# Import some libraries we'll need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math as math
import scipy.sparse as sps
from sklearn.model_selection import train_test_split

In [35]:
def read_review_data(file_name):
    review_data = pd.read_csv(file_name)
    print "Sample Data"
    print "-----------"
    print review_data.sample(5)
    return review_data

In [36]:
business_data = pd.read_csv('../business.csv')
business_data.head(1)

Unnamed: 0,hours.Wednesday,attributes.RestaurantsDelivery,attributes.Open24Hours,attributes.DogsAllowed,attributes.CoatCheck,postal_code,attributes.Smoking,hours.Thursday,attributes.DietaryRestrictions,city,...,attributes.RestaurantsAttire,hours.Sunday,attributes.GoodForMeal,attributes.GoodForDancing,attributes.AcceptsInsurance,attributes.RestaurantsReservations,attributes,attributes.RestaurantsTakeOut,attributes.BikeParking,attributes.OutdoorSeating
0,11:0-21:0,False,,,,T2E 6L6,,11:0-21:0,,Calgary,...,casual,,,,,True,"{'BusinessParking': ""{'garage': False, 'street...",True,False,False


In [37]:
business_data.set_index('business_id', inplace = True)

In [52]:
def get_restaurant_data(review_data, business_data):
    review_data['city'] = review_data.apply(lambda x: business_data.loc[x['business_id'], 'city'], axis=1)
    review_data['cat'] = review_data.apply(lambda x: business_data.loc[x['business_id'], 'categories'], axis=1)
    restaurant_reviews = review_data[review_data['cat'].str.contains("Restaurants",na=False)]
    return restaurant_reviews

In [47]:
def get_city_restaurant_data(city, restaurant_reviews):
    city_rest_data = restaurant_reviews.loc[review_data['city'] == city]
    return city_rest_data

In [91]:
def build_sparse_matrix(df):
    users = list(df['user_id'].unique())
    rests = list(df['business_id'].unique())
    data = df['stars'].tolist()
    row = df['user_id'].astype('category', categories=users).cat.codes
    col = df['business_id'].astype('category', categories=rests).cat.codes
    rating_matrix = csr_matrix((data, (row, col)), shape=(len(users), len(rests)))
    return rating_matrix

In [92]:
def split_train_test(df):
    '''Split for train and test data'''
    rating_matrix = build_sparse_matrix(df)
    train, test = split_rating_matrix(rating_matrix)
    num_train = train.shape[0]
    num_test = test.shape[0]
    
    print "Number of training samples: {}".format(num_train)
    print "Number of test samples: {}".format(num_test)
    
    return train, test

In [None]:
def get_sparse_matrix(df):
    '''Converts the df into a sparse ratings matrix'''
    unique_users = df['user_id'].unique().tolist()
    unique_rests = df['business_id'].unique().tolist()
    data = df['stars'].tolist()
    row = df['user_id'].astype('category', categories=unique_users).cat.codes
    col = df['business_id'].astype('category', categories=unique_bus).cat.codes
    sparse_matrix = csr_matrix((data, (row, col)), shape=(len(unique_users), len(unique_rests)))
    return sparse_matrix

In [93]:
def split_rating_matrix(rating_matrix, samples = 2):
    users, restaurants =  rating_matrix.nonzero()
    test = csr_matrix(rating_matrix.shape)
    train = rating_matrix.copy()

    num_users = rating_matrix.shape[0]
    for u in range(num_users):
        idx = restaurants[np.where(users == u)]
        np.random.shuffle(idx)
        test_idx = idx[-samples:]
        train_idx = idx[:-samples]
        test[u,test_idx] = rating_matrix[u,test_idx]
        train[u,test_idx] = 0

    data = np.array(train[train.nonzero()])[0]
    row = train.nonzero()[0]
    col = train.nonzero()[1]
    size = train.shape
    train = csr_matrix((data,(row,col)),shape = size)

    mult = train.multiply(test)
    assert(mult.nnz == 0)
    
    return train, test

In [94]:
def get_rating_matrix(city_review_data):
    unique_users = city_review_data['user_id'].unique().tolist()
    unique_rests = city_review_data['business_id'].unique().tolist()
    num_user = len(unique_users)
    num_rest = len(unique_rests)

    print "Number of Users: {}".format(num_user)
    print "Number of Restaurent: {}".format(num_rest)
    
    train, test = split_train_test(city_review_data)
    return train, test

In [100]:
#sparsity of rating matrix
def get_sparsity(rating_matrix):
    total_filled_values = rating_matrix.nnz
    return 1 - float(total_filled_values)/float(rating_matrix.shape[0]*rating_matrix.shape[1])

In [96]:
# reading the review data
review_data = read_review_data('15core.csv')
print '-----------------------------'
print 'Filtering the Restaurant data'
print '-----------------------------'
restaurant_reviews = get_restaurant_data(review_data, business_data)
print restaurant_reviews.head(3)
print '-----------------------------'
print 'restaurant reviews top 10 cities'
print '-----------------------------'
print restaurant_reviews['city'].value_counts().head(10)
print '-----------------------------'
toronto_15 = get_city_restaurant_data('Toronto', restaurant_reviews)
toronto_15_train, toronto_15_test = get_rating_matrix(toronto_15)

Sample Data
-----------
                        user_id             business_id  stars
1365815  51pZ_Jwdm4RG40TxjdpKGg  ZygcwoZYchTAaQFWiwinZg      4
921193   wMC90PiyisLn7omSU7jMhg  K8M3OeFCcAnxuxtTc0BQrQ      4
250235   _1xH3x70kO1l17cYftaHqA  1DP5vHxJzCDbgawv0WC3yA      3
204897   _hsruQCw6UTnjlQiuSpScg  SV8y4bp5HmgURzvUC2Rs9w      3
502093   1J2zVswrcASag0M34sVsFw  yNPh5SO-7wr8HPpVCDPbXQ      3
-----------------------------
Filtering the Restaurant data
-----------------------------
                  user_id             business_id  stars        city  \
0  0pf5VuzE4_1pwj5NJHG5TQ  vsFFbN71ehRCp46KeR5RdQ      5  Scottsdale   
1  0pf5VuzE4_1pwj5NJHG5TQ  Jj8ubiwwuCR-rrhrrjcryw      2     Phoenix   
2  0pf5VuzE4_1pwj5NJHG5TQ  YTbKmjGTdn4YzoJXTC1u7g      3     Gilbert   

                                                 cat  
0  American (Traditional), Breakfast & Brunch, Sa...  
1                            Vietnamese, Restaurants  
2  Restaurants, Beer, Wine & Spirits, Cafes, Acti...  

  This is separate from the ipykernel package so we can avoid doing imports until
  if sys.path[0] == '':


Number of training samples: 7103
Number of test samples: 7103


In [97]:
print toronto_15_train.shape
print toronto_15_test.shape

(7103, 2819)
(7103, 2819)


In [101]:
print get_sparsity(toronto_15_train)

0.993325245112


In [115]:
def fill_missing_with_mean(matrix):
    dense = matrix.toarray()
    ndf = pd.DataFrame(dense)
    ndf = ndf.replace(to_replace = 0, value = np.nan)
    ndf = ndf.fillna(ndf.mean())
    return ndf.values

In [122]:
from sklearn.preprocessing import scale
toronto_15_mean_filled = fill_missing_with_mean(toronto_15_train)
mean_centered = scale(toronto_15_mean_filled, axis = 1, with_std=False)

[[ 0.45900364 -0.54099636 -1.54099636 ... -0.31877414 -0.24687872
   0.74471792]
 [-0.11157784  0.04281088 -0.66839602 ... -0.32117379 -0.24927837
   0.74231827]
 [-0.10191182  0.05247689 -0.65873    ... -0.31150778 -0.23961236
   0.75198428]
 ...
 [-0.10891537  0.04547334 -0.66573356 ... -0.31851133 -0.24661591
   0.74498073]
 [-0.11149051  0.0428982  -0.6683087  ... -0.32108647 -0.24919105
   0.74240559]
 [-0.11167411  0.04271461 -0.66849229 ... -0.32127007 -0.24937464
   0.742222  ]]


In [127]:
from sklearn.decomposition import PCA
transformer = PCA(n_components='mle')
transformer.fit(mean_centered) 

fig = plt.figure(figsize=(8,5))
plt.plot(transformer.explained_variance_, '-', linewidth=1)
plt.title('Elbow Plot')
# plt.xticks(np.arange(0, S_phx_res.shape[0]+1, 50), rotation='vertical')
plt.xlabel('Principal Component')
plt.ylabel('Singular Values')
plt.show()

KeyboardInterrupt: 