## Part 0: Data Processing and EDA

### Importing Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import json
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegressionCV
from random import *
from math import log
from pandas.io.json import json_normalize
import copy
pd.set_option('display.max_columns', None)

from scipy.stats.stats import pearsonr   
%matplotlib inline

In [2]:
with open('dataset/business.json') as f:
    business_data = [json.loads(line) for line in f]
with open('dataset/user.json') as f:
    user_data = [json.loads(line) for line in f]  
# preprocessed review file with reviews only for restaurants 
with open('dataset/restaurant_reviews_trimmed.json') as f:
    review_data = [json.loads(line) for line in f]

# pull just restaurant data from business data
restaurant_data = [x for x in business_data if 'Restaurants' in x['categories']]

# convert array to list
restaurant_reviews = review_data[0]

## Part 1: Create a Baseline

### Getting global averages

In [3]:
user_total = [x['average_stars'] for x in user_data]
global_user_average = sum(user_total)/len(user_total)
print (global_user_average)

3.7118346541447185


In [4]:
restaurant_total = [x['stars'] for x in restaurant_data]
global_restaurant_average = sum(restaurant_total)/len(restaurant_total)
print (global_restaurant_average)

3.461104760428574


In [5]:
reviews_total = [x['stars'] for x in restaurant_reviews]
global_review_average = sum(reviews_total)/len(reviews_total)
print (global_review_average)

3.702161161664101


## Getting Global Averages

In [22]:
restaurant_dict = {}
for item in restaurant_data:
    restaurant_id = item['business_id']
    restaurant_dict[restaurant_id] = item

In [6]:
# for easy lookup based on user id
user_dict = {}
for item in user_data:
    user_id = item['user_id']
    user_dict[user_id] = item

In [7]:
user_deviations = {}
for item in user_data:
    user_id = item['user_id']
    user_deviations[user_id] = item['average_stars'] - global_user_average
    

In [8]:
restaurant_deviations = {}
for item in restaurant_data:
    rest_id = item['business_id']
    restaurant_deviations[rest_id] = item['stars'] - global_restaurant_average

### Rating the Model

In [9]:
#getting a random evaluation set of 20000
evaluation_set = np.random.choice(restaurant_reviews, size = 20000)


In [10]:
evaluation_df = pd.DataFrame(list(evaluation_set))

In [11]:
evaluation_df = evaluation_df.drop(['cool', 'date', 'funny','text','useful'], axis = 1)


In [12]:
def baseline(user_id, business_id):
    pred = global_review_average + user_deviations[user_id] + restaurant_deviations[business_id]
    return pred

In [13]:
evaluation_df['baseline_pred'] = [baseline(x,y) for (x,y) in zip(evaluation_df['user_id'],evaluation_df['business_id'])]

In [14]:
score = metrics.mean_squared_error(evaluation_df['stars'], evaluation_df['baseline_pred'])
print (score)

1.26203188939


In [15]:
del evaluation_df

## Part 2: Create a Regularized Regression

### Creating training, validating, and testing sets

In [16]:
# take 100000 reviews as sample
data_array = (np.random.choice(restaurant_reviews, size = 100000))
data_set = list(data_array)

In [17]:
# find all categories for one-hot encoding purposes
from collections import Counter
all_categories = []
for r in restaurant_data:
    if 'Restaurants' in r['categories']:
        for c in r['categories']:
            all_categories.append(c)

In [18]:
# take 150 most popular categories
counts = list (Counter(all_categories).items())
counts.sort(key=lambda x: x[1], reverse = True)
most_popular = [x[0] for x in counts[:150]]


In [19]:
expanded_reviews = copy.deepcopy(data_array)

In [23]:
# add business and user info

for review in expanded_reviews:
    #print (review)
    restaurant = review['business_id']
    user = review['user_id']
    restaurant_info = restaurant_dict[restaurant]
    #print (restaurant_info)
    user_info = user_dict[user]
    for attribute in restaurant_info:
        #print (attribute)
        if attribute not in ['is_open', 'latitude','longitude','name','business_id',
                             'neighborhood','address','city','postal_code','hours']:
            if attribute == 'categories':
                for c in most_popular:
                    if c in restaurant_info[attribute]:
                        review['R_' +  c] = 1
                    else:
                        review['R_' +  c] = 0
            else:         
                review['R_' + attribute] = restaurant_info[attribute]
    for attribute in user_info:
        if attribute not in ['user_id','name']:   
            if attribute == 'friends':
                review['U_friends'] = len(user_info[attribute])
            elif attribute == 'yelping_since':
                review['U_yelping_since'] = user_info[attribute][:4]
            elif attribute == 'elite':
                if user_info[attribute]:
                    review['U_elite'] = True
                else:
                    review['U_elite'] = False        
            else:
                review['U_' + attribute] = user_info[attribute] 

In [24]:
# create pandas dataframe
flatframe = json_normalize(expanded_reviews)
flatframe = flatframe.drop(['text','useful','funny','cool','date'], axis=1)

In [25]:
# change user since
flatframe['U_years_yelping'] = [2015 - int(x) for x in flatframe['U_yelping_since']]
flatframe.drop(['U_yelping_since'],axis = 1, inplace = True)

In [26]:
# drop ids
flatframe = flatframe.drop(['business_id', 'review_id', 'user_id'], axis = 1)

In [27]:
#one hot encode state
flatframe = pd.get_dummies(flatframe, columns = ['R_state'])

In [28]:
# train test split
msk = np.random.rand(len(flatframe)) < 0.5
data_train = flatframe[msk]
data_test = flatframe[~msk]

### Making the Model

In [31]:
from sklearn.linear_model import LinearRegression
Xtrain = data_train.drop(['stars'], axis = 1)
ytrain = data_train['stars']
Xtest = data_test.drop(['stars'], axis = 1)
ytest = data_test['stars']

model = LinearRegression()
model.fit(Xtrain, ytrain)

ValueError: could not convert string to float: 'no'

In [None]:
ypred = model.predict(Xtrain)
ypred_test = model.predict(Xtest)
predround = [int(round(x)) for x in ypred]
print ("The accuracy score of the linear model on the train set is {}"
       .format(metrics.accuracy_score(ytrain, predround)))
predround_test = [int(round(x)) for x in ypred_test]
print ("The accuracy score of the linear model on the test set is {}"
       .format(metrics.accuracy_score(ytest, predround_test)))

In [None]:
model_ridge = RidgeCV().fit(Xtrain, ytrain)
ridge_ypred = model_ridge.predict(Xtrain)
ridge_ypred_round = [int(round(x)) for x in ridge_ypred]
ridge_ypred_test = model_ridge.predict(Xtest)
ridge_ypred_test_round = [int(round(x)) for x in ridge_ypred_test]


print ("The accuracy score of the ridge model on the train set is {}"
       .format(metrics.accuracy_score(ytrain, ridge_ypred_round)))
print ("The accuracy score of the ridge model on the test set is {}"
       .format(metrics.accuracy_score(ytest, ridge_ypred_test_round)))


In [None]:
model_lasso = LassoCV().fit(Xtrain, ytrain)


lasso_ypred = model_lasso.predict(Xtrain)
lasso_ypred_round = [int(round(x)) for x in lasso_ypred]
lasso_ypred_test = model_lasso.predict(Xtest)
lasso_ypred_test_round = [int(round(x)) for x in lasso_ypred_test]

print ("The accuracy score of the lasso model on the train set is {}"
       .format(metrics.accuracy_score(ytrain, lasso_ypred_round)))
print ("The accuracy score of the lasso model on the test set is {}"
       .format(metrics.accuracy_score(ytest, lasso_ypred_test_round)))

In [None]:
base_df = flatframe[['stars', 'business_id', 'user_id']]

train_base = base_df[msk]
test_base = base_df[~msk]

base_pred = [baseline(x,y) for x,y in zip(train_base['user_id'],train_base['business_id'])]
base_pred_test = [baseline(x,y) for x,y in zip(test_base['user_id'],test_base['business_id'])]

print ("The accuracy score of the baseline model on the train set is {}"
       .format(metrics.accuracy_score(ytrain, base_pred)))
print ("The accuracy score of the baseline model on the test set is {}"
       .format(metrics.accuracy_score(ytest, base_pred_test)))


base_pred_2 = [baseline2(x,y) for x,y in zip(train_base['user_id'],train_base['business_id'])]
base_pred_test_2 = [baseline2(x,y) for x,y in zip(test_base['user_id'],test_base['business_id'])]
print ("The accuracy score of the baseline2 model on the train set is {}"
       .format(metrics.accuracy_score(ytrain, base_pred_2)))
print ("The accuracy score of the baseline2 model on the test set is {}"
       .format(metrics.accuracy_score(ytest, base_pred_test_2)))
