## Import python dependencies

In [106]:
import gzip
import pprint
import boto3
import ujson
import pandas as pd
import numpy as np
from math import sqrt

## Download data from AWS S3 buckets 
If you do not have access to the aida-practice-data bucket use the following public S3 bucket:
[s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz](s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz)

In [5]:
# Create data dir
!mkdir data

# Instantiate s3 client
s3_client = boto3.client('s3')
bucket = 'aida-practice-data'
prefix = './data'

# List all objects within a S3 bucket path
response = s3_client.list_objects(Bucket = bucket)

# Loop through each file
for file in response['Contents']:

    # Get the file name
    name = file['Key'].rsplit('/', 1)
    
    if '.gz' not in name[1]:
        print('skipping dir ' + str(name))
        continue
    
    # Download each file to local disk
    s3_client.download_file(bucket, file['Key'], prefix + '/' + name[1])

mkdir: cannot create directory ‘data’: File exists
skipping dir ['yelp', '']
skipping dir ['yelp/business', '']
skipping dir ['yelp/checkin', '']
skipping dir ['yelp/photos', '']
skipping dir ['yelp/reviews', '']
skipping dir ['yelp/tip', '']
skipping dir ['yelp/user', '']


## Collate data in appropriate directories

In [6]:
!mkdir data/review && mv data/review.* data/review
!mkdir data/business && mv data/business.* data/business
!mkdir data/checkin && mv data/checkin.* data/checkin
!mkdir data/photos && mv data/photos.* data/photos
!mkdir data/tip && mv data/tip.* data/tip 
!mkdir data/user && mv data/user.* data/user

!ls -la data

total 32
drwxrwxr-x 8 shah shah 4096 Dec 30 01:30 .
drwxrwxr-x 5 shah shah 4096 Dec 29 23:54 ..
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 business
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 checkin
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 photos
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 review
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 tip
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 user


## Read the business data from the compressed json*.gz into memory

In [107]:
# Iteratively ead in all the compressed business data into the file_content varia
with gzip.open('data/business/business.json1.gz', 'rb') as f:
    file_content = f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json2.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json3.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json4.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list

# Stringify each file_content list element to a str and strip \n
def cleanse_file_content_list():
    for line in range(0, len(file_content)):
        line_str = file_content[line].decode('utf-8')
        file_content[line] = line_str.rstrip('\n')
        file_content[line] = ujson.loads(file_content[line])
        
cleanse_file_content_list()

## Read file_contents into dataframe and normalize column 'city' strings to lowercase as well as drop empty values


In [113]:
# Create datafrom from file_contents
df = pd.DataFrame(file_content)

print('Business info dataframe prior to cleansing \n')
df.info()

df['city'] = df['city'].str.lower()

print('\nBusiness info dataframe after cleansing \n')
# Remove & drop empty values
df['city'].replace('', np.nan, inplace=True)
df.dropna(subset=['city'], inplace=True)
df.info()

Business info dataframe prior to cleansing 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 15 columns):
address         156639 non-null object
attributes      156639 non-null object
business_id     156639 non-null object
categories      156639 non-null object
city            156639 non-null object
hours           156639 non-null object
is_open         156639 non-null int64
latitude        156638 non-null float64
longitude       156638 non-null float64
name            156639 non-null object
neighborhood    156639 non-null object
postal_code     156639 non-null object
review_count    156639 non-null int64
stars           156639 non-null float64
state           156639 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 17.9+ MB

Business info dataframe after cleansing 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156636 entries, 0 to 156638
Data columns (total 15 columns):
address         156636 non-null object
a

## Extract the 'city' & 'stars' column to aggreate mean star venue rating by city

In [117]:
# Extract 'city' & 'stars' column to new dataframe
city_df = df[['city', 'stars']]
print('City DF Summary: \n',city_df.describe())
city_df.head()

City DF Summary: 
                stars
count  156636.000000
mean        3.647160
std         0.977645
min         1.000000
25%         3.000000
50%         3.500000
75%         4.500000
max         5.000000


Unnamed: 0,city,stars
0,richmond heights,2.0
1,charlotte,4.5
2,toronto,4.5
3,scottsdale,3.0
4,phoenix,4.5


In [118]:
# Groupby city and aggregate ratings
grouped_city_df = city_df.groupby('city', as_index=False).agg({'stars': 'mean'})
grouped_city_df.tail()

Unnamed: 0,city,stars
970,york,3.466292
971,york regional municipality,4.0
972,youngtown,4.011628
973,île des soeurs,4.0
974,île-des-soeurs,3.5


# TASK 1: City Model
The venues belong to different cities.  You can image that the ratings in some
cities are probably higher than others and use this as an estimator.

Build an estimator that uses `groupby` and `mean` to compute the
average rating in that city.  Use this as a predictor.

In [109]:
# Add index for to grouped_city_df for lookup
grouped_city_df.index = grouped_city_df['city']

# Returns the mean of all venue ratings for the city passed in as 'record'
def city_model(record):
    return grouped_city_df['stars'][record]

# Passing in the name of a city 
city_model('agincourt')

## **Question:** In the absence of any information about a city, what score would you assign a restaurant in that city?

### **Answer:** The score assigned in this case would be an mean of all the means of all city.

## TASK 2: Lat Long Model
You can imagine that a city-based model might not be sufficiently fine-grained.
For example, we know that some neighborhoods are trendier than others.  We
might consider a K Nearest Neighbors or Random Forest based on the latitude
longitude as a way to understand neighborhood dynamics.

You should implement a generic `ColumnSelectTransformer` that is passed which
columns to select in the transformer and use a non-linear model like
`sklearn.neighbors.KNeighborsRegressor` or
`sklearn.ensemble.RandomForestRegressor` as the estimator (why would you choose
a non-linear model?).  Bonus points if you wrap the estimator in
`grid_search.GridSearchCV` and use cross-validation to determine the optimal
value of the parameters.

In [126]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, columnList):
        # initialization code
        self.x = ['1']

    def fit(self, X, y=None):
        # fit the transformation
        # ...
        return self

    def transform(self, X):
        return self.x # transformation

In [129]:
p = ColumnSelectTransformer()
p.transform(X=['3'])

['1']

In [103]:
# Label encode cities
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
fitted_le = le.fit(grouped_city_df['city'])

In [34]:
# create a list of avg star rating to match city_df entries
mean_city_star = []
for i in city_df['city'].iteritems():
    mean_city_star.append(grouped_city_df.loc[i[1]]['stars'])

[3.4666666666666668,
 3.5779968245567613,
 3.5164310155594292,
 3.9424101198402131,
 3.6844979560551865,
 3.6844979560551865,
 3.6844979560551865,
 3.6555907172995781,
 3.3115942028985508,
 3.8082892416225751,
 3.7130040322580644,
 3.9424101198402131,
 3.6361641221374046,
 3.4729493891797558,
 3.5943396226415096,
 3.75,
 3.6361641221374046,
 3.6121181604565291,
 3.6121181604565291,
 3.7130040322580644,
 3.5604395604395602,
 3.6555907172995781,
 3.3445595854922279,
 3.8082892416225751,
 3.6361641221374046,
 3.805241379310345,
 3.5164310155594292,
 3.7229951690821257,
 3.6555907172995781,
 3.9424101198402131,
 3.7524271844660193,
 3.6554825319958493,
 3.3558951965065504,
 3.6361641221374046,
 3.2298701298701298,
 3.2298701298701298,
 3.6493055555555554,
 3.482532751091703,
 3.5164310155594292,
 3.7130040322580644,
 3.7130040322580644,
 3.6844979560551865,
 3.7515068493150685,
 3.3033980582524274,
 3.5164310155594292,
 3.2182875264270612,
 3.6844979560551865,
 3.4199999999999999,
 3.71300

In [89]:
# Label encode all cities in main DF & add mean stars column
df['mean_star'] = mean_city_star
df['le_cities'] = le.transform(df['city'])
mean_city_df = df[['city', 'mean_star','stars', 'le_cities']]

mean_city_df.head()

Unnamed: 0,city,mean_star,stars,le_cities
0,richmond heights,3.466667,2.0,692
1,charlotte,3.577997,4.5,133
2,toronto,3.516431,4.5,887
3,scottsdale,3.94241,3.0,783
4,phoenix,3.684498,4.5,641


In [53]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(aggr_city_df.iloc[:,2:], aggr_city_df['mean_star'], test_size=0.33, random_state=42)

In [97]:
# LASSO Regression
from sklearn import linear_model
lasso_reg = linear_model.Lasso(alpha = 0.1)
lasso_reg.fit(grouped_city_df['le_cities'].reshape(-1,1), grouped_city_df['stars'])

  after removing the cwd from sys.path.


Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [100]:
#mean square error 
np.mean((lasso_reg.predict(mean_city_df['le_cities'].reshape(-1,1)) - mean_city_df['stars'])**2)

  


0.95874059026268721

In [63]:
# Ridge regression
ridge_reg = linear_model.Ridge(alpha = 0.5)
ridge_reg.fit(X_train, y_train)
np.mean((ridge_reg.predict(X_test) - y_test)**2)

0.0021867822669117596

In [67]:
# KFold Lasso Model 
from sklearn.model_selection import KFold
X = np.array(aggr_city_df.iloc[:,2:])
y = np.array(aggr_city_df['mean_star'])
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lr = linear_model.Lasso(alpha = 0.1)
    lr.fit(X_train, y_train)
    
    print('Lasso Model Mean Square error: ',np.mean((lr.predict(X_test) - y_test)**2))

Lasso Model Mean Square error:  0.0264765871817
Lasso Model Mean Square error:  0.0258753733522
Lasso Model Mean Square error:  0.0263548062497
Lasso Model Mean Square error:  0.0253535645824
Lasso Model Mean Square error:  0.0260295039352


In [68]:
# Kfold Ridge Regression
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    rr = linear_model.Ridge(alpha = 0.5)
    rr.fit(X_train, y_train)
    
    print('Ridge Regression Model Mean Square error: ',np.mean((rr.predict(X_test) - y_test)**2))

Ridge Regression Model Mean Square error:  0.00212346840167
Ridge Regression Model Mean Square error:  0.00178717258651
Ridge Regression Model Mean Square error:  0.00260168597457
Ridge Regression Model Mean Square error:  0.00275938849819
Ridge Regression Model Mean Square error:  0.002537667576
