## Import python dependencies

In [1]:
import gzip
import pprint
import boto3
import ujson
import pandas as pd
import numpy as np
from math import sqrt

## Download data from AWS S3 buckets 
If you do not have access to the aida-practice-data bucket use the following public S3 bucket:
[s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz](s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz)

In [5]:
# Create data dir
!mkdir data

# Instantiate s3 client
s3_client = boto3.client('s3')
bucket = 'aida-practice-data'
prefix = './data'

# List all objects within a S3 bucket path
response = s3_client.list_objects(Bucket = bucket)

# Loop through each file
for file in response['Contents']:

    # Get the file name
    name = file['Key'].rsplit('/', 1)
    
    if '.gz' not in name[1]:
        print('skipping dir ' + str(name))
        continue
    
    # Download each file to local disk
    s3_client.download_file(bucket, file['Key'], prefix + '/' + name[1])

mkdir: cannot create directory ‘data’: File exists
skipping dir ['yelp', '']
skipping dir ['yelp/business', '']
skipping dir ['yelp/checkin', '']
skipping dir ['yelp/photos', '']
skipping dir ['yelp/reviews', '']
skipping dir ['yelp/tip', '']
skipping dir ['yelp/user', '']


## Collate data in appropriate directories

In [6]:
!mkdir data/review && mv data/review.* data/review
!mkdir data/business && mv data/business.* data/business
!mkdir data/checkin && mv data/checkin.* data/checkin
!mkdir data/photos && mv data/photos.* data/photos
!mkdir data/tip && mv data/tip.* data/tip 
!mkdir data/user && mv data/user.* data/user

!ls -la data

total 32
drwxrwxr-x 8 shah shah 4096 Dec 30 01:30 .
drwxrwxr-x 5 shah shah 4096 Dec 29 23:54 ..
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 business
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 checkin
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 photos
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 review
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 tip
drwxrwxr-x 2 shah shah 4096 Dec 30 01:30 user


## Read the business data from the compressed json*.gz into memory

In [2]:
# Iteratively ead in all the compressed business data into the file_content varia
with gzip.open('data/business/business.json1.gz', 'rb') as f:
    file_content = f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json2.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json3.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json4.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list

# Stringify each file_content list element to a str and strip \n
def cleanse_file_content_list():
    for line in range(0, len(file_content)):
        line_str = file_content[line].decode('utf-8')
        file_content[line] = line_str.rstrip('\n')
        file_content[line] = ujson.loads(file_content[line])
        
cleanse_file_content_list()

## Read file_contents into dataframe and normalize column 'city' strings to lowercase as well as drop empty values


In [3]:
# Create datafrom from file_contents
df = pd.DataFrame(file_content)

print('Business info dataframe prior to cleansing \n')
df.info()

df['city'] = df['city'].str.lower()

print('\nBusiness info dataframe after cleansing \n')
# Remove & drop empty values
df['city'].replace('', np.nan, inplace=True)
df.dropna(subset=['city','latitude','longitude','stars'], inplace=True)
df.info()

Business info dataframe prior to cleansing 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 15 columns):
address         156639 non-null object
attributes      156639 non-null object
business_id     156639 non-null object
categories      156639 non-null object
city            156639 non-null object
hours           156639 non-null object
is_open         156639 non-null int64
latitude        156638 non-null float64
longitude       156638 non-null float64
name            156639 non-null object
neighborhood    156639 non-null object
postal_code     156639 non-null object
review_count    156639 non-null int64
stars           156639 non-null float64
state           156639 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 17.9+ MB

Business info dataframe after cleansing 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156635 entries, 0 to 156638
Data columns (total 15 columns):
address         156635 non-null object
a

## Extract the 'city' & 'stars' column to aggreate mean star venue rating by city

In [4]:
# Extract 'city' & 'stars' column to new dataframe
city_df = df[['city', 'stars']]
print('City DF Summary: \n',city_df.describe())
city_df.head()

City DF Summary: 
                stars
count  156635.000000
mean        3.647164
std         0.977647
min         1.000000
25%         3.000000
50%         3.500000
75%         4.500000
max         5.000000


Unnamed: 0,city,stars
0,richmond heights,2.0
1,charlotte,4.5
2,toronto,4.5
3,scottsdale,3.0
4,phoenix,4.5


In [5]:
# Groupby city and aggregate ratings
grouped_city_df = city_df.groupby('city', as_index=False).agg({'stars': 'mean'})
grouped_city_df.tail()

Unnamed: 0,city,stars
970,york,3.466292
971,york regional municipality,4.0
972,youngtown,4.011628
973,île des soeurs,4.0
974,île-des-soeurs,3.5


# TASK 1: City Model
The venues belong to different cities.  You can image that the ratings in some
cities are probably higher than others and use this as an estimator.

Build an estimator that uses `groupby` and `mean` to compute the
average rating in that city.  Use this as a predictor.

In [6]:
# Add index for to grouped_city_df for lookup
grouped_city_df.index = grouped_city_df['city']

# Returns the mean of all venue ratings for the city passed in as 'record'
def city_model(record):
    return grouped_city_df['stars'][record]

# Passing in the name of a city 
city_model('agincourt')

2.5

## **Question:** In the absence of any information about a city, what score would you assign a restaurant in that city?

### **Answer:** The score assigned in this case would be an mean of all the means of all city.

## TASK 2: Lat Long Model
You can imagine that a city-based model might not be sufficiently fine-grained.
For example, we know that some neighborhoods are trendier than others.  We
might consider a K Nearest Neighbors or Random Forest based on the latitude
longitude as a way to understand neighborhood dynamics.

You should implement a generic `ColumnSelectTransformer` that is passed which
columns to select in the transformer and use a non-linear model like
`sklearn.neighbors.KNeighborsRegressor` or
`sklearn.ensemble.RandomForestRegressor` as the estimator (why would you choose
a non-linear model?).  Bonus points if you wrap the estimator in
`grid_search.GridSearchCV` and use cross-validation to determine the optimal
value of the parameters.

In [126]:
# from sklearn.base import BaseEstimator, TransformerMixin

# class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    
#     def __init__(self, columnList):
#         # initialization code
#         self.columnList = columnList
        
#     def fit(self, X, y=None):
#         # fit the transformation
        
#         return self

#     def transform(self, X):
#         return self.x # transformation

In [None]:
# # Label encode cities
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
fitted_le = le.fit(grouped_city_df['city'])
df['le_city'] = le.transform(df['city'])

from sklearn.model_selection import KFold
X = np.array(df[['le_city','latitude','longitude']])
y = np.array(df['stars'])
kf = KFold(n_splits=5)

from sklearn.neighbors import KNeighborsRegressor
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    neigh_model = KNeighborsRegressor(n_neighbors=1)
    neigh_model.fit(X, y)
    
    print('KNeighborsRegressor Model RMSE: ',sqrt(np.mean((neigh_model.predict(X_test) - y_test)**2)))

knr_param_grid = {
    "n_neighbors":[1,2,3,4,5],
    "weights" : ["uniform","distance"],
    "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"],
    "p" : [1,2],
    "metric" : ["euclidean","manhattan","chebyshev","minkowski","seuclidean","mahalanobis"]
}

from sklearn.model_selection import GridSearchCV
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    optimized_knr_model = KNeighborsRegressor()
    knr_clf = GridSearchCV(optimized_knr_model, param_grid = knr_param_grid, n_jobs=3)
    knr_clf.fit(X, y)
    
    print('OPTIMIZED KNeighborsRegressor Model RMSE: ', sqrt(np.mean((knr_clf.predict(X_test) - y_test)**2)))

from sklearn.ensemble import RandomForestRegressor
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    rf_model = RandomForestRegressor(max_depth=2, random_state=0)
    rf_model.fit(X, y)
    
    print('RandomForest Model RMSE: ',sqrt(np.mean((rf_model.predict(X_test) - y_test)**2)))

# Use a full grid over all parameters
rf_param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3],
    "min_samples_split": [2, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False]
}

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    optimized_rf_model = RandomForestRegressor()
    rf_clf = GridSearchCV(optimized_rf_model, param_grid = rf_param_grid, n_jobs=3)
    clf.fit(X, y)
    
    print('OPTIMIZED RandomForest Model RMSE: ', sqrt(np.mean((rf_clf.predict(X_test) - y_test)**2)))

KNeighborsRegressor Model RMSE:  0.5397648776574409
KNeighborsRegressor Model RMSE:  0.5443068902216927
KNeighborsRegressor Model RMSE:  0.5415582562477849
KNeighborsRegressor Model RMSE:  0.5526223984840691
KNeighborsRegressor Model RMSE:  0.5789077899450875
