In [1]:
# Import deps
import gzip
import pprint
import boto3
import ujson
import pandas as pd
import numpy as np
## Download and parse the incoming data
### The data are [here](s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz)

In [2]:
!mkdir data
s3_client = boto3.client('s3')
bucket = 'aida-practice-data'
prefix = '/Users/shah/projects/vision_critical/apache_spark/yelp/data'

# List all objects within a S3 bucket path
response = s3_client.list_objects(Bucket = bucket)

# Loop through each file
for file in response['Contents']:

    # Get the file name
    name = file['Key'].rsplit('/', 1)
    
    if '.gz' not in name[1]:
        print('skipping dir ' + str(name))
        continue
    
    # Download each file to local disk
    s3_client.download_file(bucket, file['Key'], prefix + '/' + name[1])

skipping dir ['yelp', '']
skipping dir ['yelp/business', '']
skipping dir ['yelp/checkin', '']
skipping dir ['yelp/photos', '']
skipping dir ['yelp/reviews', '']
skipping dir ['yelp/tip', '']
skipping dir ['yelp/user', '']


In [3]:
!mkdir data/review && mv data/review.* data/review
!mkdir data/business && mv data/business.* data/business
!mkdir data/checkin && mv data/checkin.* data/checkin
!mkdir data/photos && mv data/photos.* data/photos
!mkdir data/tip && mv data/tip.* data/tip 
!mkdir data/user && mv data/user.* data/user

!ls -la data

total 0
drwxr-xr-x   8 shah  staff   272 20 Dec 13:50 [34m.[m[m
drwxr-xr-x   9 shah  staff   306 20 Dec 13:50 [34m..[m[m
drwxr-xr-x   6 shah  staff   204 20 Dec 13:50 [34mbusiness[m[m
drwxr-xr-x   5 shah  staff   170 20 Dec 13:50 [34mcheckin[m[m
drwxr-xr-x   6 shah  staff   204 20 Dec 13:50 [34mphotos[m[m
drwxr-xr-x  97 shah  staff  3298 20 Dec 13:50 [34mreview[m[m
drwxr-xr-x  23 shah  staff   782 20 Dec 13:50 [34mtip[m[m
drwxr-xr-x  26 shah  staff   884 20 Dec 13:50 [34muser[m[m


In [2]:
# read in all the compressed business data
with gzip.open('data/business/business.json1.gz', 'rb') as f:
    file_content = f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json2.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json3.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list
with gzip.open('data/business/business.json4.gz', 'rb') as f:
    file_content += f.readlines() # reads each line as bytes and places in list

# stringify each file_content list element to a str and strip \n
def cleanse_file_content_list():
    for line in range(0, len(file_content)):
        line_str = file_content[line].decode('utf-8')
        file_content[line] = line_str.rstrip('\n')
        file_content[line] = ujson.loads(file_content[line])
        
cleanse_file_content_list()

In [3]:
# Read file_contents into dataframe and change column 'city' string to lowercase
df = pd.DataFrame(file_content)
df.info()
df['city'] = df['city'].str.lower()
# remove & drop empty values
df['city'].replace('', np.nan, inplace=True)
df.dropna(subset=['city'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 15 columns):
address         156639 non-null object
attributes      156639 non-null object
business_id     156639 non-null object
categories      156639 non-null object
city            156639 non-null object
hours           156639 non-null object
is_open         156639 non-null int64
latitude        156638 non-null float64
longitude       156638 non-null float64
name            156639 non-null object
neighborhood    156639 non-null object
postal_code     156639 non-null object
review_count    156639 non-null int64
stars           156639 non-null float64
state           156639 non-null object
dtypes: float64(3), int64(2), object(10)
memory usage: 17.9+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 156636 entries, 0 to 156638
Data columns (total 15 columns):
address         156636 non-null object
attributes      156636 non-null object
business_id     156636 non-null object
categories 

In [4]:
# extract 'city' & 'stars' column to new dataframe
city_df = df[['city', 'stars']]
city_df.info()
city_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 156636 entries, 0 to 156638
Data columns (total 2 columns):
city     156636 non-null object
stars    156636 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.6+ MB


Unnamed: 0,city,stars
0,richmond heights,2.0
1,charlotte,4.5
2,toronto,4.5
3,scottsdale,3.0
4,phoenix,4.5
5,phoenix,5.0
6,phoenix,5.0
7,pittsburgh,2.5
8,mcmurray,5.0
9,gilbert,5.0


In [5]:
# groupby city and aggregate ratings
grouped_city_df = city_df.groupby('city', as_index=False).agg({'stars': 'mean'})
grouped_city_df

Unnamed: 0,city,stars
0,110 las vegas,5.000000
1,aberdour,4.000000
2,aberlady,4.250000
3,agincourt,2.500000
4,ahwahtukee,5.000000
5,ahwatukee,3.875000
6,ahwatukee foothills village,5.000000
7,aichwald,3.500000
8,ajax,3.303398
9,alburg,5.000000


In [6]:
# One hot embed city strings
dummy_cities_df = pd.get_dummies(grouped_city_df['city'])
# Concatenate aggregated
aggr_city_df = pd.concat([grouped_city_df, dummy_cities_df], axis=1)
aggr_city_df

Unnamed: 0,city,stars,110 las vegas,aberdour,aberlady,agincourt,ahwahtukee,ahwatukee,ahwatukee foothills village,aichwald,...,wolfschlugen,woodbridge,woodbridge (vaughan),woodmere,woodmere village,york,york regional municipality,youngtown,île des soeurs,île-des-soeurs
0,110 las vegas,5.000000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,aberdour,4.000000,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,aberlady,4.250000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,agincourt,2.500000,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ahwahtukee,5.000000,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,ahwatukee,3.875000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ahwatukee foothills village,5.000000,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,aichwald,3.500000,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,ajax,3.303398,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,alburg,5.000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(aggr_city_df.iloc[:,2:], aggr_city_df['stars'], test_size=0.33, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(aggr_city_df['city'], aggr_city_df['stars'], test_size=0.33, random_state=42)


In [10]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [11]:
print(regr.feature_importances_)

[ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.

In [12]:
print(regr.score(X_test, y_test))

-0.00190863446355


In [13]:
from sklearn.tree import DecisionTreeRegressor
dt_regr = RandomForestRegressor(max_depth=2, random_state=0)
dt_regr.fit(X_train, y_train)
print(dt_regr.score(X_test, y_test))

-0.00190863446355


In [82]:
from sklearn.model_selection import KFold
X = np.array(aggr_city_df.iloc[:,2:])
y = np.array(aggr_city_df['stars'])
kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


TRAIN: [195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302
 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338
 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374
 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
 429 430 431 432 433 434 435 436 437 438 439