# Project 5: Group Project
#### Author: Adam Pardo, Brandon Bergeron, Eric Bayless, Ramesh Babu

### 01 - Data Collection and Data Cleaning

Task:

Information: 

In [1]:
# import libraries here 
import pandas as pd
import os

In [2]:
# import Business.json
business_json_path = './data/yelp_academic_dataset_business.json'
business = pd.read_json(business_json_path, lines=True)

In [23]:
# function to pull business dataset by city, with or without sampling

def city_restaurants(business_json, city_name, samples=None):
    #--load restaurants in city_name
    location = business_json[business_json['city'] == city_name]
    location = location.dropna(subset=['categories'])
    
    #--keep restaurants with reviews between 100-300
    location_rest = location[location['categories'].str.contains('Restaurant')]
    location_rest_small = location_rest[(location_rest['review_count'] > 100) & (location_rest['review_count'] < 300)]
    location_rest_small.reset_index(drop=True, inplace=True)
    
    #--returns all restaurants if no samples
    if samples == None:
        location_rest_small.to_csv(f'./data/{city_name.replace(" ", "_")}_restaurants.csv', index = False)
        return location_rest_small
    
    else:
        #creation of sample restaurants
        sample_rest = pd.concat([location_rest_small[location_rest_small['is_open'] == 1].sample(samples), location_rest_small[location_rest_small['is_open'] == 0].sample(samples)])
        sample_rest.reset_index(drop=True, inplace=True)
    
        sample_rest.to_csv(f'./data/{city_name.replace(" ", "_")}_{samples*2}.csv', index = False)
        return sample_rest

In [32]:
# import Review.json
review_json_path = './data/yelp_academic_dataset_review.json'
size = 500_000
#rows = 1_000_000

review_reader = pd.read_json(review_json_path, lines=True,
                      dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize=size)

In [33]:
def business_reviews(review, business_filepath, output_filepath):
    df = pd.read_csv(business_filepath)
    chunk_list = []
    
    for chunk_review in review:
        chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)
        chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
        chunk_merged = pd.merge(df, chunk_review, on='business_id', how='left')
        print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
        chunk_list.append(chunk_merged)
        
    df_reviews = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)
    
    df_reviews.dropna(subset=['text'], inplace=True)
    df_reviews.reset_index(drop=True, inplace=True)
    #uncomment if you want csv of business in given city
    df_reviews.to_csv(f'{output_filepath}', index = False)
    return df_reviews

# https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88

In [34]:
# function that joins city business information with yelp reviews
business_reviews(review_reader, './data/Las_Vegas_400.csv', 'Las_Vegas_400_reviews.csv')

5081 out of 500,000 related reviews
4630 out of 500,000 related reviews
5014 out of 500,000 related reviews
5035 out of 500,000 related reviews
4368 out of 500,000 related reviews
5158 out of 500,000 related reviews
5314 out of 500,000 related reviews
5193 out of 500,000 related reviews
5823 out of 500,000 related reviews
4862 out of 500,000 related reviews
5364 out of 500,000 related reviews
4758 out of 500,000 related reviews
4682 out of 500,000 related reviews
4807 out of 500,000 related reviews
4218 out of 500,000 related reviews
4293 out of 500,000 related reviews
477 out of 500,000 related reviews


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,user_id,review_stars,text,date
0,409qhf4RF0rlxcuaM576Qg,El Pollo Loco,5260 S Ft Apache,Las Vegas,NV,89148,36.094100,-115.296839,2.5,108,1,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Mexican, Fast Food","{'Monday': '0:0-0:0', 'Tuesday': '9:30-23:0', ...",GmXOSEbXy8JXmvo9hM5WWQ,2.0,Came here quickly today to grab a burrito with...,2015-09-09 23:09:37
1,409qhf4RF0rlxcuaM576Qg,El Pollo Loco,5260 S Ft Apache,Las Vegas,NV,89148,36.094100,-115.296839,2.5,108,1,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Mexican, Fast Food","{'Monday': '0:0-0:0', 'Tuesday': '9:30-23:0', ...",P8Rn03SOJg0NnExbOWhLZw,1.0,I really like el pollo loco but some things no...,2014-04-20 03:16:46
2,409qhf4RF0rlxcuaM576Qg,El Pollo Loco,5260 S Ft Apache,Las Vegas,NV,89148,36.094100,-115.296839,2.5,108,1,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Mexican, Fast Food","{'Monday': '0:0-0:0', 'Tuesday': '9:30-23:0', ...",qPVtjjp8sNQ32p9860SR9Q,3.0,El pollo loco is what it is this place is a co...,2017-01-15 00:43:57
3,409qhf4RF0rlxcuaM576Qg,El Pollo Loco,5260 S Ft Apache,Las Vegas,NV,89148,36.094100,-115.296839,2.5,108,1,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Mexican, Fast Food","{'Monday': '0:0-0:0', 'Tuesday': '9:30-23:0', ...",hk6B1ZdzI62jMGo6e2IuuA,1.0,Busy place with a lot of traffic.. the price i...,2018-05-19 03:45:05
4,409qhf4RF0rlxcuaM576Qg,El Pollo Loco,5260 S Ft Apache,Las Vegas,NV,89148,36.094100,-115.296839,2.5,108,1,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Mexican, Fast Food","{'Monday': '0:0-0:0', 'Tuesday': '9:30-23:0', ...",iNmUdP_AtwHIVMHAg6mMSA,1.0,My burrito was cold. My wife's chicken was dry...,2018-05-19 18:50:08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73273,1qTx9z8RMz7RIYKAfnpZ0A,Lillie's Asian Cuisine,129 E Fremont St,Las Vegas,NV,89101,36.170349,-115.144506,3.5,121,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Casinos, Restaurants, Chinese, Hotels, Arts & ...","{'Monday': '17:0-0:0', 'Tuesday': '17:0-0:0', ...",5xaFHey_NjidLtugo0mdAw,3.0,Sitting here for what seems like it has been f...,2019-10-13 04:25:49
73274,INzLc1Y0JNQzIHjPH2Nbeg,Fuzzy's Taco Shop,7080 S Rainbow Blvd,Las Vegas,NV,89118,36.060044,-115.241666,3.5,252,0,"{'RestaurantsTableService': 'False', 'Restaura...","Tex-Mex, Mexican, Sandwiches, Salad, Restaurants","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",acyl_5237y5MZMgnx_VeNA,5.0,I remember this chain from Texas and LOVE it!!...,2017-03-25 19:55:57
73275,INzLc1Y0JNQzIHjPH2Nbeg,Fuzzy's Taco Shop,7080 S Rainbow Blvd,Las Vegas,NV,89118,36.060044,-115.241666,3.5,252,0,"{'RestaurantsTableService': 'False', 'Restaura...","Tex-Mex, Mexican, Sandwiches, Salad, Restaurants","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",o-4WGmqWdNiiQFctHdjC8Q,4.0,Love Fuzzy's Taco Shop. It's become my new ad...,2017-01-11 19:53:55
73276,INzLc1Y0JNQzIHjPH2Nbeg,Fuzzy's Taco Shop,7080 S Rainbow Blvd,Las Vegas,NV,89118,36.060044,-115.241666,3.5,252,0,"{'RestaurantsTableService': 'False', 'Restaura...","Tex-Mex, Mexican, Sandwiches, Salad, Restaurants","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",HyDMhqf_ru13zau_OCyNBw,1.0,Found a part of a rubber glove in my tacos. W...,2017-08-15 19:23:56
