In [1]:
# Importing dependencies
import os
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

In [2]:
# Relative path to the source business data: business.json inside the sourceData directory
businessJson = os.path.join('sourceData', 'business.json')

In [3]:
# Relative path to the CSV of Yelp food/restaurant categories: yelpCategories.csv inside the sourceData directory
yelpCategories = os.path.join('sourceData', 'yelpCategories.csv')

In [4]:
# Load the business data into a DataFrame; lines=True reads the file as one JSON object per line
business = pd.read_json(businessJson, lines=True)

In [5]:
# Keep only the rows whose state code is 'ON' (Ontario)
business_on = business[business['state'].eq('ON')]

In [6]:
# Discard every row that is missing at least one of the fields listed below,
# then renumber the remaining rows from 0
business_on_clean = (
    business_on
    .dropna(subset=[
        'name', 'address', 'postal_code', 'city', 'state',
        'latitude', 'longitude', 'attributes', 'categories', 'hours',
    ])
    .reset_index(drop=True)
)

In [7]:
# The restaurant-only filter below is currently disabled, so `restaurant` holds EVERY
# cleaned Ontario business (non-restaurant businesses included), not just restaurants.
# restaurant = business_on_clean[business_on_clean['categories'].str.contains('Restaurants')].reset_index(drop=True)
restaurant = business_on_clean

In [8]:
# Normalise city names: each key below is a regular expression matched against the
# 'city' column (regex=True) and each value is the canonical spelling that replaces it.
# Most patterns are anchored with ^...$ so they only fire on the whole value; 'Malton'
# is not anchored, so it matches wherever that text appears.
# NOTE(review): 'Regional Municipality of York' maps to 'North York' while
# 'York Regional Municipality' maps to 'York' — confirm both targets are intended.
restaurant = restaurant.replace({'city': {'^AGINCOURT$': 'Agincourt',
                                            '^Bradford West Gwillimbury$': 'Bradford',
                                            '^East Ajax$': 'Ajax',
                                            '^Caledon.{,8}$': 'Caledon',
                                            '^East Gwil{1,2}imbury$': 'East Gwillimbury',
                                            '(?i)^.*icoke$': 'Etobicoke',
                                            '^.{,9}Toro?nto.{,9}$': 'Toronto',
                                            'Malton': 'Mississauga',
                                            '^.{,5}Missis{1,2}a?ua?g.{1,2}$': 'Mississauga',
                                            '^Regional Municipality of York$': 'North York',
                                            '(?i)^North.{0,2}York$': 'North York',
                                            '^York Regional Municipality$': 'York',
                                            '^Willowdale$': 'North York',
                                            '^North of Brampton$': 'Brampton',
                                            '(?i)^Oak.?ridges$': 'Oak Ridges',
                                            '^oakville$': 'Oakville',
                                            '(?i)^Richmond?.?Hill?$': 'Richmond Hill',
                                            '^.{,8}Scar.?bo?rough$': 'Scarborough',
                                            '^.{,11}Stouffville$': 'Stouffville',
                                            '(?i)^Thornhil{,2}$': 'Thornhill',
                                            '^.*Vaugh.{,3}$': 'Vaughan',
                                            '^Wh.?i.?by$': 'Whitby'}}, regex=True)

In [9]:
# Keep just the fields used downstream and give them capitalised labels
# (note that 'state' is relabelled 'Province')
restaurant = (
    restaurant
    .loc[:, [
        'name', 'address', 'postal_code', 'city', 'state', 'latitude',
        'longitude', 'categories', 'stars', 'hours', 'attributes',
    ]]
    .rename(columns={
        'name': 'Name',
        'address': 'Address',
        'postal_code': 'Postal_code',
        'city': 'City',
        'state': 'Province',
        'latitude': 'Latitude',
        'longitude': 'Longitude',
        'categories': 'Categories',
        'stars': 'Stars',
        'hours': 'Hours',
        'attributes': 'Attributes',
    })
)

In [10]:
# Flatten the per-business 'Hours' dicts into a DataFrame with one column per dict key
# NOTE(review): json_normalize is imported from pandas.io.json in this notebook; recent
# pandas releases expose it as pd.json_normalize — confirm against the pandas version in use.
hours_raw = pd.DataFrame(json_normalize(data=restaurant['Hours']))

In [11]:
# Keep just the seven weekday columns, ordered Monday through Sunday
hours_raw = hours_raw.loc[:, [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]]

In [12]:
# Remember the weekday column labels and create a separate working frame for the
# opening/closing hours.
# .copy() matters: a plain `hours = hours_raw` only creates a second name for the same
# object, so any in-place change made through `hours` would also change hours_raw.
columns = hours_raw.columns
hours = hours_raw.copy()

In [13]:
# Split each weekday's "open-close" string into <Day>_open / <Day>_close columns.
# The result is assembled as a new frame from hours_raw instead of being edited in
# place, so hours_raw keeps its weekday columns and this cell can be re-run safely.
day_frames = []
for column in columns:
    open_close = hours_raw[column].str.split('-', expand=True)
    # Raises if the split did not yield exactly two parts (open, close)
    open_close.columns = [f"{column}_open", f"{column}_close"]
    day_frames.append(open_close)
# Trim stray whitespace around every opening/closing time
hours = pd.concat(day_frames, axis=1).apply(lambda x: x.str.strip())

In [14]:
# Attach the opening/closing columns to the business rows; join aligns on the row index.
# Open/close columns left over from an earlier run of this cell are dropped first, so
# re-running it does not fail with "columns overlap but no suffix specified".
restaurant = restaurant.drop(columns=hours.columns, errors='ignore').join(hours)
# restaurant.drop(columns='Hours', inplace=True)
restaurant.tail()

Unnamed: 0,Name,Address,Postal_code,City,Province,Latitude,Longitude,Categories,Stars,Hours,...,Wednesday_open,Wednesday_close,Thursday_open,Thursday_close,Friday_open,Friday_close,Saturday_open,Saturday_close,Sunday_open,Sunday_close
20817,KOKO! Share Bar,81 Yorkville Avenue,M5R 1C1,Toronto,ON,43.670948,-79.391502,"Korean, Japanese, Asian Fusion, Restaurants",3.5,"{'Monday': '11:30-22:30', 'Tuesday': '11:30-22...",...,11:30,22:30,11:30,23:0,11:30,23:0,11:30,23:0,11:30,22:30
20818,Indian Hero,8920 Highway 50,L6P 3A3,Brampton,ON,43.775089,-79.653807,"Restaurants, Indian",3.0,"{'Monday': '11:30-22:0', 'Wednesday': '11:30-2...",...,11:30,22:0,11:30,22:0,11:30,23:45,12:0,23:45,12:0,22:0
20819,Thai Fantasy,578 Yonge Street,M4Y 1Z3,Toronto,ON,43.66512,-79.384809,"Restaurants, Thai",4.0,"{'Monday': '0:0-0:0', 'Tuesday': '11:0-23:0', ...",...,11:0,23:0,11:0,23:0,11:0,23:0,12:0,23:0,12:0,22:0
20820,LTS Nails,540 Eglinton Avenue W,M5N,Toronto,ON,43.703476,-79.414548,"Nail Salons, Beauty & Spas",4.5,"{'Monday': '0:0-0:0', 'Tuesday': '10:0-19:0', ...",...,10:0,19:0,10:0,19:0,10:0,19:0,10:0,19:0,23:0,17:0
20821,Asia Hut,1450 Kingston Rd,L1V 1C1,Pickering,ON,43.841844,-79.083881,"Restaurants, Soup, Chinese, Caribbean",4.5,"{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3...",...,11:0,21:30,11:0,21:30,11:0,22:30,11:0,22:30,16:0,21:30


In [15]:
# Build a long (Restaurant_id, Category) table: split the comma-separated Categories
# string into one column per entry, tag each row with its index as Restaurant_id,
# unpivot to one row per entry, and order the rows by Restaurant_id
category = (
    restaurant['Categories']
    .str.split(',', expand=True)
    .assign(Restaurant_id=lambda frame: frame.index)
    .melt(id_vars='Restaurant_id', value_name='Category')
    .drop(columns='variable')
    .sort_values('Restaurant_id')
)

In [16]:
# Remove rows that have no category value, then renumber the rows from 0.
# dropna treats both None and NaN as missing, unlike the previous isin([None]) test.
category = category.dropna(subset=['Category']).reset_index(drop=True)
# Trim the whitespace left around each category name
category['Category'] = category['Category'].str.strip()

In [20]:
# Load the reference list of Yelp categories and trim whitespace around each name
categories = (
    pd.read_csv(yelpCategories)
    .assign(Category=lambda frame: frame['Category'].str.strip())
)

In [21]:
# Keep only the rows whose category appears in the Yelp reference list
categoryFiltered = category[category['Category'].isin(categories['Category'])]
categoryFiltered.tail()

Unnamed: 0,Restaurant_id,Category
87339,20818,Indian
87342,20819,Thai
87346,20821,Soup
87347,20821,Chinese
87348,20821,Caribbean


In [None]:
# categoryFiltered.shape

In [None]:
# category['Category'].isin(['Food'])

In [None]:
# categoryFiltered = []
# for row in category['Category']:
#     print(row)