# Data Analysis: Businesses
## Analysis of the businesses dataset
Two fields are problematic, 'attributes' and 'categories':
* 'attributes' contains unstructured data about a venue.
* 'categories' shows that not all businesses are restaurants, so the businesses need to be filtered.

In [2]:
import ast
import json
import os
from pathlib import Path

import pandas as pd

from src.data.data_reader import DataReader

# Walk up the directory tree until the working directory is the 'src' root, so that the
# relative data paths used below resolve the same way wherever the notebook was started.
# The directory name must be exactly 'src' (a name that merely ends in "src" does not count).
while Path.cwd().name != 'src':
    if Path.cwd() == Path.cwd().parent:
        # At the filesystem root os.chdir('..') is a no-op, so without this guard the loop never ends
        raise FileNotFoundError("No 'src' directory found among the parents of the working directory")
    os.chdir('..')

In [3]:
# Build the businesses dataframe as presented in the data set.

# Fields of each raw business entry that are kept for this analysis
filtered_business_fields = [
    'business_id', 'name', 'city', 'stars', 'review_count',
    'attributes',  # Filtered in _parse_categories() (per the original note)
    'categories',  # Filtered in _parse_categories() (per the original note)
]

# The businesses file is the first of the files the DataReader expects
businesses_file = Path('..', 'data', DataReader.EXPECTED_FILES[0])
entries = DataReader._get_entries_from_file(businesses_file)

# Reduce every entry to the selected fields, then load the records into a dataframe
filtered_entries = DataReader._filter_entries(entries, filtered_business_fields)
businesses = pd.DataFrame.from_records(filtered_entries)
businesses

Unnamed: 0,business_id,name,city,stars,review_count,attributes,categories
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop..."
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton,3.0,15,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,3.5,22,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane,4.5,13,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food"
...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,Edmonton,3.0,13,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas"
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,Nashville,4.0,5,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,Indianapolis,3.5,8,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut..."
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,Edwardsville,4.0,24,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician..."


## Data Selection
Let's find all distinct values for categories and create a whitelist of every category directly related to restaurants.

In [4]:
# Collect every distinct value that occurs in the 'categories' column
all_categories = set()
for categories_string in businesses['categories']:
    if categories_string:  # Skip businesses without a category string
        # A category string lists the individual categories separated by ", "
        all_categories.update(categories_string.split(", "))
all_categories

{'Kitchen & Bath',
 'Archery',
 'Pool Cleaners',
 'Trinidadian',
 'Elementary Schools',
 'Scooter Tours',
 'Barbers',
 'Sports Clubs',
 'Family Practice',
 'Childproofing',
 'Kids Activities',
 'Climbing',
 'Photography Stores & Services',
 'Cheese Shops',
 'Brasseries',
 'Police Departments',
 'Bike Shop',
 'Lawn Services',
 'Installment Loans',
 'Irish Pub',
 'Elder Law',
 'Stucco Services',
 'Internet Cafes',
 'RV Parks',
 'IV Hydration',
 'Music Venues',
 'Vinyl Records',
 'Osteopathic Physicians',
 'Auction Houses',
 'Filipino',
 'Chicken Shop',
 'Spin Classes',
 'Home Energy Auditors',
 'Skiing',
 'Home Developers',
 'House Sitters',
 'Lawyers',
 'Home Automation',
 'Thai',
 'Sandwiches',
 'Austrian',
 'Endocrinologists',
 'Hungarian',
 'Divorce & Family Law',
 'Olive Oil',
 'Pop-Up Restaurants',
 'Adult Education',
 'Departments of Motor Vehicles',
 'Ice Cream & Frozen Yogurt',
 'Hot Pot',
 'Religious Items',
 'Traditional Chinese Medicine',
 'Car Rental',
 'Pool & Hot Tub Servi

In [5]:
# Whitelist attempt 1: all (manually curated) restaurant-like tags
categories_whitelist_1 = {
    "Food Court", "Steakhouses", "Brasseries", "Gastropubs",
    "Tapas Bars", "Diners", "Buffets", "Food Trucks",
    "Restaurants", "Fast Food", "Food Stands", "Dinner Theater",
}

# For every business: the set of its individual categories when at least one of them
# is whitelisted, otherwise None (also when no category string is available at all).
whitelist_1_matches = []
for category_group in businesses['categories']:
    category_set = set(category_group.split(", ")) if category_group else set()
    whitelist_1_matches.append(category_set if category_set & categories_whitelist_1 else None)
businesses['categories_whitelist_1'] = whitelist_1_matches

# Keep only the businesses that matched the whitelist
businesses_whitelist_1_categories = businesses[businesses['categories_whitelist_1'].notna()]

In [6]:
# Whitelist attempt 2: only Food Trucks & Restaurants.
# Data exploration shows that all restaurant-like businesses have at least
# one of these two categories.
categories_whitelist_2 = {"Food Trucks", "Restaurants"}


def _categories_if_whitelisted(category_group):
    """Return the categories of one business as a set, or None when it should be dropped.

    None is returned when no category string is available, or when none of the
    listed categories is in categories_whitelist_2.
    """
    if not category_group:
        return None
    category_set = set(category_group.split(", "))  # Individual categories are separated by ", "
    return None if category_set.isdisjoint(categories_whitelist_2) else category_set


businesses['categories_whitelist_2'] = [
    _categories_if_whitelisted(category_group) for category_group in businesses['categories']
]

# Keep only the businesses that matched the whitelist
businesses_whitelist_2_categories = businesses[businesses['categories_whitelist_2'].notna()]

In [7]:
# The message below states that both whitelists select the same rows. Verify that first:
# comparing only the lengths would not catch two different selections of equal size.
assert businesses_whitelist_1_categories.index.equals(businesses_whitelist_2_categories.index), \
    "categories_whitelist_1 and categories_whitelist_2 select different businesses"
print(f"The length of the manually curated list of categories is equal to only checking for 'restaurant' and 'Food Trucks': {len(businesses_whitelist_1_categories)} == {len(businesses_whitelist_2_categories)}.\nThe same rows were selected by each query.")
businesses

The length of the manually curated list of categories is equal to only checking for 'restaurant' and 'Food Trucks': 52533 == 52533.
The same rows were selected by each query.


Unnamed: 0,business_id,name,city,stars,review_count,attributes,categories,categories_whitelist_1,categories_whitelist_2
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,7,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton,3.0,15,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...",,
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,3.5,22,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...",,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{Food, Restaurants, Bakeries, Bubble Tea, Coff...","{Food, Restaurants, Bakeries, Bubble Tea, Coff..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane,4.5,13,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",,
...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,Edmonton,3.0,13,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas",,
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,Nashville,4.0,5,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...",,
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,Indianapolis,3.5,8,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",,
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,Edwardsville,4.0,24,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...",,


We conclude that categories_whitelist_2 will be used for filtering.
The second problem is the 'attributes' field. It contains a JSON object with unstructured subfields. We will now discover which of these subfields are usable for a recommender system.
First, we discover all distinct subfields in the 'attributes' field.

In [8]:
# Count how often each attribute key occurs over all businesses
from collections import Counter
all_keys = Counter(
    attribute_key
    for business_attributes in businesses['attributes']
    if business_attributes is not None  # Skip businesses without an attributes dict
    for attribute_key in business_attributes
)
print(f'There are {len(businesses)} businesses')
all_keys.most_common()  # Most frequently occurring keys first

There are 150346 businesses


[('BusinessAcceptsCreditCards', 119765),
 ('BusinessParking', 91085),
 ('RestaurantsPriceRange2', 85314),
 ('BikeParking', 72638),
 ('RestaurantsTakeOut', 59857),
 ('WiFi', 56914),
 ('RestaurantsDelivery', 56282),
 ('GoodForKids', 53375),
 ('OutdoorSeating', 48802),
 ('RestaurantsReservations', 45247),
 ('HasTV', 45084),
 ('Ambience', 44279),
 ('RestaurantsGoodForGroups', 44170),
 ('Alcohol', 43189),
 ('ByAppointmentOnly', 42339),
 ('Caters', 40127),
 ('RestaurantsAttire', 39255),
 ('NoiseLevel', 37993),
 ('GoodForMeal', 29087),
 ('WheelchairAccessible', 28953),
 ('RestaurantsTableService', 19982),
 ('DogsAllowed', 18284),
 ('BusinessAcceptsBitcoin', 17430),
 ('HappyHour', 15171),
 ('DriveThru', 7760),
 ('Music', 7521),
 ('AcceptsInsurance', 5713),
 ('BestNights', 5694),
 ('CoatCheck', 5584),
 ('GoodForDancing', 4628),
 ('Smoking', 4567),
 ('BYOB', 4451),
 ('Corkage', 3553),
 ('BYOBCorkage', 1444),
 ('HairSpecializesIn', 1065),
 ('AgesAllowed', 129),
 ('Open24Hours', 39),
 ('DietaryRes

We now manually select the fields that might be of interest for a recommender system:

In [9]:
# Attributes that hold a single value and are copied over unchanged (as the raw string)
filtered_attributes_single = {
    'RestaurantsTakeOut',
    'RestaurantsDelivery',
    'RestaurantsPriceRange2',
    'GoodForKids',
    'RestaurantsGoodForGroups',
    'RestaurantsAttire',
    'NoiseLevel'
}
# Attributes whose value is itself a stringified dict of sub-attributes
filtered_attributes_multi = {
    'Ambience',
    'GoodForMeal'
}


def _parse_nested_attribute(raw_value):
    """Turn a stringified dict such as "{'romantic': False, 'casual': True}" into a dict.

    Returns the parsed dict, or None when raw_value does not hold a dict
    (for example the string 'None') or cannot be parsed.
    """
    if isinstance(raw_value, dict):
        return raw_value
    if not isinstance(raw_value, str) or not raw_value.startswith('{'):
        return None
    try:
        # The value is a Python literal (True/False/None, single quotes), not valid JSON,
        # so it is parsed with ast.literal_eval instead of json.loads.
        parsed_value = ast.literal_eval(raw_value)
    except (ValueError, SyntaxError):
        return None
    return parsed_value if isinstance(parsed_value, dict) else None


businesses_attributes_filtered = []
for business_attributes in businesses['attributes']:
    parsed_business_attributes = {}
    if business_attributes is not None:
        for attribute_key, attribute_value in business_attributes.items():
            if attribute_key in filtered_attributes_multi:
                # The attribute is a dict again: add all of its values, flattened to
                # '<attribute>_<sub_key>' so sub-keys of different attributes cannot collide.
                # Nested values are kept as parsed (True/False/None).
                nested_attributes = _parse_nested_attribute(attribute_value)
                if nested_attributes:
                    for sub_key, sub_value in nested_attributes.items():
                        parsed_business_attributes[f'{attribute_key}_{sub_key}'] = sub_value
            elif attribute_key in filtered_attributes_single:
                parsed_business_attributes[attribute_key] = attribute_value
    businesses_attributes_filtered.append(parsed_business_attributes)

businesses['attributes_filtered'] = businesses_attributes_filtered
businesses[['attributes', 'attributes_filtered']]

Unnamed: 0,attributes,attributes_filtered
0,{'ByAppointmentOnly': 'True'},{}
1,{'BusinessAcceptsCreditCards': 'True'},{}
2,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","{'RestaurantsPriceRange2': '2', 'RestaurantsTa..."
3,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","{'RestaurantsDelivery': 'False', 'RestaurantsP..."
4,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","{'RestaurantsTakeOut': 'True', 'GoodForKids': ..."
...,...,...
150341,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...",{'RestaurantsPriceRange2': '3'}
150342,"{'BusinessAcceptsCreditCards': 'True', 'Restau...",{'RestaurantsPriceRange2': '2'}
150343,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...",{'RestaurantsPriceRange2': '1'}
150344,"{'BusinessParking': '{'garage': False, 'street...","{'RestaurantsPriceRange2': '4', 'RestaurantsTa..."


Now we have selected all data to use for businesses. This data now needs to be reformatted to allow for easy input into a neural network.

## Data Transformation
Let's first see what we have at this moment...

In [24]:
# Remove the businesses without whitelisted categories: these are not considered
# 'restaurant-like' and thus fall out-of-scope.
# .copy() makes the result an independent frame, so that adding columns to it later on
# does not trigger pandas' SettingWithCopyWarning.
businesses = businesses[businesses['categories_whitelist_2'].notnull()].copy()
businesses

Unnamed: 0,business_id,name,city,stars,review_count,attributes,categories,categories_whitelist_1,categories_whitelist_2,attributes_filtered
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,4.0,80,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{Food, Restaurants, Bakeries, Bubble Tea, Coff...","{Food, Restaurants, Bakeries, Bubble Tea, Coff...","{'RestaurantsDelivery': 'False', 'RestaurantsP..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,Ashland City,2.0,6,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{Burgers, Fast Food, Food, Restaurants, Sandwi...","{Burgers, Fast Food, Food, Restaurants, Sandwi...","{'RestaurantsAttire': 'u'casual'', 'Restaurant..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,Affton,3.0,19,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...","{American (Traditional), Bars, Restaurants, Pu...","{American (Traditional), Bars, Restaurants, Pu...","{'RestaurantsAttire': 'u'casual'', 'Restaurant..."
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,Nashville,1.5,10,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{Burgers, Fast Food, Food, Restaurants, Ice Cr...","{Burgers, Fast Food, Food, Restaurants, Ice Cr...","{'RestaurantsAttire': ''casual'', 'Restaurants..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,Tampa Bay,4.0,10,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{Vietnamese, Food Trucks, Restaurants, Food}","{Vietnamese, Food Trucks, Restaurants, Food}",{}
...,...,...,...,...,...,...,...,...,...,...
150325,l9eLGG9ZKpLJzboZq-9LRQ,Wawa,Clifton Heights,3.0,11,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Restaurants, Sandwiches, Convenience Stores, C...","{Convenience Stores, Food, Restaurants, Coffee...","{Convenience Stores, Food, Restaurants, Coffee...","{'RestaurantsPriceRange2': '1', 'RestaurantsTa..."
150327,cM6V90ExQD6KMSU3rRB5ZA,Dutch Bros Coffee,Boise,4.0,33,"{'WiFi': ''free'', 'RestaurantsGoodForGroups':...","Cafes, Juice Bars & Smoothies, Coffee & Tea, R...","{Cafes, Food, Restaurants, Coffee & Tea, Juice...","{Cafes, Food, Restaurants, Coffee & Tea, Juice...","{'RestaurantsGoodForGroups': 'True', 'Restaura..."
150336,WnT9NIzQgLlILjPT0kEcsQ,Adelita Taqueria & Restaurant,Philadelphia,4.5,35,"{'WheelchairAccessible': 'False', 'Restaurants...","Restaurants, Mexican","{Mexican, Restaurants}","{Mexican, Restaurants}","{'GoodForKids': 'True', 'RestaurantsAttire': '..."
150339,2O2K6SXPWv56amqxCECd4w,The Plum Pit,Aston,4.5,14,"{'RestaurantsDelivery': 'False', 'BusinessAcce...","Restaurants, Comfort Food, Food, Food Trucks, ...","{Food Trucks, Comfort Food, Caterers, Food, Ev...","{Food Trucks, Comfort Food, Caterers, Food, Ev...","{'RestaurantsDelivery': 'False', 'RestaurantsP..."


In [33]:
# Count in how many of the remaining businesses each category appears
categories_appearances = Counter()
for business_categories in businesses['categories_whitelist_2']:
    categories_appearances.update(business_categories)  # One count per category in this business's set
categories_appearances.most_common()  # Most frequent categories first

[('Restaurants', 52268),
 ('Food', 15737),
 ('Nightlife', 8730),
 ('Sandwiches', 8366),
 ('Bars', 8342),
 ('American (Traditional)', 8139),
 ('Pizza', 7093),
 ('Fast Food', 6472),
 ('Breakfast & Brunch', 6239),
 ('American (New)', 6097),
 ('Burgers', 5636),
 ('Mexican', 4600),
 ('Italian', 4573),
 ('Coffee & Tea', 4075),
 ('Seafood', 3539),
 ('Chinese', 3169),
 ('Event Planning & Services', 3110),
 ('Salad', 3064),
 ('Chicken Wings', 2966),
 ('Cafes', 2756),
 ('Delis', 2393),
 ('Caterers', 2099),
 ('Specialty Food', 2030),
 ('Bakeries', 1906),
 ('Desserts', 1874),
 ('Japanese', 1830),
 ('Sports Bars', 1797),
 ('Sushi Bars', 1717),
 ('Barbeque', 1694),
 ('Asian Fusion', 1547),
 ('Steakhouses', 1506),
 ('Diners', 1494),
 ('Cocktail Bars', 1405),
 ('Pubs', 1397),
 ('Food Trucks', 1273),
 ('Mediterranean', 1263),
 ('Wine & Spirits', 1158),
 ('Beer', 1158),
 ('Vegetarian', 1158),
 ('Ice Cream & Frozen Yogurt', 1113),
 ('Arts & Entertainment', 1094),
 ('Soup', 1061),
 ('Juice Bars & Smoothie

In [36]:
# Minimum number of appearances a category needs in order to be kept
MIN_CATEGORY_APPEARANCES = 500

# Per business, keep only the categories that appear often enough
most_common_categories = [
    {
        category
        for category in business_categories
        if categories_appearances[category] >= MIN_CATEGORY_APPEARANCES
    }
    for business_categories in businesses['categories_whitelist_2']
]
# assign() returns a new frame instead of writing into the filtered one,
# which avoids pandas' SettingWithCopyWarning.
businesses = businesses.assign(categories_whitelist_2_most_common=most_common_categories)

# Recount the appearances over the reduced category sets
all_remaining_categories = (
    category
    for business_categories in businesses['categories_whitelist_2_most_common']
    for category in business_categories
)
categories_appearances = Counter(all_remaining_categories)
print("The following categories will be one-hot encoded:")
categories_appearances.most_common()

The following categories will be one-hot encoded:


[('Restaurants', 52268),
 ('Food', 15737),
 ('Nightlife', 8730),
 ('Sandwiches', 8366),
 ('Bars', 8342),
 ('American (Traditional)', 8139),
 ('Pizza', 7093),
 ('Fast Food', 6472),
 ('Breakfast & Brunch', 6239),
 ('American (New)', 6097),
 ('Burgers', 5636),
 ('Mexican', 4600),
 ('Italian', 4573),
 ('Coffee & Tea', 4075),
 ('Seafood', 3539),
 ('Chinese', 3169),
 ('Event Planning & Services', 3110),
 ('Salad', 3064),
 ('Chicken Wings', 2966),
 ('Cafes', 2756),
 ('Delis', 2393),
 ('Caterers', 2099),
 ('Specialty Food', 2030),
 ('Bakeries', 1906),
 ('Desserts', 1874),
 ('Japanese', 1830),
 ('Sports Bars', 1797),
 ('Sushi Bars', 1717),
 ('Barbeque', 1694),
 ('Asian Fusion', 1547),
 ('Steakhouses', 1506),
 ('Diners', 1494),
 ('Cocktail Bars', 1405),
 ('Pubs', 1397),
 ('Food Trucks', 1273),
 ('Mediterranean', 1263),
 ('Wine & Spirits', 1158),
 ('Beer', 1158),
 ('Vegetarian', 1158),
 ('Ice Cream & Frozen Yogurt', 1113),
 ('Arts & Entertainment', 1094),
 ('Soup', 1061),
 ('Juice Bars & Smoothie