<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/hyperparameter_tuning_business_attributes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Attach Google Drive to the Colab filesystem at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [145]:
# Build the full path of the business-attributes CSV on the mounted Drive.
directory = '/content/drive/My Drive/Capstone Data Collection/'
filename = "FL_Restaurants_Business Attributes" + ".csv"
path = f"{directory}{filename}"

# Read the file in 10,000-row pieces, then stitch the pieces back together
# into a single frame with a fresh 0..n-1 index.
chunk_iterator = pd.read_csv(path, chunksize=10000)
chunks = list(chunk_iterator)

df = pd.concat(chunks, ignore_index=True)
df.shape

(8721, 16)

In [146]:
# Identifier, address and geo-coordinate fields are removed before feature building.
unused_columns = [
    'business_id',
    'name',
    'address',
    'city_original',
    'latitude',
    'longitude',
    'zip_code',
    'hours',
    'state',
]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,postal_code,stars,review_count,is_open,attributes,categories,city_updated
0,33602,4.0,10,1,"{'Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks",Tampa
1,33771,4.5,100,0,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants",Largo
2,33618,4.0,23,0,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Italian",Tampa
3,33607,4.0,35,0,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Pizza",Tampa
4,33544,4.5,95,1,"{'BestNights': ""{'monday': False, 'tuesday': F...","Burgers, Sports Bars, Bars, Lounges, Restauran...",Wesley Chapel


In [147]:
import ast


def _parse_literal(value):
    """Evaluate a string that holds a Python literal; return non-strings untouched."""
    return ast.literal_eval(value) if isinstance(value, str) else value


def _unwrap_quoted(value):
    """Remove one layer of repr-style quoting: ``u'free'`` / ``'free'`` -> ``free``.

    Only a matching pair of surrounding quotes, optionally preceded by a single
    ``u`` prefix, is removed. ``str.strip("u'")`` is not used because its argument
    is a *set of characters*: it would also eat leading/trailing ``u`` characters
    that belong to the value itself (e.g. ``"u'unu'"`` would become ``"n"``).
    Strings without surrounding quotes (such as ``'None'``) and non-string values
    (such as NaN) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    has_u_prefix = value[:1] == "u" and value[1:2] in ("'", '"')
    body = value[1:] if has_u_prefix else value
    if len(body) >= 2 and body[0] == body[-1] and body[0] in ("'", '"'):
        return body[1:-1]
    return value


# The 'attributes' column stores each business's attributes as the string form of
# a dict; turn it back into real dicts and spread the keys into columns.
df['attributes'] = df['attributes'].apply(_parse_literal)

attributes_expanded = pd.json_normalize(df['attributes'])

# These attributes are themselves stringified dicts. Each one is parsed and
# replaced by one column per inner key, named "<attribute>_<key>".
nested_columns = ['BestNights', 'Ambience', 'Music', 'GoodForMeal', 'BusinessParking', 'DietaryRestrictions']

for column in nested_columns:
    if column not in attributes_expanded.columns:
        continue
    parsed = attributes_expanded[column].apply(_parse_literal)
    nested_df = pd.json_normalize(parsed).add_prefix(f"{column}_")
    attributes_expanded = attributes_expanded.drop(columns=[column]).join(nested_df)

# Categorical attributes whose values still carry quoting such as u'free' or 'none'.
columns_to_clean = ['Alcohol', 'WiFi', 'NoiseLevel', 'Smoking', 'RestaurantsAttire', 'BYOBCorkage', 'AgesAllowed']

for column in columns_to_clean:
    # Same presence guard as the nested columns, so a dataset lacking one of
    # these attributes does not fail with a KeyError.
    if column in attributes_expanded.columns:
        attributes_expanded[column] = attributes_expanded[column].apply(_unwrap_quoted)


In [148]:
# Lift the row cap so the complete per-column listing is rendered.
pd.set_option('display.max_rows', None)

# Missing-value count for every expanded attribute column, largest first.
missing_per_column = attributes_expanded.isna().sum()
missing_per_column.sort_values(ascending=False)

Unnamed: 0,0
Open24Hours,8720
RestaurantsCounterService,8720
DietaryRestrictions_vegetarian,8719
DietaryRestrictions_soy-free,8719
DietaryRestrictions_halal,8719
DietaryRestrictions_kosher,8719
DietaryRestrictions_vegan,8719
DietaryRestrictions_gluten-free,8719
DietaryRestrictions_dairy-free,8719
AcceptsInsurance,8718


In [149]:
# Keep only the attribute columns whose non-null count reaches 25% of the rows
# (``dropna`` with ``thresh`` retains a column when it has at least that many
# non-null values).
min_non_null = len(attributes_expanded) * 0.25

# .copy() makes the subset an independent frame, so assigning to its columns
# afterwards does not raise SettingWithCopyWarning.
attributes_expanded_subset = attributes_expanded.dropna(thresh=min_non_null, axis=1).copy()

attributes_expanded_subset.isnull().sum().sort_values(ascending=False)


Unnamed: 0,0
DogsAllowed,6505
HappyHour,6261
WheelchairAccessible,6148
RestaurantsTableService,4969
GoodForMeal_dessert,4765
GoodForMeal_latenight,4555
GoodForMeal_brunch,4496
GoodForMeal_breakfast,4270
GoodForMeal_lunch,4139
GoodForMeal_dinner,4016


In [150]:
# Frequency of each distinct non-null value in the Alcohol column.
alcohol_values = attributes_expanded_subset['Alcohol']
alcohol_values.value_counts()

Unnamed: 0_level_0,count
Alcohol,Unnamed: 1_level_1
none,2769
full_bar,2022
beer_and_wine,1785
,3


In [151]:
# Fold the lowercase 'none' label into 'None' so both spellings count as a single
# category. ``assign`` returns a new frame rather than writing a column into an
# object pandas may treat as a slice of another frame, which avoids the
# SettingWithCopyWarning raised by direct column assignment.
attributes_expanded_subset = attributes_expanded_subset.assign(
    Alcohol=attributes_expanded_subset['Alcohol'].replace('none', 'None')
)

attributes_expanded_subset['Alcohol'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attributes_expanded_subset['Alcohol'] = attributes_expanded_subset['Alcohol'].replace('none', 'None')


Unnamed: 0_level_0,count
Alcohol,Unnamed: 1_level_1
,2772
full_bar,2022
beer_and_wine,1785


In [152]:
# Print the value distribution of every retained attribute column.
for column_name in attributes_expanded_subset.columns:
    distribution = attributes_expanded_subset[column_name].value_counts()
    print(distribution)

Alcohol
None             2772
full_bar         2022
beer_and_wine    1785
Name: count, dtype: int64
OutdoorSeating
True     3881
False    3076
None      341
Name: count, dtype: int64
RestaurantsReservations
False    4765
True     2246
None       52
Name: count, dtype: int64
RestaurantsGoodForGroups
True     5828
False     900
Name: count, dtype: int64
WiFi
free    3885
no      2462
paid      48
None       9
Name: count, dtype: int64
RestaurantsPriceRange2
2       3663
1       3366
3        169
4         17
None       1
Name: count, dtype: int64
RestaurantsDelivery
True     4620
False    2790
None      524
Name: count, dtype: int64
RestaurantsAttire
casual    6140
dressy      92
formal       9
None         6
Name: count, dtype: int64
BusinessAcceptsCreditCards
True     7754
False     113
None        5
Name: count, dtype: int64
RestaurantsTakeOut
True     7541
False     308
None      247
Name: count, dtype: int64
Caters
True     3761
False    2107
None        3
Name: count, dtype: int64


In [153]:
# One-hot encode the retained attribute columns.
attributes_dummies = pd.get_dummies(attributes_expanded_subset)

# Swap the raw 'attributes' column for its encoded columns (placed side by side).
base_columns = df.drop(columns=['attributes'])
df = pd.concat([base_columns, attributes_dummies], axis=1)
df.shape

(8721, 107)

In [154]:
# Count how often each category label occurs ('categories' holds ', '-separated labels).
category_labels = df['categories'].str.split(', ').explode()
category_counts = category_labels.value_counts()

# Cut-off: a quarter of the number of distinct labels.
# NOTE(review): this compares occurrence counts against a figure derived from the
# number of distinct labels -- confirm this is the intended threshold.
threshold = len(category_counts) * 0.25
is_frequent = category_counts >= threshold
high_count_categories = category_counts.loc[is_frequent].index

# One indicator column per label, restricted to the labels that pass the cut-off.
categories_expanded = df['categories'].str.get_dummies(sep=', ')
categories_expanded_subset = categories_expanded.loc[:, high_count_categories]

categories_expanded_subset.head()

Unnamed: 0,Restaurants,Food,Nightlife,American (Traditional),Bars,Sandwiches,Breakfast & Brunch,Fast Food,Pizza,American (New),...,Chicken Shop,Southern,Food Delivery Services,Vegan,Hot Dogs,Gluten-Free,Beer Bar,Comfort Food,Grocery,Spanish
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [155]:
# Append the category indicator columns, then remove the raw 'categories' text column.
with_categories = pd.concat([df, categories_expanded_subset], axis=1)
df = with_categories.drop(columns='categories')
df.shape

(8721, 167)

In [156]:
# One indicator column per city, replacing the 'city_updated' text column.
city_dummies = pd.get_dummies(df['city_updated'])
df = pd.concat([df, city_dummies], axis=1).drop(columns='city_updated')
df.shape

(8721, 209)

In [157]:
# Turn boolean values into 0/1 integers; every other value passes through unchanged.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map, so
# use the new name when it exists and fall back to applymap on older pandas.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
df = _elementwise(lambda x: int(x) if isinstance(x, bool) else x)

  df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)


In [158]:
# Remove the column cap so every column is rendered from here on.
pd.options.display.max_columns = None

df.head(5)

Unnamed: 0,postal_code,stars,review_count,is_open,Alcohol_None,Alcohol_beer_and_wine,Alcohol_full_bar,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,RestaurantsReservations_False,RestaurantsReservations_None,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,WiFi_None,WiFi_free,WiFi_no,WiFi_paid,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,RestaurantsAttire_None,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,RestaurantsTakeOut_False,RestaurantsTakeOut_None,RestaurantsTakeOut_True,Caters_False,Caters_None,Caters_True,NoiseLevel_None,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,GoodForKids_False,GoodForKids_None,GoodForKids_True,BikeParking_False,BikeParking_None,BikeParking_True,RestaurantsTableService_False,RestaurantsTableService_None,RestaurantsTableService_True,HasTV_False,HasTV_None,HasTV_True,WheelchairAccessible_False,WheelchairAccessible_None,WheelchairAccessible_True,HappyHour_False,HappyHour_True,DogsAllowed_False,DogsAllowed_None,DogsAllowed_True,Ambience_touristy_False,Ambience_touristy_True,Ambience_hipster_False,Ambience_hipster_True,Ambience_romantic_False,Ambience_romantic_True,Ambience_divey_False,Ambience_divey_True,Ambience_intimate_False,Ambience_intimate_True,Ambience_trendy_False,Ambience_trendy_True,Ambience_upscale_False,Ambience_upscale_True,Ambience_classy_False,Ambience_classy_True,Ambience_casual_False,Ambience_casual_True,GoodForMeal_dessert_False,GoodForMeal_dessert_True,GoodForMeal_latenight_False,GoodForMeal_latenight_True,GoodForMeal_lunch_False,GoodForMeal_lunch_True,GoodForMeal_dinner_False,GoodForMeal_dinner_True,GoodForMeal_brunch_False,GoodForMeal_brunch_True,GoodForMeal_
breakfast_False,GoodForMeal_breakfast_True,BusinessParking_garage_False,BusinessParking_garage_True,BusinessParking_street_False,BusinessParking_street_True,BusinessParking_validated_False,BusinessParking_validated_True,BusinessParking_lot_False,BusinessParking_lot_True,BusinessParking_valet_False,BusinessParking_valet_True,Restaurants,Food,Nightlife,American (Traditional),Bars,Sandwiches,Breakfast & Brunch,Fast Food,Pizza,American (New),Burgers,Seafood,Italian,Mexican,Coffee & Tea,Chicken Wings,Salad,Cafes,Event Planning & Services,Chinese,Sports Bars,Sushi Bars,Delis,Desserts,Specialty Food,Barbeque,Caterers,Bakeries,Steakhouses,Japanese,Latin American,Food Trucks,Asian Fusion,Diners,Juice Bars & Smoothies,Greek,Cocktail Bars,Mediterranean,Pubs,Wine & Spirits,Beer,Cuban,Tacos,Thai,Caribbean,Arts & Entertainment,Soup,Tex-Mex,Ice Cream & Frozen Yogurt,Wine Bars,Vegetarian,Chicken Shop,Southern,Food Delivery Services,Vegan,Hot Dogs,Gluten-Free,Beer Bar,Comfort Food,Grocery,Spanish,Apollo Beach,Balm,Brandon,Brooksville,Clearwater,Clearwater Beach,Dade City,Dover,Dunedin,Gibsonton,Holiday,Hudson,Indian Rocks Beach,Land O Lakes,Largo,Lithia,Lutz,New Port Richey,Odessa,Oldsmar,Ozona,Palm Harbor,Palmetto,Pinellas Park,Plant City,Port Richey,Riverview,Ruskin,Safety Harbor,Saint Leo,Saint Petersburg,San Antonio,Seffner,Seminole,Spring Hill,Sun City Center,Tampa,Tarpon Springs,Thonotosassa,Valrico,Wesley Chapel,Wimauma,Zephyrhills
0,33602,4.0,10,1,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,33771,4.5,100,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,33618,4.0,23,0,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,33607,4.0,35,0,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,33544,4.5,95,1,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [159]:
# Encode the half-star ratings as consecutive integer class ids (1 -> 0, ..., 5 -> 8).
rating_mapping = {1: 0, 1.5: 1, 2: 2, 2.5: 3, 3: 4, 3.5: 5, 4: 6, 4.5: 7, 5: 8}
df['rating_class'] = df['stars'].map(rating_mapping)

# Series.map leaves NaN for any value missing from the mapping; fail here with a
# clear message instead of passing NaN labels on to the classifier.
unmapped_stars = df.loc[df['rating_class'].isna(), 'stars']
if not unmapped_stars.empty:
    raise ValueError(
        f"'stars' contains values with no rating class: {unmapped_stars.unique().tolist()}"
    )

# Features are every column except the raw rating and its encoded class.
X = df.drop(columns=['stars', 'rating_class'])
y = df['rating_class']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the test
# rows, so no test-set information enters the transformation.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [160]:
# Display the scaled training feature matrix as the cell output.
X_train_scaled

array([[-0.48371637, -0.46432207, -1.44881575, ..., -0.13671719,
        -0.02933994, -0.08147274],
       [-0.57046369, -0.52852072,  0.69021889, ..., -0.13671719,
        -0.02933994, -0.08147274],
       [-0.49160249, -0.29740558,  0.69021889, ..., -0.13671719,
        -0.02933994, -0.08147274],
       ...,
       [-0.29182076, -0.11764936, -1.44881575, ..., -0.13671719,
        -0.02933994, -0.08147274],
       [-0.12621222, -0.38728369,  0.69021889, ..., -0.13671719,
        -0.02933994, -0.08147274],
       [-0.12095481,  1.21126268,  0.69021889, ..., -0.13671719,
        -0.02933994, -0.08147274]])

In [161]:
# Baseline multi-class XGBoost configuration (9 classes).
xgb_params = dict(
    objective='multi:softmax',
    num_class=9,
    max_depth=10,
    learning_rate=0.1,
    n_estimators=500,
    eval_metric="mlogloss",
)
xgb_model = xgb.XGBClassifier(**xgb_params)

# Train on the scaled training features.
xgb_model.fit(X_train_scaled, y_train)

In [162]:
# Predict classes for the held-out rows.
y_pred = xgb_model.predict(X_test_scaled)

# Overall accuracy followed by the per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Accuracy: 0.34
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.45      0.29      0.36        51
           2       0.31      0.22      0.26        83
           3       0.33      0.25      0.29       176
           4       0.22      0.21      0.21       215
           5       0.34      0.33      0.34       395
           6       0.35      0.49      0.41       438
           7       0.41      0.39      0.40       314
           8       0.20      0.11      0.14        64

    accuracy                           0.34      1745
   macro avg       0.29      0.25      0.27      1745
weighted avg       0.34      0.34      0.33      1745

