<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/baseline_business_attributes_revised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, classification_report, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
filename = "yelp_academic_dataset_business" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df_business = pd.concat(chunks, ignore_index=True)
df_business = df_business[df_business['state'] == 'FL']
df_business = df_business[df_business['categories'].str.contains('restaurants', case=False, na=False)]
df_business.shape

(8731, 14)

In [81]:
filename = "US_Cities" + ".txt"
directory = '/content/drive/My Drive/Capstone Data Collection/'
path = directory + filename

column_names = [
    "country_code", "zip_code", "city", "state", "state_abbreviation",
    "county", "county_code", "admin2", "admin3", "latitude", "longitude", "accuracy"
]

df_zip = pd.read_csv(path, sep="\t", header=None, names=column_names)
df_zip = df_zip[df_zip['state_abbreviation'] == 'FL']
df_zip = df_zip[['zip_code', 'city']]
df_zip.shape

(1473, 2)

In [82]:
df_business.isnull().sum()

Unnamed: 0,0
business_id,0
name,0
address,96
city,0
state,0
postal_code,4
latitude,0
longitude,0
stars,0
review_count,0


In [83]:
df_business.dropna(subset=['postal_code'], inplace=True)
df_business.shape

(8727, 14)

In [84]:
df_business['postal_code'] = df_business['postal_code'].astype(int)

In [85]:
df_business = pd.merge(df_business, df_zip, how='left', left_on='postal_code', right_on='zip_code', suffixes=('_original', '_updated'))
df_business.shape

(8727, 16)

In [86]:
filename = "FL_Reviews_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df_reviews = pd.concat(chunks, ignore_index=True)
df_reviews.shape

(792133, 24)

In [87]:
city_group = df_reviews.groupby('business_id').agg(
    city=('city_updated', 'first'),
    avg_star_reviews=('stars_reviews', 'mean')
).reset_index()

city_group = city_group.sort_values(
    by=['avg_star_reviews', 'city'],
    ascending=[False, False]
)
city_group.shape

(8731, 3)

In [88]:
city_group = city_group.dropna()
city_group_subset = city_group[['business_id', 'avg_star_reviews']]

merged_df = pd.merge(df_business, city_group_subset, how='left', on='business_id')
merged_df.shape

(8727, 17)

In [89]:
df = merged_df.drop(['business_id', 'name', 'address', 'city_original', 'latitude', 'longitude', 'zip_code','postal_code', 'hours', 'state', 'is_open'], axis=1)
df.head()

Unnamed: 0,stars,review_count,attributes,categories,city_updated,avg_star_reviews
0,4.0,10,"{'Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks",Tampa,4.090909
1,4.5,100,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants",Largo,4.386792
2,4.0,23,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Italian",Tampa,3.84
3,4.0,35,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Pizza",Tampa,4.162162
4,4.5,95,"{'BestNights': ""{'monday': False, 'tuesday': F...","Burgers, Sports Bars, Bars, Lounges, Restauran...",Wesley Chapel,4.505051


In [90]:
df.dropna(subset=['city_updated', 'avg_star_reviews'], inplace=True)
df.shape

(8723, 6)

In [91]:
# df['attributes'] = df['attributes'].fillna('N/A')

In [92]:
df.isnull().sum()

Unnamed: 0,0
stars,0
review_count,0
attributes,100
categories,0
city_updated,0
avg_star_reviews,0


In [93]:
import ast

df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

attributes_expanded = pd.json_normalize(df['attributes'])


nested_columns = ['BestNights', 'Ambience', 'Music', 'GoodForMeal', 'BusinessParking', 'DietaryRestrictions']


for column in nested_columns:
    if column in attributes_expanded.columns:
        attributes_expanded[column] = attributes_expanded[column].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

for column in nested_columns:
    if column in attributes_expanded.columns:
        nested_df = pd.json_normalize(attributes_expanded[column]).add_prefix(f"{column}_")
        attributes_expanded = attributes_expanded.drop(columns=[column]).join(nested_df)


columns_to_clean = ['Alcohol', 'WiFi', 'NoiseLevel', 'Smoking', 'RestaurantsAttire', 'BYOBCorkage', 'AgesAllowed']

for column in columns_to_clean:
    attributes_expanded[column] = attributes_expanded[column].apply(lambda x: x.strip("u'") if isinstance(x, str) else x)


attributes_expanded['Alcohol'] = attributes_expanded['Alcohol'].replace('none', 'None')
attributes_expanded['Alcohol'].value_counts()

Unnamed: 0_level_0,count
Alcohol,Unnamed: 1_level_1
,2772
full_bar,2022
beer_and_wine,1785


In [94]:
attributes_expanded_subset = attributes_expanded.dropna(thresh=len(attributes_expanded) * 0.5, axis=1)

attributes_expanded_subset.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
GoodForMeal_breakfast,4272
GoodForMeal_lunch,4141
GoodForMeal_dinner,4018
NoiseLevel,3027
Ambience_trendy,2855
Caters,2852
Ambience_hipster,2665
Ambience_divey,2650
BikeParking,2637
Ambience_intimate,2596


In [95]:
for i in attributes_expanded_subset:
  print(attributes_expanded_subset[i].value_counts())

Alcohol
None             2772
full_bar         2022
beer_and_wine    1785
Name: count, dtype: int64
OutdoorSeating
True     3881
False    3076
None      341
Name: count, dtype: int64
RestaurantsReservations
False    4765
True     2246
None       52
Name: count, dtype: int64
RestaurantsGoodForGroups
True     5828
False     900
Name: count, dtype: int64
WiFi
free    3885
no      2463
paid      48
None       9
Name: count, dtype: int64
RestaurantsPriceRange2
2       3663
1       3367
3        169
4         17
None       1
Name: count, dtype: int64
RestaurantsDelivery
True     4620
False    2790
None      524
Name: count, dtype: int64
RestaurantsAttire
casual    6140
dressy      92
formal       9
None         6
Name: count, dtype: int64
BusinessAcceptsCreditCards
True     7755
False     113
None        5
Name: count, dtype: int64
RestaurantsTakeOut
True     7541
False     308
None      247
Name: count, dtype: int64
Caters
True     3761
False    2107
None        3
Name: count, dtype: int64


In [96]:
attributes_dummies = pd.get_dummies(attributes_expanded_subset)

In [97]:
attributes_dummies.head()

Unnamed: 0,Alcohol_None,Alcohol_beer_and_wine,Alcohol_full_bar,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,RestaurantsReservations_False,RestaurantsReservations_None,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,WiFi_None,WiFi_free,WiFi_no,WiFi_paid,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,RestaurantsAttire_None,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,RestaurantsTakeOut_False,RestaurantsTakeOut_None,RestaurantsTakeOut_True,Caters_False,Caters_None,Caters_True,NoiseLevel_None,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,GoodForKids_False,GoodForKids_None,GoodForKids_True,BikeParking_False,BikeParking_None,BikeParking_True,HasTV_False,HasTV_None,HasTV_True,Ambience_touristy_False,Ambience_touristy_True,Ambience_hipster_False,Ambience_hipster_True,Ambience_romantic_False,Ambience_romantic_True,Ambience_divey_False,Ambience_divey_True,Ambience_intimate_False,Ambience_intimate_True,Ambience_trendy_False,Ambience_trendy_True,Ambience_upscale_False,Ambience_upscale_True,Ambience_classy_False,Ambience_classy_True,Ambience_casual_False,Ambience_casual_True,GoodForMeal_lunch_False,GoodForMeal_lunch_True,GoodForMeal_dinner_False,GoodForMeal_dinner_True,GoodForMeal_breakfast_False,GoodForMeal_breakfast_True,BusinessParking_garage_False,BusinessParking_garage_True,BusinessParking_street_False,BusinessParking_street_True,BusinessParking_validated_False,BusinessParking_validated_True,BusinessParking_lot_False,BusinessParking_lot_True,BusinessParking_valet_False,BusinessParking_valet_True
0,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False
1,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,False,True,True,False,True,False,True,False,True,False,True,False,False,True,True,False
2,False,False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True,True,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,False,False,False,False,False,False,True,False,True,False,True,False,False,True,True,False
3,False,True,False,True,False,False,True,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,False,True,False,True,True,False,True,False,True,False,True,False,False,True,True,False
4,False,False,True,True,False,False,False,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False


In [98]:
attributes_dummies.shape

(8723, 84)

In [101]:
df.reset_index(drop=True, inplace=True)
attributes_dummies.reset_index(drop=True, inplace=True)

df = pd.concat([df.drop(columns=['attributes']), attributes_dummies], axis=1)
df.shape

(8723, 89)

In [102]:
category_counts = df['categories'].str.split(', ').explode().value_counts()

threshold = len(category_counts) * 0.5
high_count_categories = category_counts[category_counts >= threshold].index

categories_expanded = df['categories'].str.get_dummies(sep=', ')
categories_expanded_subset = categories_expanded[high_count_categories]

categories_expanded_subset.shape

(8723, 36)

In [103]:
categories_expanded_subset.columns

Index(['Restaurants', 'Food', 'Nightlife', 'American (Traditional)', 'Bars',
       'Sandwiches', 'Breakfast & Brunch', 'Fast Food', 'Pizza',
       'American (New)', 'Burgers', 'Seafood', 'Italian', 'Mexican',
       'Coffee & Tea', 'Chicken Wings', 'Salad', 'Cafes',
       'Event Planning & Services', 'Chinese', 'Sports Bars', 'Sushi Bars',
       'Delis', 'Desserts', 'Specialty Food', 'Barbeque', 'Caterers',
       'Bakeries', 'Steakhouses', 'Japanese', 'Latin American', 'Food Trucks',
       'Asian Fusion', 'Diners', 'Juice Bars & Smoothies', 'Greek'],
      dtype='object')

In [104]:
df = pd.concat([df, categories_expanded_subset], axis=1).drop('categories', axis=1)
df.shape

(8723, 124)

In [105]:
df = pd.concat([df, pd.get_dummies(df['city_updated'])], axis=1).drop('city_updated', axis=1)
df.shape

(8723, 166)

In [106]:
pd.set_option('display.max_columns', None)

df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)
df.head()

  df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)


Unnamed: 0,stars,review_count,avg_star_reviews,Alcohol_None,Alcohol_beer_and_wine,Alcohol_full_bar,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,RestaurantsReservations_False,RestaurantsReservations_None,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,WiFi_None,WiFi_free,WiFi_no,WiFi_paid,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,RestaurantsAttire_None,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,RestaurantsTakeOut_False,RestaurantsTakeOut_None,RestaurantsTakeOut_True,Caters_False,Caters_None,Caters_True,NoiseLevel_None,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,GoodForKids_False,GoodForKids_None,GoodForKids_True,BikeParking_False,BikeParking_None,BikeParking_True,HasTV_False,HasTV_None,HasTV_True,Ambience_touristy_False,Ambience_touristy_True,Ambience_hipster_False,Ambience_hipster_True,Ambience_romantic_False,Ambience_romantic_True,Ambience_divey_False,Ambience_divey_True,Ambience_intimate_False,Ambience_intimate_True,Ambience_trendy_False,Ambience_trendy_True,Ambience_upscale_False,Ambience_upscale_True,Ambience_classy_False,Ambience_classy_True,Ambience_casual_False,Ambience_casual_True,GoodForMeal_lunch_False,GoodForMeal_lunch_True,GoodForMeal_dinner_False,GoodForMeal_dinner_True,GoodForMeal_breakfast_False,GoodForMeal_breakfast_True,BusinessParking_garage_False,BusinessParking_garage_True,BusinessParking_street_False,BusinessParking_street_True,BusinessParking_validated_False,BusinessParking_validated_True,BusinessParking_lot_False,BusinessParking_lot_True,BusinessParking_valet_False,BusinessParking_valet_True,Restaurants,Food,Nightlife,American (Traditional),Bars,Sandwiches,Breakfast & Brunch,Fast Food,Pizza,American (New),Burgers,Seafood,Italian,Mexican,Coffee & Tea,Chicken Wings,Salad,Cafes,Event Planning & Services,Chinese,Sports Bars,Sushi Bars,Delis,Desserts,Specialty Food,Barbeque,Caterers,Bakeries,Steakhouses,Japanese,Latin American,Food Trucks,Asian Fusion,Diners,Juice Bars & Smoothies,Greek,Apollo Beach,Balm,Brandon,Brooksville,Clearwater,Clearwater Beach,Dade City,Dover,Dunedin,Gibsonton,Holiday,Hudson,Indian Rocks Beach,Land O Lakes,Largo,Lithia,Lutz,New Port Richey,Odessa,Oldsmar,Ozona,Palm Harbor,Palmetto,Pinellas Park,Plant City,Port Richey,Riverview,Ruskin,Safety Harbor,Saint Leo,Saint Petersburg,San Antonio,Seffner,Seminole,Spring Hill,Sun City Center,Tampa,Tarpon Springs,Thonotosassa,Valrico,Wesley Chapel,Wimauma,Zephyrhills
0,4.0,10,4.090909,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,4.5,100,4.386792,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4.0,23,3.84,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4.0,35,4.162162,0,1,0,1,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0,1,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,4.5,95,4.505051,0,0,1,1,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [109]:
df2 = df.copy()

In [110]:
output_filename = "FL_Restaurants_Business Attributes_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

out_path = directory + output_filename

df.to_csv(out_path, index=False)

In [111]:
X = df.drop('stars', axis=1)
y = df['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [112]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[-0.1032865 , -1.66457181, -0.67923579, ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.51570892,  1.75134226,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.4094789 , -1.84470503,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       ...,
       [-0.11578415,  0.53426508, -0.67923579, ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.37823478, -0.07906987,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.50321127, -2.83380973,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098]])

Baseline Model - Regression

In [113]:
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)

baseline_preds = regression_model.predict(X_test_scaled)
valid_ratings = np.array([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
rounded_preds = np.round(baseline_preds * 2) / 2

adjusted_preds = np.array([min(valid_ratings, key=lambda x: abs(x - pred)) for pred in rounded_preds])

y_test_classes = (y_test * 2).astype(int)
adjusted_preds_classes = (adjusted_preds * 2).astype(int)

baseline_accuracy = accuracy_score(y_test_classes, adjusted_preds_classes)
print("Baseline Model - Linear Regression (Rounded to Valid Ratings)")
print(f"Accuracy: {baseline_accuracy:.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, adjusted_preds):.2f}")
print(f"R-squared: {r2_score(y_test, adjusted_preds):.2f}")


Baseline Model - Linear Regression (Rounded to Valid Ratings)
Accuracy: 0.93
Mean Squared Error: 0.02
R-squared: 0.97


In [114]:
X = df2.drop('stars', axis=1)
y = df2['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[-0.1032865 , -1.66457181, -0.67923579, ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.51570892,  1.75134226,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.4094789 , -1.84470503,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       ...,
       [-0.11578415,  0.53426508, -0.67923579, ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.37823478, -0.07906987,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098],
       [-0.50321127, -2.83380973,  1.4722428 , ..., -0.13885724,
        -0.02677782, -0.08146098]])

Baseline - Classifier

In [115]:
y_train_class = (y_train * 2).astype(int)
y_test_class = (y_test * 2).astype(int)

classifier_model = RandomForestClassifier()
classifier_model.fit(X_train_scaled, y_train_class)

class_preds = classifier_model.predict(X_test_scaled)
class_accuracy = accuracy_score(y_test_class, class_preds)
print("Classification Model - Random Forest")
print(f"Accuracy: {class_accuracy:.2f}")
print(classification_report(y_test_class, class_preds))

Classification Model - Random Forest
Accuracy: 0.83
              precision    recall  f1-score   support

           2       1.00      0.12      0.22         8
           3       0.81      0.55      0.65        53
           4       0.67      0.59      0.63        76
           5       0.72      0.57      0.64       178
           6       0.71      0.76      0.73       216
           7       0.84      0.93      0.88       385
           8       0.90      0.96      0.93       445
           9       0.86      0.92      0.89       321
          10       1.00      0.33      0.50        63

    accuracy                           0.83      1745
   macro avg       0.83      0.64      0.67      1745
weighted avg       0.83      0.83      0.82      1745

