<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/baseline_business_attributes_revised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, classification_report, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [66]:
filename = "FL_Restaurants_Business Attributes" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df_business = pd.concat(chunks, ignore_index=True)
df_business.shape

(8721, 16)

In [67]:
filename = "FL_Reviews_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df_reviews = pd.concat(chunks, ignore_index=True)
df_reviews.shape

(792133, 24)

In [68]:
city_group = df_reviews.groupby('business_id').agg(
    city=('city_updated', 'first'),
    avg_star_reviews=('stars_reviews', 'mean')
).reset_index()

city_group = city_group.sort_values(
    by=['avg_star_reviews', 'city'],
    ascending=[False, False]
)
city_group.shape

(8731, 3)

In [69]:
city_group = city_group.dropna()
city_group_subset = city_group[['business_id', 'avg_star_reviews']]

merged_df = pd.merge(df_business, city_group_subset, how='left', on='business_id')
merged_df.shape

(8721, 17)

In [70]:
merged_df = merged_df.dropna()
merged_df.shape

(7584, 17)

In [73]:
df = merged_df.drop(['business_id', 'name', 'address', 'city_original', 'latitude', 'longitude', 'zip_code','postal_code', 'hours', 'state', 'is_open'], axis=1)
df.head()

Unnamed: 0,stars,review_count,attributes,categories,city_updated,avg_star_reviews
0,4.0,10,"{'Alcohol': ""'none'"", 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks",Tampa,4.090909
1,4.5,100,"{'OutdoorSeating': 'False', 'RestaurantsGoodFo...","Food, Delis, Italian, Bakeries, Restaurants",Largo,4.386792
2,4.0,23,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Italian",Tampa,3.84
3,4.0,35,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, Pizza",Tampa,4.162162
4,4.5,95,"{'BestNights': ""{'monday': False, 'tuesday': F...","Burgers, Sports Bars, Bars, Lounges, Restauran...",Wesley Chapel,4.505051


In [74]:
df.isnull().sum()

Unnamed: 0,0
stars,0
review_count,0
attributes,0
categories,0
city_updated,0
avg_star_reviews,0


In [75]:
import ast

df['attributes'] = df['attributes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

attributes_expanded = pd.json_normalize(df['attributes'])


nested_columns = ['BestNights', 'Ambience', 'Music', 'GoodForMeal', 'BusinessParking', 'DietaryRestrictions']


for column in nested_columns:
    if column in attributes_expanded.columns:
        attributes_expanded[column] = attributes_expanded[column].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

for column in nested_columns:
    if column in attributes_expanded.columns:
        nested_df = pd.json_normalize(attributes_expanded[column]).add_prefix(f"{column}_")
        attributes_expanded = attributes_expanded.drop(columns=[column]).join(nested_df)


columns_to_clean = ['Alcohol', 'WiFi', 'NoiseLevel', 'Smoking', 'RestaurantsAttire', 'BYOBCorkage', 'AgesAllowed']

for column in columns_to_clean:
    attributes_expanded[column] = attributes_expanded[column].apply(lambda x: x.strip("u'") if isinstance(x, str) else x)


attributes_expanded['Alcohol'] = attributes_expanded['Alcohol'].replace('none', 'None')
attributes_expanded['Alcohol'].value_counts()

Unnamed: 0_level_0,count
Alcohol,Unnamed: 1_level_1
,2496
full_bar,1801
beer_and_wine,1619


In [76]:
attributes_expanded_subset = attributes_expanded.dropna(thresh=len(attributes_expanded) * 0.5, axis=1)

attributes_expanded_subset.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
GoodForMeal_latenight,3718
GoodForMeal_brunch,3664
GoodForMeal_breakfast,3441
GoodForMeal_lunch,3319
GoodForMeal_dinner,3198
NoiseLevel,2491
Ambience_trendy,2398
Ambience_hipster,2216
Ambience_divey,2200
Ambience_intimate,2148


In [77]:
for i in attributes_expanded_subset:
  print(attributes_expanded_subset[i].value_counts())

Alcohol
None             2496
full_bar         1801
beer_and_wine    1619
Name: count, dtype: int64
OutdoorSeating
True     3520
False    2674
None      325
Name: count, dtype: int64
RestaurantsReservations
False    4203
True     2057
None       50
Name: count, dtype: int64
RestaurantsGoodForGroups
True     5173
False     756
Name: count, dtype: int64
WiFi
free    3678
no      2142
paid      41
None       8
Name: count, dtype: int64
RestaurantsPriceRange2
2       3270
1       2930
3        144
4         10
None       1
Name: count, dtype: int64
RestaurantsDelivery
True     4319
False    2296
None      490
Name: count, dtype: int64
RestaurantsAttire
casual    5407
dressy      73
formal       6
None         5
Name: count, dtype: int64
BusinessAcceptsCreditCards
True     6906
False      75
None        5
Name: count, dtype: int64
RestaurantsTakeOut
True     6734
False     253
None      227
Name: count, dtype: int64
Caters
True     3557
False    1890
None        3
Name: count, dtype: int64


In [78]:
attributes_dummies = pd.get_dummies(attributes_expanded_subset)

In [79]:
attributes_dummies.head()

Unnamed: 0,Alcohol_None,Alcohol_beer_and_wine,Alcohol_full_bar,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,RestaurantsReservations_False,RestaurantsReservations_None,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,WiFi_None,WiFi_free,WiFi_no,WiFi_paid,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,RestaurantsAttire_None,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,RestaurantsTakeOut_False,RestaurantsTakeOut_None,RestaurantsTakeOut_True,Caters_False,Caters_None,Caters_True,NoiseLevel_None,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,GoodForKids_False,GoodForKids_None,GoodForKids_True,BikeParking_False,BikeParking_None,BikeParking_True,HasTV_False,HasTV_None,HasTV_True,Ambience_touristy_False,Ambience_touristy_True,Ambience_hipster_False,Ambience_hipster_True,Ambience_romantic_False,Ambience_romantic_True,Ambience_divey_False,Ambience_divey_True,Ambience_intimate_False,Ambience_intimate_True,Ambience_trendy_False,Ambience_trendy_True,Ambience_upscale_False,Ambience_upscale_True,Ambience_classy_False,Ambience_classy_True,Ambience_casual_False,Ambience_casual_True,GoodForMeal_latenight_False,GoodForMeal_latenight_True,GoodForMeal_lunch_False,GoodForMeal_lunch_True,GoodForMeal_dinner_False,GoodForMeal_dinner_True,GoodForMeal_brunch_False,GoodForMeal_brunch_True,GoodForMeal_breakfast_False,GoodForMeal_breakfast_True,BusinessParking_garage_False,BusinessParking_garage_True,BusinessParking_street_False,BusinessParking_street_True,BusinessParking_validated_False,BusinessParking_validated_True,BusinessParking_lot_False,BusinessParking_lot_True,BusinessParking_valet_False,BusinessParking_valet_True
0,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False
1,True,False,False,True,False,False,True,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,True,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,False,True,True,False
2,False,False,True,False,False,True,False,False,True,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,True,False,False,False,True,False,False,False,True,True,False,False,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,True,True,False
3,False,True,False,True,False,False,True,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,True,True,False,True,False,True,False,True,False,True,False,True,False,True,False,True,False,False,True,True,False,False,True,False,True,True,False,True,False,True,False,True,False,True,False,False,True,True,False
4,False,False,True,True,False,False,False,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,True,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False,False,True,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [80]:
df = pd.concat([df.drop(columns=['attributes']), attributes_dummies], axis=1)
df.shape

(8590, 93)

In [81]:
category_counts = df['categories'].str.split(', ').explode().value_counts()

threshold = len(category_counts) * 0.5
high_count_categories = category_counts[category_counts >= threshold].index

categories_expanded = df['categories'].str.get_dummies(sep=', ')
categories_expanded_subset = categories_expanded[high_count_categories]

categories_expanded_subset.shape

(8590, 34)

In [82]:
categories_expanded_subset.columns

Index(['Restaurants', 'Food', 'Nightlife', 'Bars', 'American (Traditional)',
       'Sandwiches', 'Breakfast & Brunch', 'Fast Food', 'Pizza',
       'American (New)', 'Burgers', 'Seafood', 'Italian', 'Mexican',
       'Coffee & Tea', 'Chicken Wings', 'Salad', 'Cafes',
       'Event Planning & Services', 'Sports Bars', 'Chinese', 'Delis',
       'Sushi Bars', 'Desserts', 'Specialty Food', 'Barbeque', 'Caterers',
       'Steakhouses', 'Japanese', 'Latin American', 'Bakeries',
       'Juice Bars & Smoothies', 'Asian Fusion', 'Diners'],
      dtype='object')

In [83]:
df = pd.concat([df, categories_expanded_subset], axis=1).drop('categories', axis=1)
df.shape

(8590, 126)

In [84]:
df = pd.concat([df, pd.get_dummies(df['city_updated'])], axis=1).drop('city_updated', axis=1)
df.shape

(8590, 168)

In [85]:
pd.set_option('display.max_columns', None)

df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)
df.head()

  df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)


Unnamed: 0,stars,review_count,avg_star_reviews,Alcohol_None,Alcohol_beer_and_wine,Alcohol_full_bar,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,RestaurantsReservations_False,RestaurantsReservations_None,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,WiFi_None,WiFi_free,WiFi_no,WiFi_paid,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsPriceRange2_3,RestaurantsPriceRange2_4,RestaurantsPriceRange2_None,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,RestaurantsAttire_None,RestaurantsAttire_casual,RestaurantsAttire_dressy,RestaurantsAttire_formal,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_None,BusinessAcceptsCreditCards_True,RestaurantsTakeOut_False,RestaurantsTakeOut_None,RestaurantsTakeOut_True,Caters_False,Caters_None,Caters_True,NoiseLevel_None,NoiseLevel_average,NoiseLevel_loud,NoiseLevel_quiet,NoiseLevel_very_loud,GoodForKids_False,GoodForKids_None,GoodForKids_True,BikeParking_False,BikeParking_None,BikeParking_True,HasTV_False,HasTV_None,HasTV_True,Ambience_touristy_False,Ambience_touristy_True,Ambience_hipster_False,Ambience_hipster_True,Ambience_romantic_False,Ambience_romantic_True,Ambience_divey_False,Ambience_divey_True,Ambience_intimate_False,Ambience_intimate_True,Ambience_trendy_False,Ambience_trendy_True,Ambience_upscale_False,Ambience_upscale_True,Ambience_classy_False,Ambience_classy_True,Ambience_casual_False,Ambience_casual_True,GoodForMeal_latenight_False,GoodForMeal_latenight_True,GoodForMeal_lunch_False,GoodForMeal_lunch_True,GoodForMeal_dinner_False,GoodForMeal_dinner_True,GoodForMeal_brunch_False,GoodForMeal_brunch_True,GoodForMeal_breakfast_False,GoodForMeal_breakfast_True,BusinessParking_garage_False,BusinessParking_garage_True,BusinessParking_street_False,BusinessParking_street_True,BusinessParking_validated_False,BusinessParking_validated_True,BusinessParking_lot_False,BusinessParking_lot_True,BusinessParking_valet_False,BusinessParking_valet_True,Restaurants,Food,Nightlife,Bars,American (Traditional),Sandwiches,Breakfast & Brunch,Fast Food,Pizza,American (New),Burgers,Seafood,Italian,Mexican,Coffee & Tea,Chicken Wings,Salad,Cafes,Event Planning & Services,Sports Bars,Chinese,Delis,Sushi Bars,Desserts,Specialty Food,Barbeque,Caterers,Steakhouses,Japanese,Latin American,Bakeries,Juice Bars & Smoothies,Asian Fusion,Diners,Apollo Beach,Balm,Brandon,Brooksville,Clearwater,Clearwater Beach,Dade City,Dover,Dunedin,Gibsonton,Holiday,Hudson,Indian Rocks Beach,Land O Lakes,Largo,Lithia,Lutz,New Port Richey,Odessa,Oldsmar,Ozona,Palm Harbor,Palmetto,Pinellas Park,Plant City,Port Richey,Riverview,Ruskin,Safety Harbor,Saint Leo,Saint Petersburg,San Antonio,Seffner,Seminole,Spring Hill,Sun City Center,Tampa,Tarpon Springs,Thonotosassa,Valrico,Wesley Chapel,Wimauma,Zephyrhills
0,4.0,10.0,4.090909,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,4.5,100.0,4.386792,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4.0,23.0,3.84,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4.0,35.0,4.162162,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,4.5,95.0,4.505051,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [88]:
df = df.dropna()

df.isnull().sum()

Unnamed: 0,0
stars,0
review_count,0
avg_star_reviews,0
Alcohol_None,0
Alcohol_beer_and_wine,0
...,...
Thonotosassa,0
Valrico,0
Wesley Chapel,0
Wimauma,0


In [89]:
df2 = df.copy()

In [90]:
output_filename = "FL_Restaurants_Business Attributes_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

out_path = directory + output_filename

df.to_csv(out_path, index=False)

In [91]:
X = df.drop('stars', axis=1)
y = df['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[-0.29933063,  0.96734671,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [ 0.7606965 , -0.06175629,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [ 8.29117821,  1.25773252,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       ...,
       [-0.40349514,  0.36852706,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [-0.50765966, -0.0822775 , -0.69895861, ...,  7.41476388,
        -0.03378687, -0.08064393],
       [ 1.21411851,  0.76826087,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393]])

Baseline Model

In [93]:
regression_model = LinearRegression()
regression_model.fit(X_train_scaled, y_train)

baseline_preds = regression_model.predict(X_test_scaled)
valid_ratings = np.array([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
rounded_preds = np.round(baseline_preds * 2) / 2

adjusted_preds = np.array([min(valid_ratings, key=lambda x: abs(x - pred)) for pred in rounded_preds])

y_test_classes = (y_test * 2).astype(int)
adjusted_preds_classes = (adjusted_preds * 2).astype(int)

baseline_accuracy = accuracy_score(y_test_classes, adjusted_preds_classes)
print("Baseline Model - Linear Regression (Rounded to Valid Ratings)")
print(f"Accuracy: {baseline_accuracy:.2f}")
print(f"Mean Squared Error: {mean_squared_error(y_test, adjusted_preds):.2f}")
print(f"R-squared: {r2_score(y_test, adjusted_preds):.2f}")


Baseline Model - Linear Regression (Rounded to Valid Ratings)
Accuracy: 0.94
Mean Squared Error: 0.01
R-squared: 0.98


In [94]:
X = df2.drop('stars', axis=1)
y = df2['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

X_train_scaled

array([[-0.29933063,  0.96734671,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [ 0.7606965 , -0.06175629,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [ 8.29117821,  1.25773252,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       ...,
       [-0.40349514,  0.36852706,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393],
       [-0.50765966, -0.0822775 , -0.69895861, ...,  7.41476388,
        -0.03378687, -0.08064393],
       [ 1.21411851,  0.76826087,  1.43069988, ..., -0.13486606,
        -0.03378687, -0.08064393]])

In [95]:
y_train_class = (y_train * 2).astype(int)
y_test_class = (y_test * 2).astype(int)

classifier_model = RandomForestClassifier()
classifier_model.fit(X_train_scaled, y_train_class)

class_preds = classifier_model.predict(X_test_scaled)
class_accuracy = accuracy_score(y_test_class, class_preds)
print("Classification Model - Random Forest")
print(f"Accuracy: {class_accuracy:.2f}")
print(classification_report(y_test_class, class_preds))

Classification Model - Random Forest
Accuracy: 0.86
              precision    recall  f1-score   support

           2       0.00      0.00      0.00         7
           3       0.88      0.50      0.64        42
           4       0.61      0.58      0.59        59
           5       0.68      0.58      0.62       113
           6       0.73      0.86      0.79       170
           7       0.93      0.95      0.94       294
           8       0.96      0.98      0.97       362
           9       0.88      0.98      0.93       232
          10       1.00      0.30      0.46        37

    accuracy                           0.86      1316
   macro avg       0.74      0.64      0.66      1316
weighted avg       0.86      0.86      0.85      1316



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
