<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD_MLE_Projects/blob/main/hyperparameter_tuning_business_attributes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [3]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
filename = "FL_Restaurants_Business Attributes_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df.shape

(8723, 166)

In [7]:
df2 = df.copy()

XGBoost Regressor Hyperparameter Tuning

In [10]:
X = df.drop(columns=['stars'])
y = df['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def objective(trial):
    params = {
      "n_estimators": trial.suggest_int("n_estimators", 50, 500),
      "max_depth": trial.suggest_int("max_depth", 3, 15),
      "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
      "subsample": trial.suggest_float("subsample", 0.6, 1.0),
      "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
      "random_state": 42,
      "objective": "reg:squarederror"
  }

    model = XGBRegressor(**params)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    y_pred_rounded = np.round(y_pred * 2) / 2

    mse = mean_squared_error(y_test, y_pred_rounded)
    return mse


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best Parameters:", study.best_params)
print("Best MSE:", study.best_value)


[I 2024-11-19 04:12:19,943] A new study created in memory with name: no-name-c163d3dd-23d9-40f9-87c6-e15521640425
[I 2024-11-19 04:12:20,690] Trial 0 finished with value: 0.01332378223495702 and parameters: {'n_estimators': 296, 'max_depth': 3, 'learning_rate': 0.02268695840736255, 'subsample': 0.7440569186355832, 'colsample_bytree': 0.6374303685704245}. Best is trial 0 with value: 0.01332378223495702.
[I 2024-11-19 04:12:24,195] Trial 1 finished with value: 0.013610315186246419 and parameters: {'n_estimators': 335, 'max_depth': 11, 'learning_rate': 0.03663940014520137, 'subsample': 0.8792377861498855, 'colsample_bytree': 0.8714437638671195}. Best is trial 0 with value: 0.01332378223495702.
[I 2024-11-19 04:12:28,335] Trial 2 finished with value: 0.017765042979942695 and parameters: {'n_estimators': 378, 'max_depth': 15, 'learning_rate': 0.12232236431529196, 'subsample': 0.830944498995797, 'colsample_bytree': 0.6760802378139833}. Best is trial 0 with value: 0.01332378223495702.
[I 2024

Best Parameters: {'n_estimators': 95, 'max_depth': 4, 'learning_rate': 0.2110467653562664, 'subsample': 0.8482197585993168, 'colsample_bytree': 0.7215067461439995}
Best MSE: 0.012034383954154728


In [11]:

best_params = study.best_params

print("Best Parameters from Optuna:", best_params)


X = df.drop(columns=['stars'])
y = df['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


model = XGBRegressor(
    objective='reg:squarederror',
    **best_params
)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

y_pred_rounded = np.round(y_pred * 2) / 2

mse = mean_squared_error(y_test, y_pred_rounded)
r2 = r2_score(y_test, y_pred_rounded)

print("Original Predictions:", y_pred)
print("Rounded Predictions:", y_pred_rounded)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

accuracy = np.mean(y_pred_rounded == y_test.values)
print(f"Accuracy: {accuracy * 100:.2f}%")


Best Parameters from Optuna: {'n_estimators': 95, 'max_depth': 4, 'learning_rate': 0.2110467653562664, 'subsample': 0.8482197585993168, 'colsample_bytree': 0.7215067461439995}
Original Predictions: [2.0005066 4.76088   4.496566  ... 2.5339258 4.03099   4.4266686]
Rounded Predictions: [2.  5.  4.5 ... 2.5 4.  4.5]
Mean Squared Error: 0.012464183381088826
R^2 Score: 0.9823430770998614
Accuracy: 95.01%


XGBoost Classifier Hyperparameter Tuning

In [12]:
rating_mapping = {1: 0, 1.5: 1, 2: 2, 2.5: 3, 3: 4, 3.5: 5, 4: 6, 4.5: 7, 5: 8}
df2['rating_class'] = df2['stars'].map(rating_mapping)

X = df2.drop(columns=['stars', 'rating_class'])
y = df2['rating_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
def objective(trial):
    params = {
        "objective": "multi:softmax",
        "num_class": 9,
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "eval_metric": "mlogloss",
    }


    model = XGBClassifier(**params)

    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)


    accuracy = accuracy_score(y_test, y_pred)

    return accuracy


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)


best_params = study.best_params
print("Best Parameters from Optuna:", best_params)


[I 2024-11-19 04:20:19,236] A new study created in memory with name: no-name-b3127d7e-128f-409e-9694-6b270bc3234d
[I 2024-11-19 04:20:29,671] Trial 0 finished with value: 0.9449856733524356 and parameters: {'max_depth': 7, 'learning_rate': 0.041206101039237664, 'n_estimators': 287, 'subsample': 0.8067435477572664, 'colsample_bytree': 0.9961600876081583}. Best is trial 0 with value: 0.9449856733524356.
[I 2024-11-19 04:20:32,286] Trial 1 finished with value: 0.9467048710601719 and parameters: {'max_depth': 5, 'learning_rate': 0.1199821060425487, 'n_estimators': 109, 'subsample': 0.8231854166140207, 'colsample_bytree': 0.6293471436239225}. Best is trial 1 with value: 0.9467048710601719.
[I 2024-11-19 04:20:42,791] Trial 2 finished with value: 0.9444126074498568 and parameters: {'max_depth': 7, 'learning_rate': 0.060527834113890774, 'n_estimators': 274, 'subsample': 0.9135209583658989, 'colsample_bytree': 0.9630819135946896}. Best is trial 1 with value: 0.9467048710601719.
[I 2024-11-19 0

Best Parameters from Optuna: {'max_depth': 13, 'learning_rate': 0.019062812804156656, 'n_estimators': 65, 'subsample': 0.6520707479517559, 'colsample_bytree': 0.6756805029818673}


In [14]:
best_model = XGBClassifier(**best_params)


best_model.fit(X_train_scaled, y_train)

y_pred = best_model.predict(X_test_scaled)


accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the optimized model: {accuracy * 100:.2f}%")

Accuracy of the optimized model: 95.13%
