<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/xgboost_business_attributes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
filename = "FL_Restaurants_Business Attributes_Edited" + ".csv"
directory = '/content/drive/My Drive/Capstone Data Collection/'

path = directory + filename

chunk_iterator = pd.read_csv(path, chunksize=10000)

chunks = []

for chunk in chunk_iterator:
  chunks.append(chunk)

df = pd.concat(chunks, ignore_index=True)
df.shape

(8723, 166)

In [14]:
df['stars'].value_counts()

Unnamed: 0_level_0,count
stars,Unnamed: 1_level_1
4.0,2266
3.5,1902
4.5,1597
3.0,1149
2.5,804
2.0,425
5.0,327
1.5,219
1.0,34


In [15]:
df2 = df.copy()

XGBoost Classifier

In [17]:
rating_mapping = {1: 0, 1.5: 1, 2: 2, 2.5: 3, 3: 4, 3.5: 5, 4: 6, 4.5: 7, 5: 8}
df['rating_class'] = df['stars'].map(rating_mapping)

In [18]:
X = df.drop(columns=['stars', 'rating_class'])
y = df['rating_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [20]:
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=9,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    eval_metric="mlogloss"
)

xgb_model.fit(X_train_scaled, y_train)

In [21]:
y_pred = xgb_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.75      0.80         8
           1       0.88      0.85      0.87        53
           2       0.87      0.93      0.90        76
           3       0.97      0.95      0.96       178
           4       0.94      0.94      0.94       216
           5       0.95      0.95      0.95       385
           6       0.95      0.95      0.95       445
           7       0.96      0.97      0.96       321
           8       0.98      0.94      0.96        63

    accuracy                           0.95      1745
   macro avg       0.93      0.91      0.92      1745
weighted avg       0.95      0.95      0.95      1745



In [22]:
reverse_rating_mapping = {v: k for k, v in rating_mapping.items()}

adjusted_preds = [reverse_rating_mapping[pred] for pred in y_pred]

print("Adjusted Predictions (Ratings):")
print(adjusted_preds)

Adjusted Predictions (Ratings):
[2, 4.5, 4.5, 4, 4, 3.5, 3, 3.5, 2.5, 3, 4.5, 4, 4, 4, 3.5, 2.5, 4, 5, 4, 2.5, 4, 4, 3, 4.5, 2, 4, 2.5, 3.5, 5, 3.5, 4.5, 4.5, 4, 4.5, 2, 4, 4, 2.5, 3.5, 4, 5, 3.5, 4, 4, 3, 4.5, 4, 4, 4.5, 2.5, 4.5, 4, 1.5, 3, 4, 2, 4.5, 4, 4.5, 3.5, 3.5, 4, 3.5, 1.5, 3, 4, 4, 3.5, 3.5, 3.5, 4.5, 1.5, 2.5, 1.5, 4.5, 3.5, 3, 4.5, 3, 5, 3, 2, 4.5, 3.5, 4, 4, 3.5, 3.5, 3.5, 4.5, 3, 4.5, 2, 3.5, 3, 4.5, 5, 4, 3.5, 3.5, 3.5, 2.5, 3.5, 2.5, 3, 3.5, 4, 3.5, 4.5, 4, 2, 4, 3.5, 3.5, 2.5, 3, 3.5, 4.5, 3, 2.5, 4, 3, 5, 4, 3.5, 4.5, 4.5, 4, 4.5, 4.5, 2.5, 4, 3, 4.5, 2, 3.5, 1.5, 2.5, 3, 1.5, 3.5, 3.5, 3, 4, 4.5, 3.5, 3.5, 4, 3.5, 2.5, 3.5, 4, 1.5, 4, 2.5, 3, 3.5, 3.5, 3, 1.5, 2.5, 3.5, 2.5, 2, 4, 2.5, 4, 2, 4, 4, 4.5, 2.5, 3, 3, 3.5, 3.5, 3, 4, 3.5, 2.5, 4, 3, 2, 3.5, 2.5, 4.5, 3.5, 4, 4.5, 3, 3.5, 4.5, 4, 2.5, 4, 4.5, 4, 4, 3.5, 3, 3.5, 2.5, 5, 4.5, 4, 4.5, 2.5, 2, 4, 4.5, 2.5, 4.5, 4, 4.5, 3, 4.5, 3, 3.5, 3.5, 4.5, 4, 3.5, 4.5, 2.5, 5, 3.5, 5, 2.5, 3.5, 4.5, 3, 3, 3.5, 4, 4, 4, 4

XGBoost Regressor

In [24]:
X = df2.drop(columns=['stars'])
y = df2['stars']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

y_pred_rounded = np.round(y_pred * 2) / 2

comparison_df = X_test.copy()
comparison_df['Actual'] = y_test.values
comparison_df['Predicted'] = y_pred
comparison_df['Predicted_Rounded'] = y_pred_rounded

mse = mean_squared_error(y_test, y_pred_rounded)
r2 = r2_score(y_test, y_pred_rounded)

print("Original Predictions:", y_pred)
print("Rounded Predictions:", y_pred_rounded)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

accuracy = np.mean(y_pred_rounded == y_test.values)
print(f"Accuracy: {accuracy * 100:.2f}%")

Original Predictions: [1.7784194 4.7789164 4.496122  ... 2.4312449 4.045908  4.4212914]
Rounded Predictions: [2.  5.  4.5 ... 2.5 4.  4.5]
Mean Squared Error: 0.013467048710601719
R^2 Score: 0.9809224051423789
Accuracy: 94.61%


In [26]:
select_fields = ['review_count', 'avg_star_reviews', 'Actual', 'Predicted', 'Predicted_Rounded']

comparison_df[select_fields].head()

Unnamed: 0,review_count,avg_star_reviews,Actual,Predicted,Predicted_Rounded
601,13,1.923077,2.0,1.778419,2.0
6092,8,4.75,5.0,4.778916,5.0
8052,313,4.697059,4.5,4.496122,4.5
2441,494,3.984674,4.0,3.988284,4.0
8454,335,3.835694,4.0,3.984683,4.0
