In [1]:
#this step is needed to run in google colab to prevent dependency issues
!pip uninstall numpy scikit-surprise -y
!pip install scikit-surprise
!pip install numpy==1.23.5


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mCollecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting numpy>=1.19.5 (from scikit-surprise)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for colle

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split
from surprise import accuracy

**Import files**

Import files and run some of the data preprocessing steps written by Shreyas

In [2]:
#preprocessed files generated by code from shreyas
#train = pd.read_csv('/content/drive/MyDrive/train.csv')
#test = pd.read_csv('/content/drive/MyDrive/test.csv')
#val = pd.read_csv('/content/drive/MyDrive/val.csv')
train.pd.read_csv('../data/train.csv')
test.pd.read_csv('../data/test.csv')
val.pd.read_csv('../data/val.csv')

In [3]:
#code from shreyas, use the same user and restaurant features
def create_dataset(data):
    # User features
    user_features = {
        'review_count': data['review_count_norm_x'].values.astype(np.float32),
        'average_stars': data['average_stars_norm'].values.astype(np.float32),
        'fans': data['fans_norm'].values.astype(np.float32),
        'friends_count': data['friends_count_norm'].values.astype(np.float32),
        'elite': data['elite_binary'].values.astype(np.float32)
    }
    #added this section to convert parking column values to 1,0 due to error
    parking_columns = ['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']
    for col in parking_columns:
        data[col] = data[col].map({'True': 1.0, 'False': 0.0, True: 1.0, False: 0.0}).fillna(0.0)

    # Restaurant features
    rest_features = {
        'stars': data['stars_norm'].values.astype(np.float32),
        'review_count': data['review_count_norm_y'].values.astype(np.float32),
        'lat': data['lat_norm'].values.astype(np.float32),
        'lon': data['lon_norm'].values.astype(np.float32),
        'categories': data[[f'cat_{i}' for i in range(50)]].values.astype(np.float32),
        'parking': data[['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']].values.astype(np.float32)
    }

    # Labels (target variable)
    labels = data['stars'].values.astype(np.float32)

    return user_features, rest_features, labels

In [4]:
#code from shreyas - research.ipynb
train_user, train_rest, train_labels = create_dataset(train)
val_user, val_rest, val_labels = create_dataset(val)
test_user, test_rest, test_labels = create_dataset(test)

**Model 1: XGBoost**

XGBoost with a content based approach

In [5]:
#combine features to create a model that uses restaurant features
def combine_features(user_features, rest_features):
    # Convert dictionaries to arrays
    user_array = np.column_stack([user_features[k] for k in user_features.keys()])
    rest_array = np.column_stack([rest_features[k] for k in rest_features.keys() if k != 'categories'])

    # Flatten categories
    categories_array = rest_features['categories']

    # Combine all features
    combined_features = np.column_stack([user_array, rest_array, categories_array])
    return combined_features

X_train = combine_features(train_user, train_rest)
X_val = combine_features(val_user, val_rest)
X_test = combine_features(test_user, test_rest)

y_train = train_labels
y_val = val_labels
y_test = test_labels

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# create an xgb model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    seed=42
)

# kept best parameters
param_grid = {
    'max_depth': [6],
    'learning_rate': [0.1],
    'n_estimators': [100],
    'subsample': [0.8],
    'colsample_bytree': [1.0],
    'gamma': [0.1]
}

#use gridsearch
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# run grid_search for the besst model
grid_search.fit(X_train, y_train)

# save best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [7]:
# evalute on validation set and test set
xgb_val_preds = best_model.predict(X_val)
xgb_test_preds = best_model.predict(X_test)

#calculate rmse
xgb_val_mse = mean_squared_error(y_val, xgb_val_preds)
xgb_test_mse = mean_squared_error(y_test, xgb_test_preds)
xgb_val_rmse = np.sqrt(xgb_val_mse)
xgb_test_rmse = np.sqrt(xgb_test_mse)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Validation RMSE: {xgb_val_rmse:.4f}")
print(f"Test RMSE: {xgb_test_rmse:.4f}")

Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Validation RMSE: 1.0689
Test RMSE: 1.0701


**Model 2: SVD++**

SVD++ for collaborative filtering

In [8]:
from surprise.model_selection import GridSearchCV

# Prepare the data
train_cf = train[['user_id', 'business_id', 'stars']]
val_cf = val[['user_id', 'business_id', 'stars']]
test_cf = test[['user_id', 'business_id', 'stars']]

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_cf, reader)

# some of the better params
param_grid = {
    'n_factors': [10, 0],
    'n_epochs': [20],
    'lr_all': [0.01],
    'reg_all': [0.3],
    'random_state': [42]
}

# Set up GridSearchCV using SVDpp
gs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    refit='rmse',
    n_jobs=1,
    joblib_verbose=2
)

# Fit the grid search
gs.fit(train_data)

# Print the results
print("Best RMSE:", gs.best_score['rmse'])
print("Best MAE:", gs.best_score['mae'])
print("Best params:", gs.best_params['rmse'])


Best RMSE: 1.3439488104648813
Best MAE: 1.0996196454881395
Best params: {'n_factors': 0, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.3, 'random_state': 42}


In [9]:
def predict_svd(svd_model, df):
    testset = list(df.itertuples(index=False, name=None))
    predictions = svd_model.test(testset)
    return np.array([pred.est for pred in predictions])

In [10]:
svd = gs.best_estimator['rmse']

# Predict using SVD
svd_val_preds = predict_svd(svd, val_cf)
svd_test_preds = predict_svd(svd, test_cf)

# Actual ratings
val_true = val_cf['stars'].values
test_true = test_cf['stars'].values

# Compute RMSE
val_rmse = np.sqrt(mean_squared_error(val_true, svd_val_preds))
test_rmse = np.sqrt(mean_squared_error(test_true, svd_test_preds))

print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")


Validation RMSE: 1.3332
Test RMSE: 1.3331


**1st Hybrid Approach: Random Forest**

In [11]:
def create_meta_features(svd_preds, xgb_preds):
    return pd.DataFrame({
        'svd_pred': svd_preds,
        'xgb_pred': xgb_preds
    })

def prepare_meta_data(svd_model, xgb_model, X, df_cf):
    svd_preds = predict_svd(svd_model, df_cf)
    xgb_preds = xgb_model.predict(X)
    meta_X = create_meta_features(svd_preds, xgb_preds)
    meta_y = df_cf['stars'].values
    return meta_X, meta_y


# Prepare meta-features for train, test, val
X_meta_train, y_meta_train = prepare_meta_data(svd, best_model, X_train, train_cf)
X_meta_val, y_meta_val = prepare_meta_data(svd, best_model, X_val, val_cf)
X_meta_test, y_meta_test = prepare_meta_data(svd, best_model, X_test, test_cf)

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

#Kept best hyperparameters
param_grid = {
    'n_estimators': [150],
    'max_depth': [10],
    'min_samples_split': [30],
    'min_samples_leaf': [4],
    'random_state': [42]
}

#call randomforest model
rf = RandomForestRegressor()

# set up gridsearch
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2,
                           scoring='neg_mean_squared_error')

#fit model
grid_search.fit(X_meta_train, y_meta_train)

#save best model
best_rf_model = grid_search.best_estimator_

# Predict on the test set using the tuned Random Forest model
rf_preds_val = best_rf_model.predict(X_meta_val)
rf_preds_test = best_rf_model.predict(X_meta_test)
rf_rmse_val = np.sqrt(mean_squared_error(y_meta_val, rf_preds_val))
rf_rmse_test = np.sqrt(mean_squared_error(y_meta_test, rf_preds_test))

# Print out the best hyperparameters and RMSE
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Test RMSE (Random Forest): {rf_rmse_test:.4f}")
print(f"Val RMSE (Random Forest): {rf_rmse_val:.4f}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 30, 'n_estimators': 150, 'random_state': 42}
Test RMSE (Random Forest): 1.3283
Val RMSE (Random Forest): 1.3292


**2nd Hybird Approach: Ensemble**

In [13]:
best_alpha = None
best_rmse = float("inf")

for alpha in np.arange(0.8, 1.00, 0.01):
    val_blend = alpha * xgb_val_preds + (1 - alpha) * np.array(svd_val_preds)
    rmse = np.sqrt(mean_squared_error(y_meta_val, val_blend))

    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

print(f"Best alpha: {best_alpha:.3f}")
print(f"Best Validation RMSE: {best_rmse:.4f}")

Best alpha: 0.990
Best Validation RMSE: 1.0689


In [14]:
y_val = val_cf['stars'].values
y_test = test_cf['stars'].values

# Blend SVD and XGBoost predictions using best alpha
val_ensemble_preds = best_alpha * xgb_val_preds + (1 - best_alpha) * svd_val_preds
test_ensemble_preds = best_alpha * xgb_test_preds + (1 - best_alpha) * svd_test_preds

# Compute RMSE
val_rmse = np.sqrt(mean_squared_error(y_val, val_ensemble_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_ensemble_preds))

print(f"Validation RMSE (Ensemble): {val_rmse:.4f}")
print(f"Test RMSE (Ensemble): {test_rmse:.4f}")

Validation RMSE (Ensemble): 1.0689
Test RMSE (Ensemble): 1.0701


**3rd Hybrid Approach: Residuals**

In [15]:
#XGBoost predictions
#val and test already explicitly called
xgb_train_preds = best_model.predict(X_train)

#compute residuals
residuals_train = y_train - xgb_train_preds

#build Surprise dataset with residuals
train_resid_df = train_cf.copy()
train_resid_df['stars'] = residuals_train

data = Dataset.load_from_df(train_resid_df[['user_id', 'business_id', 'stars']], reader)
trainset_resid = data.build_full_trainset()

#train SVD++ on residuals
svd_resid = SVDpp()
svd_resid.fit(trainset_resid)

#predict residuals on val
val_resid_testset = list(val_cf.itertuples(index=False, name=None))
svd_resid_preds = svd_resid.test(val_resid_testset)
svd_resid_vals = np.array([pred.est for pred in svd_resid_preds])

#predict residuals on test
test_resid_testset = list(test_cf.itertuples(index=False, name=None))
svd_resid_test_preds = svd_resid.test(test_resid_testset)
svd_resid_test_vals = np.array([pred.est for pred in svd_resid_test_preds])

#calculate rmse for validation
residual_val_blend = xgb_val_preds + svd_resid_vals
val_rmse = np.sqrt(mean_squared_error(y_val, residual_val_blend))
print(f"Validation RMSE (Residual Hybrid): {val_rmse:.4f}")

#calculate rmse for test
residual_test_blend = xgb_test_preds + svd_resid_test_vals
test_rmse = np.sqrt(mean_squared_error(y_test, residual_test_blend))
print(f"Test RMSE (Residual Hybrid): {test_rmse:.4f}")

Validation RMSE (Residual Hybrid): 1.4623
Test RMSE (Residual Hybrid): 1.4640
