In [2]:
#this step is needed to run in google colab to prevent dependency issues
!pip uninstall numpy scikit-surprise -y
!pip install scikit-surprise
!pip install numpy==1.23.5


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
[0mCollecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting numpy>=1.19.5 (from scikit-surprise)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for colle

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV


**Import files**

Import files and run some of the data preprocessing steps written by Shreyas

In [3]:
#preprocessed files generated by code from shreyas
#train = pd.read_csv('/content/drive/MyDrive/train.csv')
#test = pd.read_csv('/content/drive/MyDrive/test.csv')
#val = pd.read_csv('/content/drive/MyDrive/val.csv')
train=pd.read_csv('../Two tower model/Restaurant_Recommendation/data/train.csv')
test=pd.read_csv('../Two tower model/Restaurant_Recommendation/data/test.csv')
val=pd.read_csv('../Two tower model/Restaurant_Recommendation/data/val.csv')

In [4]:
#code from shreyas, use the same user and restaurant features
def create_dataset(data):
    # User features
    user_features = {
        'review_count': data['review_count_norm_x'].values.astype(np.float32),
        'average_stars': data['average_stars_norm'].values.astype(np.float32),
        'fans': data['fans_norm'].values.astype(np.float32),
        'friends_count': data['friends_count_norm'].values.astype(np.float32),
        'elite': data['elite_binary'].values.astype(np.float32)
    }
    #added this section to convert parking column values to 1,0 due to error
    parking_columns = ['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']
    for col in parking_columns:
        data[col] = data[col].map({'True': 1.0, 'False': 0.0, True: 1.0, False: 0.0}).fillna(0.0)

    # Restaurant features
    rest_features = {
        'stars': data['stars_norm'].values.astype(np.float32),
        'review_count': data['review_count_norm_y'].values.astype(np.float32),
        'lat': data['lat_norm'].values.astype(np.float32),
        'lon': data['lon_norm'].values.astype(np.float32),
        'categories': data[[f'cat_{i}' for i in range(50)]].values.astype(np.float32),
        'parking': data[['park_garage', 'park_street', 'park_validated', 'park_lot', 'park_valet']].values.astype(np.float32)
    }

    # Labels (target variable)
    labels = data['stars'].values.astype(np.float32)

    return user_features, rest_features, labels

In [5]:
#code from shreyas - research.ipynb
train_user, train_rest, train_labels = create_dataset(train)
val_user, val_rest, val_labels = create_dataset(val)
test_user, test_rest, test_labels = create_dataset(test)

**Model 1: XGBoost**

XGBoost with a content based approach

In [6]:
#combine features to create a model that uses restaurant features
def combine_features(user_features, rest_features):
    # Convert dictionaries to arrays
    user_array = np.column_stack([user_features[k] for k in user_features.keys()])
    rest_array = np.column_stack([rest_features[k] for k in rest_features.keys() if k != 'categories'])

    # Flatten categories
    categories_array = rest_features['categories']

    # Combine all features
    combined_features = np.column_stack([user_array, rest_array, categories_array])
    return combined_features

X_train = combine_features(train_user, train_rest)
X_val = combine_features(val_user, val_rest)
X_test = combine_features(test_user, test_rest)

y_train = train_labels
y_val = val_labels
y_test = test_labels

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# create an xgb model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    seed=42
)

# kept best parameters
param_grid = {
    'max_depth': [5],
    'learning_rate': [0.1],
    'n_estimators': [100],
    'subsample': [0.8],
    'colsample_bytree': [1.0],
    'gamma': [0.1]
}

#use gridsearch
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# run grid_search for the besst model
grid_search.fit(X_train, y_train)

# save best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# evalute on validation set and test set
xgb_val_preds = best_model.predict(X_val)
xgb_test_preds = best_model.predict(X_test)

#calculate rmse
xgb_val_mse = mean_squared_error(y_val, xgb_val_preds)
xgb_test_mse = mean_squared_error(y_test, xgb_test_preds)
xgb_val_rmse = np.sqrt(xgb_val_mse)
xgb_test_rmse = np.sqrt(xgb_test_mse)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Validation RMSE: {xgb_val_rmse:.4f}")
#calculate mae
xgb_test_mae = mean_absolute_error(y_test, xgb_test_preds)
print(f"Test MAE: {xgb_test_mae:.4f}")

Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Validation RMSE: 1.0687
Test MAE: 0.8048


**Model 2: SVD++**

SVD++ for collaborative filtering

In [9]:
from surprise.model_selection import GridSearchCV

# Prepare the data
train_cf = train[['user_id', 'business_id', 'stars']]
val_cf = val[['user_id', 'business_id', 'stars']]
test_cf = test[['user_id', 'business_id', 'stars']]

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train_cf, reader)

# some of the better params
param_grid = {
    'n_factors': [10, 0],
    'n_epochs': [20],
    'lr_all': [0.01],
    'reg_all': [0.3],
    'random_state': [42]
}

# Set up GridSearchCV using SVDpp
gs = GridSearchCV(
    SVDpp,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    refit='rmse',
    n_jobs=1,
    joblib_verbose=2
)

# Fit the grid search
gs.fit(train_data)

# Print the results
print("Best RMSE:", gs.best_score['rmse'])
print("Best MAE:", gs.best_score['mae'])
print("Best params:", gs.best_params['rmse'])


Best RMSE: 1.3438052457114833
Best MAE: 1.0994207248434331
Best params: {'n_factors': 0, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.3, 'random_state': 42}


In [10]:
def predict_svd(svd_model, df):
    testset = list(df.itertuples(index=False, name=None))
    predictions = svd_model.test(testset)
    return np.array([pred.est for pred in predictions])

In [11]:
svdpp = gs.best_estimator['rmse']

# Predict using SVD
svdpp_val_preds = predict_svd(svdpp, val_cf)
svdpp_test_preds = predict_svd(svdpp, test_cf)

# Actual ratings
val_true = val_cf['stars'].values
test_true = test_cf['stars'].values

# Compute RMSE
val_rmse = np.sqrt(mean_squared_error(val_true, svdpp_val_preds))
test_rmse = np.sqrt(mean_squared_error(test_true, svdpp_test_preds))

print(f"Validation RMSE: {val_rmse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
test_mae = mean_absolute_error(test_true, svdpp_test_preds)
print(f"Test MAE: {test_mae:.4f}")

Validation RMSE: 1.3332
Test RMSE: 1.3331
Test MAE: 1.0888


**1st Hybrid Approach: Random Forest**

In [12]:
def create_meta_features(svdpp_preds, xgb_preds):
    return pd.DataFrame({
        'svdpp_pred': svdpp_preds,
        'xgb_pred': xgb_preds
    })

def prepare_meta_data(svdpp_model, xgb_model, X, df_cf):
    svdpp_preds = predict_svd(svdpp_model, df_cf)
    xgb_preds = xgb_model.predict(X)
    meta_X = create_meta_features(svdpp_preds, xgb_preds)
    meta_y = df_cf['stars'].values
    return meta_X, meta_y


# Prepare meta-features for train, test, val
X_meta_train, y_meta_train = prepare_meta_data(svdpp, best_model, X_train, train_cf)
X_meta_val, y_meta_val = prepare_meta_data(svdpp, best_model, X_val, val_cf)
X_meta_test, y_meta_test = prepare_meta_data(svdpp, best_model, X_test, test_cf)

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

#Kept best hyperparameters
param_grid = {
    'n_estimators': [150],
    'max_depth': [10],
    'min_samples_split': [30],
    'min_samples_leaf': [4],
    'random_state': [42]
}

#call randomforest model
rf = RandomForestRegressor()

# set up gridsearch
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error')

#fit model
grid_search.fit(X_meta_train, y_meta_train)

#save best model
best_rf_model = grid_search.best_estimator_

# Predict on the test set using the tuned Random Forest model
rf_preds_val = best_rf_model.predict(X_meta_val)
rf_preds_test = best_rf_model.predict(X_meta_test)
rf_rmse_val = np.sqrt(mean_squared_error(y_meta_val, rf_preds_val))
rf_rmse_test = np.sqrt(mean_squared_error(y_meta_test, rf_preds_test))

# Print out the best hyperparameters and RMSE
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Test RMSE (Random Forest): {rf_rmse_test:.4f}")
print(f"Val RMSE (Random Forest): {rf_rmse_val:.4f}")
#mae
rf_mae_test = mean_absolute_error(y_meta_test, rf_preds_test)
print(f"Test MAE (Random Forest): {rf_mae_test:.4f}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 30, 'n_estimators': 150, 'random_state': 42}
Test RMSE (Random Forest): 1.3334
Val RMSE (Random Forest): 1.3343
Test MAE (Random Forest): 0.9803


**2nd Hybird Approach: Ensemble**

In [14]:
best_alpha = None
best_rmse = float("inf")

for alpha in np.arange(0.9, 1.10, 0.005):
    val_blend = alpha * xgb_val_preds + (1 - alpha) * np.array(svdpp_val_preds)
    rmse = np.sqrt(mean_squared_error(y_meta_val, val_blend))

    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

print(f"Best alpha: {best_alpha:.3f}")
print(f"Best RMSE: {best_rmse:.4f}")

Best alpha: 0.995
Best RMSE: 1.0687


In [15]:
y_val = val_cf['stars'].values
y_test = test_cf['stars'].values

# Blend SVD and XGBoost predictions using best alpha
val_ensemble_preds = best_alpha * xgb_val_preds + (1 - best_alpha) * svdpp_val_preds
test_ensemble_preds = best_alpha * xgb_test_preds + (1 - best_alpha) * svdpp_test_preds

# Compute RMSE
val_rmse = np.sqrt(mean_squared_error(y_val, val_ensemble_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_ensemble_preds))
test_mae = mean_absolute_error(y_test, test_ensemble_preds)

print(f"Validation RMSE (Ensemble): {val_rmse:.4f}")
print(f"Test RMSE (Ensemble): {test_rmse:.4f}")
print(f"Test MAE (Ensemble): {test_mae:.4f}")

Validation RMSE (Ensemble): 1.0687
Test RMSE (Ensemble): 1.0699
Test MAE (Ensemble): 0.8054


**3rd Hybrid Approach: Residuals**

Utilize XGB predictions and train residuals on SVD++

In [16]:
#Utilize XGBoost predictions and train residuals using SVD++
#val and test already explicitly called
xgb_train_preds = best_model.predict(X_train)

#compute residuals
residuals_train = y_train - xgb_train_preds

#build Surprise dataset with residuals
train_resid_df = train_cf.copy()
train_resid_df['stars'] = residuals_train

data = Dataset.load_from_df(train_resid_df[['user_id', 'business_id', 'stars']], reader)
trainset_resid = data.build_full_trainset()

#train SVD++ on residuals
svdpp_resid = SVDpp()
svdpp_resid.fit(trainset_resid)

#predict residuals on val
val_resid_testset = list(val_cf.itertuples(index=False, name=None))
svdpp_resid_preds = svdpp_resid.test(val_resid_testset)
svdpp_resid_vals = np.array([pred.est for pred in svdpp_resid_preds])

#predict residuals on test
test_resid_testset = list(test_cf.itertuples(index=False, name=None))
svdpp_resid_test_preds = svdpp_resid.test(test_resid_testset)
svdpp_resid_test_vals = np.array([pred.est for pred in svdpp_resid_test_preds])

#calculate rmse for validation
residual_val_blend = xgb_val_preds + svdpp_resid_vals
val_rmse = np.sqrt(mean_squared_error(y_val, residual_val_blend))
print(f"Validation RMSE (Residual Hybrid): {val_rmse:.4f}")

#calculate rmse for test
residual_test_blend = xgb_test_preds + svdpp_resid_test_vals
test_rmse = np.sqrt(mean_squared_error(y_test, residual_test_blend))
test_mae = mean_absolute_error(y_test, residual_test_blend)
print(f"Test RMSE (Residual Hybrid): {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")

Validation RMSE (Residual Hybrid): 1.4622
Test RMSE (Residual Hybrid): 1.4638
Test MAE: 1.1401


**3.5 Residuals Reversed**

Utilize SVD++ predictions and train residuals on XGB

In [17]:
#Opposite, use SVD++ and train residuals on XGBoost
from xgboost import XGBRegressor

#Use SVD++ predictions and calculate residual
svdpp_train_preds = predict_svd(svdpp, train_cf)
residuals_train = y_train - svdpp_train_preds

#initialize XGBoost
xgb_resid = XGBRegressor(random_state=42)

# hyperparameter grid
param_grid = {
    'n_estimators': [100],
    'max_depth': [0,5],
    'learning_rate': [0.05],
    'subsample': [0.8],
    'colsample_bytree': [0.9]
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_resid, param_grid=param_grid, cv=2, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# fit dataset
grid_search.fit(X_train, residuals_train)

# Get the best model from the grid search
best_xgb_resid = grid_search.best_estimator_

Fitting 2 folds for each of 2 candidates, totalling 4 fits




In [18]:
# Predict SVD++ on val/test
val_testset = list(val_cf.itertuples(index=False, name=None))
test_testset = list(test_cf.itertuples(index=False, name=None))

svdpp_val_preds = svdpp.test(val_testset)
svdpp_test_preds = svdpp.test(test_testset)

# check extracted values
svdpp_val_vals = np.array([pred.est for pred in svdpp_val_preds])
svdpp_test_vals = np.array([pred.est for pred in svdpp_test_preds])

# Predict residuals using XGBoost
xgb_val_resid_preds = best_xgb_resid.predict(X_val)
xgb_test_resid_preds = best_xgb_resid.predict(X_test)

# final step, predict with SVD++ and XGBoosted residuals
final_val_preds = svdpp_val_vals + xgb_val_resid_preds
final_test_preds = svdpp_test_vals + xgb_test_resid_preds

# RMSE to evaluate
val_rmse = np.sqrt(mean_squared_error(y_val, final_val_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, final_test_preds))

print("Best hyperparameters:", grid_search.best_params_)
print(f"Validation RMSE (SVD++ + XGB residuals): {val_rmse:.4f}")
print(f"Test RMSE (SVD++ + XGB residuals): {test_rmse:.4f}")
test_mae = mean_absolute_error(y_test, final_test_preds)
print(f"Test MAE: {test_mae:.4f}")

Best hyperparameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Validation RMSE (SVD++ + XGB residuals): 1.1233
Test RMSE (SVD++ + XGB residuals): 1.1239
Test MAE: 0.8858


**SVD Model**

Utilized SVD++ instead. Base model to showcase another baseline/result

In [19]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

train_cf = train[['user_id', 'business_id', 'stars']]
val_cf = val[['user_id', 'business_id', 'stars']]
test_cf = test[['user_id', 'business_id', 'stars']]

reader = Reader(rating_scale=(1, 5))

train_data = Dataset.load_from_df(train_cf, reader)
trainset = train_data.build_full_trainset()

param_grid = {
    'n_factors': [10, 0],
    'n_epochs': [10],
    'lr_all': [0.01],
    'reg_all': [0.3],
    'random_state': [42]
}

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    refit='rmse',
    n_jobs=1,
    joblib_verbose=2
)

# Fit gridsearch
gs.fit(train_data)

# Results
print("Best RMSE:", gs.best_score['rmse'])
print("Best MAE:", gs.best_score['mae'])
print("Best params:", gs.best_params['rmse'])

# Get best model from grid search
svd = gs.best_estimator['rmse']

# Prepare validation testset
val_testset = list(val_cf.itertuples(index=False, name=None))
test_testset = list(test_cf.itertuples(index=False, name=None))


# Make predictions on validation set
val_predictions = svd.test(val_testset)
test_predictions = svd.test(test_testset)

# Compute RMSE
val_rmse = accuracy.rmse(val_predictions)
test_rmse = accuracy.rmse(test_predictions)
print(f"Validation RMSE: {val_rmse}")
print(f"Test RMSE: {test_rmse}")
test_mae = accuracy.mae(test_predictions)
print(f"Test MAE: {test_mae}")

Best RMSE: 1.3525376642668718
Best MAE: 1.1103477024857618
Best params: {'n_factors': 0, 'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.3, 'random_state': 42}
RMSE: 1.3379
RMSE: 1.3379
Validation RMSE: 1.3379098727140595
Test RMSE: 1.337857236160567
MAE:  1.0960
Test MAE: 1.0960446732848521


**Hybrid Model**

Utilize one basic hybrid model

In [20]:
X_meta_train, y_meta_train = prepare_meta_data(svd, best_model, X_train, train_cf)
X_meta_val, y_meta_val = prepare_meta_data(svd, best_model, X_val, val_cf)
X_meta_test, y_meta_test = prepare_meta_data(svd, best_model, X_test, test_cf)

In [21]:
best_alpha = None
best_rmse = float("inf")

val_predictions = np.array([pred.est for pred in svd.test(val_testset)])
test_predictions = np.array([test.est for test in svd.test(test_testset)])

for alpha in np.arange(0.9, 1.10, 0.005):
    val_blend = alpha * xgb_val_preds + (1 - alpha) * np.array(val_predictions)
    rmse = np.sqrt(mean_squared_error(y_meta_val, val_blend))

    if rmse < best_rmse:
        best_rmse = rmse
        best_alpha = alpha

print(f"Best alpha: {best_alpha:.3f}")
print(f"Best RMSE: {best_rmse:.4f}")

Best alpha: 0.995
Best RMSE: 1.0687


In [22]:
y_val = val_cf['stars'].values
y_test = test_cf['stars'].values

# Blend SVD and XGBoost predictions using best alpha
val_ensemble_preds = best_alpha * xgb_val_preds + (1 - best_alpha) * val_predictions
test_ensemble_preds = best_alpha * xgb_test_preds + (1 - best_alpha) * test_predictions

# Compute RMSE
val_rmse = np.sqrt(mean_squared_error(y_val, val_ensemble_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_ensemble_preds))

print(f"Validation RMSE (Ensemble): {val_rmse:.4f}")
print(f"Test RMSE (Ensemble): {test_rmse:.4f}")

Validation RMSE (Ensemble): 1.0687
Test RMSE (Ensemble): 1.0699
