In [1]:
# Import the libraries 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import os
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV as sk_GridSearch
from sklearn.svm import SVR
from scipy.sparse.linalg import svds
from implicit.als import AlternatingLeastSquares

from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV, train_test_split
from surprise import KNNWithMeans, KNNBasic, SVD, SVDpp, NMF
from surprise import accuracy
from loguru import logger

plt.style.use("Solarize_Light2")

In [2]:
# Read the cleaned datasets, leaving out any 'Unnamed: 0' column
_keep_column = lambda column: column not in ['Unnamed: 0']
restaraunts = pd.read_csv('../data/clean/restaraunts_labels.csv', usecols=_keep_column)
users = pd.read_csv('../data/clean/users_commmunities.csv', usecols=_keep_column)
ratings = pd.read_csv('../data/clean/ratings.csv', usecols=_keep_column)

Since the rating columns contain 0, which corresponds to the lowest rating, we add 1 to every rating to keep the scale strictly positive while preserving its ordinality.

In [3]:
# Shift the three rating columns from a zero-based scale to a one-based scale.
# The guard makes the cell idempotent: once shifted, no column has a minimum of 0,
# so re-running the cell does not shift the ratings a second time.
rating_columns = ['rating', 'food_rating', 'service_rating']
if ratings[rating_columns].min().min() == 0:
    ratings[rating_columns] = ratings[rating_columns] + 1

#### We will start with building User & Item Based Collaborative Filtering Recommender system 
#### Data Processing 

In [None]:
# Build the Surprise dataset on the observed rating range and hold out 20% for testing
min_rating = ratings['rating'].min()
max_rating = ratings['rating'].max()
reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(ratings[['userID', 'placeID', 'rating']], reader=reader)
# Fixed seed so the split (and every test-set metric computed from it) is the same
# after a kernel restart
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Report how many distinct users and items ended up in the training split
print("Number of Users in TrainSet :", trainset.n_users)
print("Number of Items in TrainSet :", trainset.n_items)

In [None]:
# Initialize an empty dictionary for capturing performance metrics of the different algorithms
model_performance = dict()

In [None]:
# placeID -> restaurant name lookup, and the distinct placeIDs that appear in the ratings
item_mapper = restaraunts.set_index('placeID')['name'].to_dict()
item_list = ratings['placeID'].drop_duplicates().tolist()

### Function to recommend restaurants

In [None]:
def recommend_places(user_id, item_list, item_mapper, model, topk):
    """Return the ``topk`` places with the highest predicted rating for a user.

    Parameters
    ----------
    user_id
        User id passed as the first argument to ``model.predict``.
    item_list
        Iterable of place ids to score. It is materialised once, so one-shot
        iterables (generators, iterators) are supported as well as lists.
    item_mapper
        Mapping from place id to place name, applied with ``Series.map``.
    model
        Object whose ``predict(user_id, item_id)`` result exposes an ``est``
        attribute holding the predicted rating.
    topk
        Number of rows to keep from the top of the ranking.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Restaraunt ID'`` and ``'Restaraunt Name'``, ordered from the
        highest to the lowest predicted rating. Places with equal predicted
        ratings keep their relative order from ``item_list``.
    """
    # Materialise once: the ids are needed both for prediction and for the frame
    items = list(item_list)

    # Predict a rating for every candidate place
    item_ratings = [model.predict(user_id, item).est for item in items]

    pred_df = pd.DataFrame({
        "Restaraunt ID": items,
        "rating": item_ratings
    })

    # Attach the human-readable name for each place id
    pred_df['Restaraunt Name'] = pred_df['Restaraunt ID'].map(item_mapper)

    # A stable sort keeps tied predictions in input order, so the ranking is deterministic
    pred_df = pred_df.sort_values(by='rating', ascending=False, kind='mergesort')

    return pred_df.head(topk)[['Restaraunt ID', 'Restaraunt Name']]

In [None]:
# Initialize an empty dict for the performance of the tuned models
optimized_model_performance = dict()

### User Based/Item Based Collaborative Filtering

Optimising the models using hyperparameter tuning

#### KNNBasic 

In [None]:
# Search space for KNNBasic: neighbourhood size, similarity measure and
# user-based vs item-based collaborative filtering
param_grid = {
    'k': list(range(10, 51, 10)),
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson'],
        'user_based': [True, False],
    },
    'verbose': [False],
}

# 3-fold cross-validated grid search, scored on RMSE and MAE
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

In [None]:
# Report the best cross-validated RMSE and the parameters that achieved it
print("Performance Metrics : RMSE")
print(f"RMSE: {gs.best_score['rmse']}")
print(f"Best Params: {gs.best_params['rmse']}")
print()

In [None]:
# Fit the best KNNBasic configuration on the train set and evaluate it on the test set.
# The parameters come straight from the grid search (lowest cross-validated RMSE)
# rather than a hand-copied dict, so this cell always matches the search result.
knn_basic_params = gs.best_params['rmse']
knn_basic_final = KNNBasic(**knn_basic_params)

knn_basic_final.fit(trainset)

knn_basic_final_predictions = knn_basic_final.test(testset)

print("Model : KnnBasic")
knn_basic_final_rmse = accuracy.rmse(knn_basic_final_predictions)
knn_basic_final_mae  = accuracy.mae(knn_basic_final_predictions)

optimized_model_performance['knn_basic'] = {
    'RMSE' : knn_basic_final_rmse,
    'MAE' : knn_basic_final_mae
}

In [None]:
# Top-10 recommendations for user U1081 from the tuned KNNBasic model
recommend_places('U1081', item_list, item_mapper, knn_basic_final, 10)

#### KNNWithMeans

In [None]:
# Search space for KNNWithMeans: neighbourhood size, similarity measure and
# user-based vs item-based collaborative filtering
param_grid_knn_means = {
    'k': list(range(10, 51, 10)),
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson'],
        'user_based': [True, False],
    },
    'verbose': [False],
}

# 3-fold cross-validated grid search, scored on RMSE
gs_knn_means = GridSearchCV(KNNWithMeans, param_grid_knn_means, measures=['rmse'], cv=3)
gs_knn_means.fit(data)

In [None]:
# Best cross-validated RMSE for KNNWithMeans and the parameters that achieved it
best_params_rmse_knn_means = gs_knn_means.best_params['rmse']
best_rmse_knn_means = gs_knn_means.best_score['rmse']
print("Performance Metrics : RMSE")
print("RMSE :", best_rmse_knn_means)
print("Best Params :", best_params_rmse_knn_means)

In [None]:
# Fit the best KNNWithMeans configuration on the train set and evaluate it on the test set.
# The parameters are taken from the grid search result instead of a hand-copied dict,
# so this cell cannot drift from what the search actually selected.
best_params_knn_means = best_params_rmse_knn_means
knn_means_final = KNNWithMeans(**best_params_knn_means)

# Fitting the final model on train set
knn_means_final.fit(trainset)

# Getting predictions on test set
knn_means_final_predictions = knn_means_final.test(testset)

# Calculating Performance Metrics
print("Model Name : KNNWithMeans")
knn_means_final_rmse = accuracy.rmse(knn_means_final_predictions)
knn_means_final_mae = accuracy.mae(knn_means_final_predictions)

optimized_model_performance['KnnWithMeans'] = {
    "RMSE" : knn_means_final_rmse,
    "MAE" : knn_means_final_mae
}

In [None]:
# Top-10 recommendations for user U1081 from the tuned KNNWithMeans model
recommend_places('U1081', item_list, item_mapper, knn_means_final, 10)

### Matrix Factorisation Collaborative Filtering 

### SVD

In [None]:
# Search space for SVD: number of latent factors, learning rate and regularization
param_grid_svd = {
    'n_factors': list(range(50, 201, 50)),
    'lr_all': [0.002, 0.005, 0.007],
    'reg_all': [0.02, 0.05, 0.1],
}

# 3-fold cross-validated grid search, scored on RMSE
gs_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse'], cv=3)
gs_svd.fit(data)

In [None]:
# Best cross-validated RMSE for SVD and the parameters that achieved it
best_params_svd = gs_svd.best_params['rmse']
best_rmse_svd = gs_svd.best_score['rmse']
print("Performance Metrics : RMSE")
print("RMSE :", best_rmse_svd)
print("Best Params :", best_params_svd)

In [None]:
# Refit SVD with the grid-search parameters on the train set, then score it on the test set
svd_final = SVD(**best_params_svd)
svd_final.fit(trainset)
svd_final_predictions = svd_final.test(testset)

# accuracy.rmse / accuracy.mae print the metric and return its value
print("Model Name : SVD")
svd_final_rmse = accuracy.rmse(svd_final_predictions)
svd_final_mae = accuracy.mae(svd_final_predictions)

optimized_model_performance['SVD'] = dict(RMSE=svd_final_rmse, MAE=svd_final_mae)

In [None]:
# Top-10 recommendations for user U1081 from the tuned SVD model
recommend_places('U1081', item_list, item_mapper, svd_final, 10)

### NMF

In [None]:
# Search space for NMF: latent factors, training epochs and the user/item regularization terms
param_grid_nmf = {
    'n_factors': list(range(50, 201, 50)),
    'n_epochs': [50, 100],
    'reg_pu': [0.06, 0.1],
    'reg_qi': [0.06, 0.1],
}

# 3-fold cross-validated grid search, scored on RMSE
gs_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse'], cv=3)
gs_nmf.fit(data)

In [None]:
# Best cross-validated RMSE for NMF and the parameters that achieved it
best_params_nmf = gs_nmf.best_params['rmse']
best_rmse_nmf = gs_nmf.best_score['rmse']
print("Performance Metrics : RMSE")
print("RMSE :", best_rmse_nmf)
print("Best Params :", best_params_nmf)

In [None]:
# Refit NMF with the grid-search parameters on the train set, then score it on the test set
nmf_final = NMF(**best_params_nmf)
nmf_final.fit(trainset)
nmf_final_predictions = nmf_final.test(testset)

# accuracy.rmse / accuracy.mae print the metric and return its value
print("Model Name : NMF")
nmf_final_rmse = accuracy.rmse(nmf_final_predictions)
nmf_final_mae = accuracy.mae(nmf_final_predictions)

optimized_model_performance['NMF'] = dict(RMSE=nmf_final_rmse, MAE=nmf_final_mae)

In [None]:
# Top-10 recommendations for user U1081 from the tuned NMF model
recommend_places('U1081', item_list, item_mapper, nmf_final, 10)

### Performance Comparison 

In [None]:
# One row per model, ordered from the highest to the lowest RMSE (MAE breaks ties),
# with the model name exposed as a regular 'Model' column
opt_performance_df = (
    pd.DataFrame(optimized_model_performance)
    .T
    .sort_values(by=['RMSE', 'MAE'], ascending=[False, False])
    .reset_index()
    .rename(columns={'index': 'Model'})
)
opt_performance_df.head()

In [None]:
# Compare the tuned models in one figure: RMSE as bars on the left y-axis and
# MAE as a line on a second y-axis that shares the same x-axis (one tick per model).
# The statement order matters here (twin axis creation, then pyplot-level title/layout calls).
sns.set(style="whitegrid")
fig, ax1 = plt.subplots(figsize=(10, 6))

# RMSE bars on the primary axis; the axis label is coloured to match the bars
sns.barplot(x="Model", y="RMSE", data=opt_performance_df, color='b', ax=ax1)
ax1.set_ylabel('RMSE', color='b')

# MAE line on a twin y-axis; the axis label is coloured to match the line
ax2 = ax1.twinx()
sns.lineplot(x="Model", y="MAE", data=opt_performance_df, marker='o', color='r', ax=ax2)
ax2.set_ylabel('MAE', color='r')

# Title, rotated right-aligned model names on the x-axis, and layout adjustment
plt.title('Comparison of Model Performance (RMSE and MAE)')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()

We can see that item-based collaborative filtering with KNNBasic performs the best.

In [4]:
# Preview the first two rows of the ratings table
ratings.head(2)

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,3,3,3
1,U1077,135038,3,3,2


In [7]:
# List the columns available in the users table
users.columns

Index(['userID', 'latitude', 'longitude', 'smoker', 'drink_level',
       'dress_preference', 'ambience', 'transport', 'marital_status', 'hijos',
       'interest', 'personality', 'activity', 'budget', 'Rcuisine_str',
       'Upayment_str', 'age_group', 'cuisine_list', 'community'],
      dtype='object')

In [8]:
# List the columns available in the restaurants table
restaraunts.columns

Index(['placeID', 'latitude', 'longitude', 'name', 'state', 'alcohol',
       'smoking_area', 'dress_code', 'accessibility', 'price', 'Rambience',
       'franchise', 'area', 'other_services', 'Rcuisine_str', 'Rpayment_str',
       'parking_lot_str', 'cluster'],
      dtype='object')