In [None]:
from fastai.tabular.all import *
from fastai.collab import *
from fastai.data.transforms import *
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from functools import partial

While developing a recommendation system for the **NO-Q** app, we focused on two major scenarios: new users, and users who have purchased and rated products in the past.
For users who have purchased and rated products previously, we use their purchase history and ratings to generate personalized recommendations based on collaborative filtering.

**Data**

In user-based filtering, the algorithm identifies users with similar preferences and recommends items that these users have previously enjoyed. To illustrate this approach, we generated a dataset containing 700 users and 441 items rated by users on a scale from 0 to 1 (like/dislike).

The dataset is generated in such a way that some users rated many items while most rated only a few, with some items remaining unrated - a realistic representation of typical rating data.

In [None]:
# Load data: read the raw ratings table, then collect every distinct cocktail title once
RATINGS_CSV = 'User ratings.csv'
df = pd.read_csv(RATINGS_CSV)
cocktail_strings = pd.unique(df['title'])

In [None]:
# Give every distinct cocktail title an integer ID equal to its position in cocktail_strings
cocktail_id_map = dict(zip(cocktail_strings, range(len(cocktail_strings))))

In [None]:
# Look up the integer ID of each row's title, store it in a 'cocktail' column,
# and keep only the four columns below, in this order.
cocktail_ids = df['title'].map(cocktail_id_map)
df = df.assign(cocktail=cocktail_ids)[['user', 'cocktail', 'rating', 'title']]
# `ratings` is the name the modelling cells use; it refers to the same frame as `df`.
ratings = df

**Optimization with grid search**

The hyperparameters varied here are the learning rate ('lr'), weight decay ('wd'), number of factors ('n_factors'), batch size ('bs'), number of epochs ('epochs'), and the hidden-layer sizes of the neural network ('layers').

In [None]:
# Hyperparameter search space: each key lists the candidate values tried for that setting
param_grid = dict(
    lr=[1e-3, 5e-3, 1e-2],        # learning rate
    wd=[0.01, 0.1, 0.2],          # weight decay
    n_factors=[20, 50, 100],      # number of factors
    bs=[32, 64, 128],             # batch size
    epochs=[3, 5],                # number of epochs
    layers=[[100, 50], [200, 100], [110, 100]],  # layer sizes, one list per candidate
)

In [None]:
# Function to train model with given hyperparameters and return its mean squared error
def train_model(ratings, param, seed=42):
    """Train one collaborative-filtering model and score it with mean squared error.

    Args:
        ratings: DataFrame handed to `CollabDataLoaders.from_df`. It must contain a
            'title' column, which is used as the item column.
        param: dict with the keys 'bs', 'n_factors', 'layers', 'epochs', 'lr' and 'wd'.
        seed: value passed to `torch.manual_seed` before the data loaders are built.

    Returns:
        The mean of the squared differences between the predictions and targets
        returned by `learn.get_preds()`, as a 0-dim tensor (lower is better).
    """
    torch.manual_seed(seed)
    dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=param['bs'])
    # y_range is deliberately NOT passed. BCEWithLogitsLossFlat applies the sigmoid
    # itself and therefore needs raw logits from the model; adding y_range=(0, 1) as
    # well would squash the output twice and keep every prediction above 0.5.
    # NOTE(review): collab_learner appears to use `layers` only when use_nn=True, so this
    # hyperparameter may have no effect on the model built here - confirm.
    learn = collab_learner(
        dls,
        n_factors=param['n_factors'],
        layers=param['layers'],
        loss_func=BCEWithLogitsLossFlat(),
    )
    learn.fit_one_cycle(param['epochs'], param['lr'], wd=param['wd'])

    # NOTE(review): get_preds() without arguments presumably scores the validation split.
    preds, targets = learn.get_preds()
    # Flatten both sides before subtracting: if predictions are (n,) and targets (n, 1),
    # the difference would silently broadcast to an (n, n) matrix and distort the MSE.
    squared_error = (preds.flatten() - targets.flatten()) ** 2
    return squared_error.mean()

The grid search cell is commented out for better readability.


**Best Parameters**: {'bs': 64, 'epochs': 5, 'layers': [200, 100], 'lr': 0.01, 'n_factors': 20, 'wd': 0.01}, Best MSE: 0.2578418552875519

In [None]:
# Grid search
# Left disabled: when enabled it calls train_model once for every combination in
# param_grid (3*3*3*3*2*3 = 486 fits with the grid defined in this notebook) and keeps
# the combination with the lowest MSE in best_params / best_mse. Uncomment to re-run.
#best_mse = 1
#best_params = None
#for param_set in ParameterGrid(param_grid):
    #mse = train_model(ratings, param_set)
    #print(f"Parameters: {param_set}, MSE: {mse}")
    #if mse < best_mse:
        #best_mse = mse
        #best_params = param_set

#print(f"Best Parameters: {best_params}, Best MSE: {best_mse}")

**The Mean Squared Error (MSE)** metric is commonly used in collaborative filtering to evaluate the accuracy of predicted ratings. It measures the average of the squared differences between the predicted and actual ratings for a set of users and items. An MSE value of 0 indicates a perfect match between the predicted and actual ratings, while higher values indicate larger prediction errors. An MSE value of around 0.25 is generally considered acceptable in collaborative filtering, as it indicates a reasonable level of accuracy in the predicted ratings. Note, however, that for like/dislike ratings encoded as 0 and 1, a model that always predicts 0.5 also reaches an MSE of exactly 0.25, so this value is best read as a baseline to beat rather than as proof of a good fit.

In [None]:
# Train the model with the best hyperparameters
# The grid-search cell is commented out, so `best_params` does not exist on a fresh
# kernel. Reuse the search result when it has been run in this session; otherwise fall
# back to the best configuration recorded in this notebook.
best_params = globals().get('best_params') or {
    'bs': 64, 'epochs': 5, 'layers': [200, 100], 'lr': 0.01, 'n_factors': 20, 'wd': 0.01,
}

# Same seed as train_model's default, so the final fit is reproducible.
torch.manual_seed(42)
best_dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=best_params['bs'])
# y_range is deliberately NOT passed: BCEWithLogitsLossFlat applies the sigmoid itself
# and needs raw logits from the model. Combining it with y_range=(0, 1) squashes the
# output twice, which keeps every prediction above 0.5.
best_learn = collab_learner(
    best_dls,
    n_factors=best_params['n_factors'],
    layers=best_params['layers'],
    loss_func=BCEWithLogitsLossFlat(),
)
best_learn.fit_one_cycle(best_params['epochs'], best_params['lr'], wd=best_params['wd'])


epoch,train_loss,valid_loss,time
0,0.725572,0.719663,00:00
1,0.717802,0.716711,00:00
2,0.689785,0.715775,00:00
3,0.652152,0.715878,00:00
4,0.633382,0.71601,00:00


In [None]:

# Menu: one row per drink, all paired with the same user so the model can score each drink
menu_user = 10
menu_titles = [
    'Bluebird', 'Boston Sour', 'Irish Spring', 'Zipperhead', 'ACID',
    'Spritz', 'Kir Royale', 'Affair', 'Lord and Lady', 'Van Vleet',
]  # positions on the menu
items_df = pd.DataFrame({'user': menu_user, 'title': menu_titles})


In [None]:

# Score every row of the menu frame with the trained learner
# (each prediction is read as the probability of the user liking that item)
menu_dl = best_dls.test_dl(items_df)
preds, _ = best_learn.get_preds(dl=menu_dl)

# Position of the highest prediction, converted to a plain Python int,
# then used to look up the matching title in the menu frame
most_likely_item_idx = preds.argmax().item()
most_likely_item_title = items_df.loc[most_likely_item_idx, 'title']
print(f"Recommended for you: {most_likely_item_title}")

# Titles ordered by how many ratings they received, capped at the 1000 most-rated
rating_counts = ratings.groupby('title')['rating'].count()
top_cocktails = rating_counts.sort_values(ascending=False).index.values[:1000]

# Bias terms looked up from the trained model for those titles
# NOTE(review): assumes `bias` returns one value per title, in the same order - confirm.
cocktail_bias = best_learn.model.bias(top_cocktails, is_item=True)

# (bias, title, mean rating) for every title in top_cocktails
mean_ratings = ratings.groupby('title')['rating'].mean()
cocktail_ratings = [(b, i, mean_ratings.loc[i]) for i, b in zip(top_cocktails, cocktail_bias)]

item0 = lambda o: o[0]
items_titles = items_df['title'].unique()

# Filter cocktail_ratings to only include ratings for items on the menu
menu_title_set = set(items_titles)
filtered_cocktail_ratings = [entry for entry in cocktail_ratings if entry[1] in menu_title_set]

# Highest bias first. The result is kept in its own name instead of `list`, which would
# shadow the builtin for every later cell.
top_menu_drinks = sorted(filtered_cocktail_ratings, key=item0, reverse=True)[:1]
if top_menu_drinks:
    print(f"Top drink on the menu: {top_menu_drinks[0][1]}")
else:
    # None of the menu titles appears in cocktail_ratings, so there is nothing to rank.
    print("Top drink on the menu: none of the menu items has ratings")

Recommended for you: Irish Spring
Top drink on the menu: Zipperhead
