# 1. Required libraries/modules

In [1]:
import pandas as pd
import numpy as np
import random
import time
from surprise import KNNBasic, KNNBaseline
from surprise import SVD, SVDpp
from surprise import NormalPredictor 
from surprise import BaselineOnly
from surprise import CoClustering
from surprise import Dataset
from surprise import Reader
from tabulate import tabulate
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy
from eval_metrics import precision_recall_at_k

# 2. Load data

In [2]:
full_df = pd.read_csv('data/full_data_cleaned.csv')
full_df.head(2)

Unnamed: 0,author_id,rating_x,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,...,price_usd_x,brand_id,ingredients,limited_edition,out_of_stock,sephora_exclusive,highlights,primary_category,num_rating,niche_product
0,1238130325,4,1.0,,0,0,0,3/12/23,I love this lippy…it makes my lips soft and al...,Yumm,...,24.0,6125,"['Diisostearyl Malate, Hydrogenated Polyisobut...",0,0,1,"['allure 2019 Best of Beauty Award Winner', 'C...",Skincare,1227,0
1,27991208736,3,1.0,0.0,1,1,0,3/9/23,ive tried my friend’s a few times and it does ...,overrated/overpriced,...,24.0,6125,"['Diisostearyl Malate, Hydrogenated Polyisobut...",0,0,1,"['allure 2019 Best of Beauty Award Winner', 'C...",Skincare,1227,0


# 3. Extract data for training

In [4]:
# Remove out of stock products
filtered_df = full_df[full_df['out_of_stock'] == 0]

# select relevant columns
rating_df = filtered_df[['author_id', 'product_id', 'rating_x', 'submission_time']]

# rename columns
rating_df.columns = ['userID', 'itemID', 'rating', 'timeStamp']

# convert "character" to "dateTime"
rating_df.loc[:,'timeStamp'] = pd.to_datetime(rating_df['timeStamp'], format='mixed')
rating_df.head()

Unnamed: 0,userID,itemID,rating,timeStamp
0,1238130325,P420652,4,2023-03-12 00:00:00
1,27991208736,P420652,3,2023-03-09 00:00:00
2,9467587295,P420652,5,2023-03-07 00:00:00
3,12367701277,P420652,5,2023-03-05 00:00:00
4,33161024868,P420652,3,2023-03-04 00:00:00


In [5]:
print(rating_df.dtypes)

userID       object
itemID       object
rating        int64
timeStamp    object
dtype: object


In [6]:
# set up data
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(rating_df.iloc[:,0:3], reader)

# 4. Final model

In [10]:
# create algorithms with best set parameters
svd = SVD(n_factors=150, n_epochs=50, lr_all=0.05, reg_all=0.01, biased=True, random_state=2024)
svdpp = SVDpp(n_factors=50, n_epochs=50, lr_bu=0.005, lr_bi=0.005, random_state=2024)
knnbasic = KNNBasic(k=50, sim_options={'name': 'pearson', 'min_support': 5, 'user_based': False}, random_state=2024)
knnbaseline = KNNBaseline(bsl_options={'method': 'sgd', 'reg': 0.01, 'learning_rate': 0.005, 'n_epochs': 50},
                          k=20,
                          sim_options={'name': 'pearson', 'min_support': 5, 'user_based': False},
                         random_state=2024)
coclustering = CoClustering(n_cltr_u=2, n_cltr_i=4, n_epochs=10, verbose=False, random_state=2024)
baseline = BaselineOnly(bsl_options={'method': 'sgd', 'reg': 0.2, 'learning_rate': 0.01, 'n_epochs': 50, 'random_state':2024})
random = NormalPredictor()

# list of recommenders
algorithms = [svd, svdpp, knnbasic, knnbaseline, coclustering, baseline, random]

# name the recommenders
titles = ('SVD', 'SVDpp', 'KNNBasic', 'KNNBaseline', 'CoClustering', 'Baseline', 'Random')

# split data into train and test set
trainset, testset = train_test_split(data, test_size=0.3, random_state=2024)

# create a blank list to store result
table = []    

# for loop to evaluate each algorithm
for algorithm in algorithms:
    start_time = time.time()  # start time
    algorithm.fit(trainset) 
    predictions = algorithm.test(testset)
    end_time = time.time()  # end time
    fit_time = round(end_time - start_time, 3)  # fit time
    
    rmse = round(accuracy.rmse(predictions, verbose=False), 3)
    mae = round(accuracy.mae(predictions, verbose=False), 3)
    fcp = round(accuracy.fcp(predictions, verbose=False), 3)
    p, r = precision_recall_at_k(predictions, k=5, threshold=4)
    precision = round(p, 3)
    recall = round(r, 3)
    
    table.append([str(algorithm).split(' ')[0].split('.')[-1],
                rmse,
                mae,
                fcp,
                precision,
                recall,
                fit_time])

# change column name
header = ['Recommenders',
          'Pred Accuracy [RMSE]',
          'Pred Accuracy [MAE]',
          'Ranking Quality [FCP]',
          'Precision',
          'Recall',
          'Training time [sec]']

# view the result table
print(tabulate(table, header, tablefmt='pipe'))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
| Recommenders    |   Pred Accuracy [RMSE] |   Pred Accuracy [MAE] |   Ranking Quality [FCP] |   Precision |   Recall |   Training time [sec] |
|:----------------|-----------------------:|----------------------:|------------------------:|------------:|---------:|----------------------:|
| SVD             |                  0.857 |                 0.552 |                   0.52  |       0.764 |    0.75  |                 7.585 |
| SVDpp           |                  0.879 |                 0.58  |                   0.515 |       0.769 |    0.751 |                11.206 |
| KNNBasic        |                  0.976 |                 0.575 |                   0.556 |       0.793 |    0.799 |                 0.957 |
| KNNBaseline     |                  0.94  |                 0.5