In [1]:
import pandas as pd 
import numpy as np 
from IESEGRecSys import eval
from IESEGRecSys.model import ContentBased
from surprise import KNNBasic, Reader, Dataset, SVD, CoClustering, BaselineOnly, accuracy
from surprise.model_selection import GridSearchCV, cross_validate, KFold
from sklearn.decomposition import PCA
# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\warfaoui\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\warfaoui\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Preparation

After importing all the libraries that we are going to need through the whole process, we start by importing our three datasets which are:  
- **Meta** dataset: it contains additional item's data (title, description, image_url)  
- **Train** dataset: it contains user-item ratings, including review text and additional user data  
- **Test** dataset: it contains user-item ratings to predict  

Once this done, I checked the cleanliness of our train data by:  
- Checking the range values of the ratings that needs to be integers in the range [1,5].  
- Checking the mission values especially for the columns: `userID`, `asin` and `overall`
- Checking the data types per column.  

And finally, we create a **Reader** object, with the attribute `rating_scale` which is a tuple with the lowest and highest possible range. It’s important to get this parameter right, otherwise parts of your data will be ignored. In our case, we have a minimum rating of 1.0 and a maximum rating of 5.0.  

Next, I transform our train dataset into a **Surprise** format where it will become a sparse matrix, with the **users / items** are the **rows / columns**, and the **ratings** are elements in this matrix. 
Since I am going to use cross validation, I don't need to split my data, but instead, I will be using my whole dataset for training and cross-validate each time for testing.



In [2]:
# Read datasets 
meta = pd.read_csv(".Data/metadata.csv")
train = pd.read_csv(".Data/train.csv")
test = pd.read_csv(".Data/test_students.csv")

In [28]:
meta["main_cat"].value_counts()

Pet Supplies                   2510
Amazon Home                      29
Tools & Home Improvement          7
Sports & Outdoors                 5
Grocery                           5
Health & Personal Care            5
Industrial &amp; Scientific       4
Automotive                        3
Sports &amp; Outdoors             2
Cell Phones & Accessories         2
Toys & Games                      2
Industrial & Scientific           2
Baby                              1
Name: main_cat, dtype: int64

In [4]:
# check values for the ratings 
train["overall"].value_counts()

5.0    107620
4.0     23909
3.0     14029
1.0      8474
2.0      7721
Name: overall, dtype: int64

In [5]:
# check missing values
train.isna().sum()

userID             0
overall            0
asin               0
vote          145992
reviewText         2
summary            1
style          29641
image         157207
dtype: int64

In [6]:
# check columns types 
train.dtypes

userID          int64
overall       float64
asin           object
vote           object
reviewText     object
summary        object
style          object
image          object
dtype: object

In [7]:
test["overall"]=0.0
test.head()

Unnamed: 0,ID,userID,asin,overall
0,21069B00BFK2B24,21069,B00BFK2B24,0.0
1,3506B00ZK0Y7R2,3506,B00ZK0Y7R2,0.0
2,21907B0002AQPA2,21907,B0002AQPA2,0.0
3,14092B0002DHXX2,14092,B0002DHXX2,0.0
4,3085B0006VB3SQ,3085,B0006VB3SQ,0.0


In [8]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(train[["userID","asin","overall"]], reader)
df_train = data.build_full_trainset()
df_test = list(test[["userID","asin","overall"]].itertuples(index = False , name = None))

## Collaborative filtering

In the first part of this project, I am going to apply **collaborative filtering** models to our dataset with the aim of finding similarities between **items / users** through commonly **rated** items.  
Under this method, I will opt for these two main approaches:  
- **Memory-based** models that calculate the similarities between **users / items** based on **user-item rating pairs** (I will use the `KNNBAsic`).  
- **Model-based** models that use machine learning algorithms to estimate the ratings (I will use `SVD`, `ALS` and `CoClustering`).  

In order to do the benchmarking of these models, and come out with the model with the best performance, I will opt for ***Grid Search**.  

`GridSearchCV` is an algorithm that we can import from `sklearn.model_selection` library, that automatically finds the best parameters for a particular model, what we call **hyperparameter tuning**.  

To implement this algorithm, I started by creating a **dictionary** of all the parameters and their corresponding set of values that you want to test for best performance. 
Once the parameter dictionary is created, the next step is to create `GridSearchCV` for our model. I included **the model function** name as a estimator parameter. The **param_grid** as dictionary parameter, **the performance metrics** which I chose to be **RMSE** as a scoring parameter, and finally I precised the number of folds for cross validation for the cv parameter, which is 5 in this case.

After fitting the model, I checked which were the parameters that return the highest accuracy. For the cases of where one of the parameters highest value was chosen in the best combination of parameters, I tried more values for that paremter, to see if performance further increases. 
After checking this, I printed the RMSE corresponding to the best combination of parameters that I am going to use later on to compare models to each others. 

#### Memory based models

I started by working on the **user-based** collaborative filtering. It recommends items by finding similar users to the active user, to whom I am trying to recommend an Amazon product. 
To apply this, I will be using the user-based **Nearest Neighbor Algorithm** (KNN) that has 2 hyperparameters that I am going to tune:  
- The **k parameter** : it is the highest number of similar users we want the algorithm to consider  
- The **similarity option parameter** : it is used to specify the method of similarity calculation and it has a dictionnary format with the following keys:  
    1. `user_based`:  it is used to specify whether the similarity will be calculated based on items or users, in this case I will put ``TRUE``
    2. `name`: it is used to specify the type of similarity formula to be applied (**cosine**, **MSD**, **Pearson**)

The same algorithm will be applied for **item-based** collaborative filtering, that is instead of focusing on users, it will be used to focus on what products are more similar to what the user *i* already likes.  
The only small change that is going to occur is to change the `user_based` from ``TRUE`` to `FALSE`. 

In [9]:
# User model hyperparameter tuning 
param_grid = {'k': [10,15, 20, 25, 30, 40, 50],
              'sim_options': {'name': ["pearson",'cosine'],
                              'user_based': [True]}
              }
knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5)
knnbasic_gs.fit(data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [64]:
# display the best parameter 
print(knnbasic_gs.best_params)

{'rmse': {'k': 30, 'sim_options': {'name': 'pearson', 'user_based': True}}}


In [65]:
# display the best parameters rmse 

print(knnbasic_gs.best_score)

{'rmse': 1.1617168955079455}


In [44]:
pd.DataFrame(knnbasic_gs.best_score.values())[0]

0    1.161717
Name: 0, dtype: float64

In [None]:
# item model hyperparameter tuning 
param_grid = {'k': [5,10,15,20,25],
              'sim_options': {'name': ["pearson", 'cosine'],
                              'user_based': [False]}
              }
knnbasic_gs_i = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5)
knnbasic_gs_i.fit(data)

In [66]:
# display the best parameter 
print(knnbasic_gs_i.best_params)

{'rmse': {'k': 10, 'sim_options': {'name': 'pearson', 'user_based': False}}}


In [67]:
# display the best parameters rmse 
print(knnbasic_gs_i.best_score)

{'rmse': 1.1824543498117397}


#### Model based models

In this part of the project, I am going to use another subgroup of collaborative filtering models which is **model-based**. Unlike **memory-based** models, These ones use machine learning algorithms.  
In the upcoming steps, I am going to concentrate on the `SVD`, `ALS` and `CoClustering` methods.

**Singular value decomposition (SVD)** is a matrix factorisation technique, which reduces the number of features of a dataset by reducing the space dimension from N-dimension to K-dimension (where K< N). In our context, it aims to provide users with Amazon products’ recommendation from the latent features of item-user matrices. The code would show you how to use the SVD latent factor model for matrix factorization. 
The hyperparameters I considered for this method are:  

1. `n_factors`: this parameter determines how many latent factors the model will try to find.  
2. `n_epochs`: this parameter determines how many times the gradient descent calculations are repeated.  
3. `lr_all`: it is the learning rate factor for all of the parameters. These are the step sizes the model will use to minimise the cost function.  
4. `reg_all`: it is regularisation factor for all of the parameters.  
5. `biased`: this parameter determines whether to choose biased or unbiased version of the algorithm.


In [10]:
train[["userID","asin","overall"]].shape

(161753, 3)

In [60]:
# SVD model hyperparameter tuning
param_grid = {'n_factors':[300],'n_epochs': [150], 'lr_all':[0.01],'biased':[True],
              'reg_all': [0.02]}
svd_gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=200,refit=True)
svd_gs.fit(data)

In [61]:
# display the best parameter 
print(svd_gs.best_params)

{'rmse': {'n_factors': 300, 'n_epochs': 150, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.02}}


In [24]:
# display the best parameters rmse {'rmse': {'n_factors': 300, 'n_epochs': 100, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.02}}
print(svd_gs.best_score)

{'rmse': 1.0495245536181195}


In [19]:
# display the best parameters rmse {'rmse': {'n_factors': 250, 'n_epochs': 70, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.02}}
print(svd_gs.best_score)

{'rmse': 1.053688261160976}


In [57]:
# display the best parameters rmse {'rmse': {'n_factors': 300, 'n_epochs': 150, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.2}}
print(svd_gs.best_score)

{'rmse': 1.0449171980163638}


In [53]:
# display the best parameters rmse {'rmse': {'n_factors': 150, 'n_epochs': 150, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.01}}
print(svd_gs.best_score)

{'rmse': 1.0546841214938782}


In [62]:
# display the best parameters rmse{'rmse': {'n_factors': 300, 'n_epochs': 150, 'lr_all': 0.01, 'biased': True, 'reg_all': 0.02}}
print(svd_gs.best_score)

{'rmse': 1.043599570554015}


 **The alternating least squares (ALS)** is a matrix factorization algorithm that uses Alternating Least Squares with Weighted-Lamda-Regularization (ALS-WR). It factors the user to item **matrix A** into the user-to-feature **matrix U** and the item-to-feature **matrix M**. It runs the ALS algorithm in a parallel fashion, and tries to find optimal factor weights to minimize the least squares between predicted and actual ratings.  
 Since the ALS uses baslines in the minimization objective function, I am going to use the `BaselineOnly` method from Surpise library, and I am going to configure it using these parameters:  
 1. `reg_i`: The regularization parameter for products.  
 2. `reg_u`: The regularization parameter for users.
 3. `n_epochs`: The number of iteration of the ALS procedure. 

In [18]:
# ALS model hyperparameter tuning
param_grid = {'bsl_options': 
                {'reg_i':[5,10,15,20,25,30], 'reg_u':[5,10,15,20,25,30], 'n_epochs': [10,20,30,40,50,100]}}
als_gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=5)
als_gs.fit(data)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimati

In [70]:
# display the best parameter 
print(als_gs.best_params)

{'rmse': {'bsl_options': {'reg_i': 10, 'reg_u': 5, 'n_epochs': 40}}}


In [71]:
# display the best parameters rmse 
print(als_gs.best_score)

{'rmse': 1.0740294617942812, 1: {'rmse': 1.269223347945972}}


**Co-clustering** is a special case of clustering, where it is done simultaneously for the rows and columns of the matrix. It is basically a method of co-grouping users and items simultaneously based on similarity of their **pairwise interactions**.  In order to configure this model, I chose to tune the following parameters:  
1. `n_cltr_u`: it represents the number of user clusters  
2. `n_cltr_i`: it represents the number of products clusters  
3. `n_epochs`: it determines the number of iteration of the optimization loop.

In [21]:
# CoClustering model hyperparameter tuning
param_grid = {'n_cltr_u':[10,15,20,25,30], 'n_cltr_i':[10,15,20,25,30],'n_epochs': [10,20,30,40,50,100]}
clust_gs = GridSearchCV(CoClustering, param_grid, measures=['rmse'], cv=5)
clust_gs.fit(data)

In [72]:
# display the best parameter 
print(clust_gs.best_params)

{'rmse': {'n_cltr_u': 10, 'n_cltr_i': 10, 'n_epochs': 10}}


In [73]:
# display the best parameters rmse 
print(clust_gs.best_score)

{'rmse': 1.269223347945972}


In [93]:
#Benchmark different models
models = {"UB_30":knnbasic_gs, "IB_10":knnbasic_gs_i, "SVD_50":svd_gs,"Clust_10":clust_gs}
bench = pd.concat([pd.DataFrame.from_dict(mod.best_score,orient='index') for mod in models.values()], axis=1,)
bench.columns = list(models.keys())
bench

Unnamed: 0,UB_30,IB_10,SVD_50,Clust_10
rmse,1.161717,1.182454,1.06368,1.269223


In [63]:
test["overall"] = pd.DataFrame(svd_gs.test(df_test))["est"]

In [64]:
results = test[["ID","overall"]]
results.to_csv('sample_sumbission_6.csv', index=False)

In [31]:
train=train[["userID","asin","overall"]]

In [32]:
meta= meta.drop_duplicates(subset=['asin'])
meta= meta[["asin",'description']]

In [33]:
# Tokenize, case conversion & only alphabetic
tokens = meta['description'].apply(lambda txt: [word.lower() for word in word_tokenize(str(txt)) if word.isalpha()])

In [34]:
# setup stop words list
stop_words = stopwords.words('english')
stop_words.append('nan')

stemmer = SnowballStemmer("english")

# remove stopwords
# stem
token_stem = tokens.apply(lambda lst_token: [stemmer.stem(tok) for tok in lst_token if tok not in stop_words and len(tok) > 2])

In [35]:
# TFIDF vectorizer
tfidf = TfidfVectorizer(min_df=5)

# apply tf-idf vectorizer -> document-term-matrix in sparse format
dtm = tfidf.fit_transform([" ".join(x) for x in token_stem])


df_dtm = pd.DataFrame(dtm.toarray(), columns=tfidf.get_feature_names_out(), index=meta["asin"])
df_dtm.shape

(2307, 2874)

In [100]:
train = train[train["asin"].isin (df_dtm.index.values)]

In [36]:
reader = Reader(rating_scale = (1,5))
dataset = Dataset.load_from_df(train, reader)

In [46]:
cb = ContentBased(NN=50)

In [None]:
kf = KFold(n_splits=100)
rmse=[]
for trainset, testset in kf.split(dataset):

    # train and test algorithm.
    cb.fit(df_dtm)
    cb.fit_ratings(trainset)
    predictions = cb.test(testset)

    # Compute and print Root Mean Squared Error
    rmse.append(accuracy.rmse(predictions, verbose=True))

In [49]:
pd.DataFrame(rmse,columns=["CB"]).mean()

CB    1.129889
dtype: float64

In [43]:
test["overall"] = pd.DataFrame(cb.test(df_test))["est"]

In [44]:
results = test[["ID","overall"]]
results.to_csv('.Data/sample_sumbission_4.csv', index=False)