# MANTILLA Omar Augusto - INDIVIDUAL PROJECT

In [81]:
# Loading the necessary Libraries.
import pandas as pd 
import numpy as np 
import surprise
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from numpy.linalg                import norm
from surprise                    import accuracy
from surprise                    import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, BaselineOnly
from surprise                    import CoClustering
from sklearn.model_selection     import train_test_split
from surprise                    import Dataset, Reader, KNNBasic, accuracy, KNNWithMeans, KNNWithZScore, KNNBaseline
from IESEGRecSys                 import eval
from sklearn.metrics             import ndcg_score, roc_auc_score, roc_curve
from surprise.model_selection    import GridSearchCV
from sklearn.experimental        import enable_halving_search_cv
from sklearn.model_selection     import HalvingGridSearchCV
from sklearn.neighbors           import NearestNeighbors
from surprise.model_selection    import cross_validate
from plotly.offline              import init_notebook_mode, plot, iplot
from surprise.prediction_algorithms.co_clustering import CoClustering
import seaborn as sns
from IESEGRecSys import eval
from IESEGRecSys.model import ContentBased
init_notebook_mode(connected=True)

# NLP packages
import nltk # pip install nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import make_pipeline


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omantilla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\omantilla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## READING THE DATA

In [46]:
# Metadata
meta = pd.read_csv("../Data/metadata.csv")

# Train
train = pd.read_csv("../Data/train.csv")

# Test
test = pd.read_csv("../Data/test_students.csv")

## DATA EXPLORATION

### Meta Dataset.

In [47]:
# Checking the Meta Dataset. This is just to check how the dataset looks like.
meta.head()

Unnamed: 0,asin,category,description,title,image,feature,main_cat,price
0,1612231977,"['Pet Supplies', 'Dogs', 'Health Supplies', 'H...",['Dr. Rexy hemp oil has powerful anti-inflamma...,DR.REXY Hemp Oil for Dogs and Cats - 100% Orga...,['https://images-na.ssl-images-amazon.com/imag...,['Made strictly with organic derived ingredien...,Amazon Home,$19.90
1,6162622851,"['Pet Supplies', 'Dogs', 'Flea & Tick Control'...",['Kills and repels fleas and ticks for 8 conti...,Bayer Seresto Flea and Tick Collar for Dogs,['https://images-na.ssl-images-amazon.com/imag...,['Veterinarian-recommended ea and tick prevent...,Pet Supplies,$37.99
2,B00000IRNW,"['Pet Supplies', 'Dogs', 'Toys', 'Balls']",['100 Rokenbok balls. 50 large blue balls and ...,Rokenbok ROK Balls,[],"['Modular- interacts with all Rokenbok', 'Roke...",Toys & Games,
3,B00004T2WR,"['Pet Supplies', 'Dogs', 'Collars, Harnesses &...",['The Get Up \'n Go Discovery Center from Play...,Exclusive Playskool Electronic Activity Table,['https://images-na.ssl-images-amazon.com/imag...,"[""INTELLIGENT ANTI-INJURY CHIP: We always put ...",Pet Supplies,
4,B00005MF9U,"['Pet Supplies', 'Cats', 'Litter &amp; Housebr...",['LitterMaid LM900 self-cleaning cat litter bo...,LitterMaid LM900 Mega Self-Cleaning Litter Box,['https://images-na.ssl-images-amazon.com/imag...,['Automatically rakes waste into sealable cont...,Pet Supplies,


In [48]:
# Checking meta data shape to see the dataset dimensions.
meta.shape

(2577, 8)

In [49]:
# Check meta columns names.
meta.columns

Index(['asin', 'category', 'description', 'title', 'image', 'feature',
       'main_cat', 'price'],
      dtype='object')

### Train Dataset.

In [50]:
# Checking the train dataset. This is just to check how the dataset looks like.
train.head()

Unnamed: 0,userID,overall,asin,vote,reviewText,summary,style,image
0,13527,5.0,B0002565TI,,"These filters used to be sold at PetCo, but no...",Great Place to Get Filte-rs,,
1,14608,2.0,B0002H3ZLM,,Did not work for my large- does. Returned it.,T#wo Stars,"{'Size:': ' LARGE 60-130 LBS.', 'Color:': ' BL...",
2,15536,5.0,B0009YD8OC,,I was pretty skeptical that this would be easy...,stops pulling in a 6 month 60{ pound pup great!,,
3,12868,5.0,B001VPA9OK,,Works great for groom-ing my dog. A must have.,Five /Stars,{'Color:': ' Silver'},
4,181,5.0,B000K67UF2,,Great cage for budgies! I cant say enough marv...,Great cage for budg{ies,"{'Size:': ' Medium', 'Pattern:': ' MO2 Cage'}",


In [51]:
# Checking train data shape to see the dataset dimensions.
train.shape

(161753, 8)

Train data set is bigger than test set

In [52]:
# Check train columns names.
train.columns

Index(['userID', 'overall', 'asin', 'vote', 'reviewText', 'summary', 'style',
       'image'],
      dtype='object')

In [53]:
# Chech the summary statistics of the train overall.
train[['overall']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
overall,161753.0,4.325972,1.143786,1.0,4.0,5.0,5.0,5.0


In [54]:
# Check unique users, overall scores and items of the train set
print("Total data ")
print("#"*100)
print("\nTotal No of ratings :",train.shape[0])
print("Total No of Users   :", len(np.unique(train.userID)))
print("Total No of products  :", len(np.unique(train.asin)))


Total data 
####################################################################################################

Total No of ratings : 161753
Total No of Users   : 22181
Total No of products  : 2310


## Check the rating distribution in the train dataset

In [55]:
# Get data
data = train['overall'].value_counts().sort_index(ascending=False)

# Create trace
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / train.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               marker = dict(color = '#db0000'))

# Create layout
layout = dict(title = 'Distribution Of Train Item Ranking'.format(train.shape[0]),
              xaxis = dict(title = 'Overall'),
              yaxis = dict(title = 'Count'))
              
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

More than 80% of the reviews of the training set are above 4 in the overall score

In [56]:
# Check missing values of the train set.
print('Number of missing values across columns: \n',train.isnull().sum())

Number of missing values across columns: 
 userID             0
overall            0
asin               0
vote          145992
reviewText         2
summary            1
style          29641
image         157207
dtype: int64


According to this a for the sake of the model fitting the variables "vote", "reviewText", "summary", "style" and "image" should be removed.



In [57]:
# Selecting Columns.
f_col =  ['userID', 'asin','overall']
train = train[f_col]

In [58]:
# Checking train set again.
train.head()

Unnamed: 0,userID,asin,overall
0,13527,B0002565TI,5.0
1,14608,B0002H3ZLM,2.0
2,15536,B0009YD8OC,5.0
3,12868,B001VPA9OK,5.0
4,181,B000K67UF2,5.0


### Test Dataset.

In [59]:
# Checking the test dataset how it looks like
test.head()

Unnamed: 0,ID,userID,asin
0,21069B00BFK2B24,21069,B00BFK2B24
1,3506B00ZK0Y7R2,3506,B00ZK0Y7R2
2,21907B0002AQPA2,21907,B0002AQPA2
3,14092B0002DHXX2,14092,B0002DHXX2
4,3085B0006VB3SQ,3085,B0006VB3SQ


In [60]:
# Check the shape of the test dataset to see its dimensions.
test.shape

(76043, 3)

Test set is smaller that train set

In [61]:
# Check test column names.
test.columns

Index(['ID', 'userID', 'asin'], dtype='object')

In [62]:
# Check missing values of the test set
print('Number of missing values across columns: \n',test.isnull().sum())

Number of missing values across columns: 
 ID        0
userID    0
asin      0
dtype: int64


In [18]:
# Check unique users, overall scores and items of the test set
print("Total data ")
print("#"*100)
print("\nTotal No of ratings :",test.shape[0])
print("Total No of Users   :", len(np.unique(test.userID)))
print("Total No of products  :", len(np.unique(test.asin)))

Total data 
####################################################################################################

Total no of ratings : 76043
Total No of Users   : 22181
Total No of products  : 2310


## Benchmarking The Algorithmns

In [19]:
# Set the reader with the rating min and max
reader = Reader(rating_scale=(1, 5))

# Surprise train and test set
df_train = Dataset.load_from_df(train, reader)
df_test = list(test.itertuples(index=False, name=None))

Cross Validation of the NMF(), SlopeOne(), SVDpp(), SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering() algorithms

Code Reference: https://gist.github.com/susanli2016/e0cdcf1bca69a2b144fd8c04f30b522f

In [20]:
benchmark = []

# Iterate over all algorithms

algorithms = [NMF(), SlopeOne(), SVDpp(), SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]

print ("Attempting: ", str(algorithms), '\n\n\n')

for algorithm in algorithms:
    print("Starting: " ,str(algorithm))
    results2 = cross_validate(algorithm, df_train, measures=['RMSE'], cv=5, verbose=False)  # Perform cross validation    
    tmp = pd.DataFrame.from_dict(results2).mean(axis=0) # Get results & append algorithm name
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    print("Done: " ,str(algorithm), "\n\n")

print ('\n\tDONE\n')

Attempting:  [<surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fb8ec9a5550>, <surprise.prediction_algorithms.slope_one.SlopeOne object at 0x7fb8ec9a5670>, <surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7fb8ec9a5610>, <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fb8ec9a5640>, <surprise.prediction_algorithms.knns.KNNBaseline object at 0x7fb8ec9a56a0>, <surprise.prediction_algorithms.knns.KNNBasic object at 0x7fb8ec9a5700>, <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fb8ec9a5760>, <surprise.prediction_algorithms.knns.KNNWithZScore object at 0x7fb8ec9a57c0>, <surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x7fb8ec9a5820>, <surprise.prediction_algorithms.co_clustering.CoClustering object at 0x7fb8ec9a5880>] 



Starting:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fb8ec9a5550>
Done:  <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x7fb8

In [21]:
# Display the results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,1.074211,16.111819,0.420801
SVD,1.077433,6.263074,0.181698
BaselineOnly,1.081935,0.24426,0.168112
KNNBaseline,1.162602,29.03362,3.689657
CoClustering,1.164438,2.732519,0.175031
KNNWithMeans,1.187351,29.623568,3.709912
KNNWithZScore,1.196412,30.637548,3.964926
KNNBasic,1.21135,29.33333,3.526045
NMF,1.225557,6.739717,0.140204
SlopeOne,1.259075,0.352027,0.28828


The algorithms with the lowest RMSE where SVD++, SVD and BaselineOnly. I will be using those algorithms to create my predictions.

## Model Evaluation

To perform the model evaluation I wont use the test dataset already provided because during the evaluation process is done based on the variable "r_ui" after the prediction process. This variable is a mix of integers and strings data types.

Beacouse of this I will do the split process with the train data set already provided.

In [63]:
# Selecting Columns
d_col =  ['userID', 'asin','overall']
data = train[d_col]

In [64]:
# Create our train and test set for our model predictions and Evaluations.

# Reader rating scale setup 1 for lowest and 5 for highest
datas = Reader(rating_scale=(1, 5))
data_sup=Dataset.load_from_df(data,datas)

# Split the data in train 80% and test 20%
train_df,test_df=train_test_split(data,test_size=0.2,random_state=613)
train_df.reset_index(drop=True,inplace=True)
test_df.reset_index(drop=True,inplace=True)
datas = Reader(rating_scale=(1, 5))
train2=Dataset.load_from_df(train_df,datas).build_full_trainset()
test2=list(test_df.itertuples(index=False,name=None))

Now lets apply the train2 and test2 to our predictions

### SVD++

In [65]:
# SVD++ Model Prediction
svdpp = SVDpp()
svdpp.fit(train2)
prediction_svdpp=svdpp.test(test2)

### SVD

In [66]:
# SVD Model Prediction
svd = SVD()
svd.fit(train2)
prediction_svd=svd.test(test2)

### BaselineOnly

In [67]:
# BaselineOnly Model Prediction
baselineonly = BaselineOnly()
baselineonly.fit(train2)
prediction_baselineonly=baselineonly.test(test2)

Estimating biases using als...


## Prediction Evaluation

In [68]:
# SVD++ Prediction Evaluation
eval_svdpp=eval.evaluate(prediction=prediction_svdpp,topn=15,rating_cutoff=0.70,excl_impossible=True)

# SVD Prediction Evaluation
eval_svd=eval.evaluate(prediction=prediction_svd,topn=15,rating_cutoff=0.70,excl_impossible=True)

# BaselineOnly Prediction Evaluation
eval_baselineonly=eval.evaluate(prediction=prediction_baselineonly,topn=15,rating_cutoff=0.70,excl_impossible=True)


Excluded 0 (32351) samples. 32351 remaining ...
Excluded 0 (32351) samples. 32351 remaining ...
Excluded 0 (32351) samples. 32351 remaining ...
Excluded 0 (32351) samples. 32351 remaining ...
Excluded 0 (32351) samples. 32351 remaining ...
Excluded 0 (32351) samples. 32351 remaining ...


In [69]:
# Putting all togheter in the same data frame 
all_evals=pd.DataFrame(eval_svdpp)
all_evals['SVD++']=eval_svdpp['value']
all_evals['SVD']=eval_svd['value']
all_evals['BaselineOnly']=eval_baselineonly['value']
all_evals=all_evals.drop(['value'],axis=1)
all_evals

Unnamed: 0,SVD++,SVD,BaselineOnly
RMSE,1.078716,1.079869,1.08459
MAE,0.787148,0.799613,0.81828
Recall,1.0,1.0,1.0
Precision,1.0,1.0,1.0
F1,1.0,1.0,1.0
NDCG@15,0.911032,0.909081,0.909148


Evaluation Analisys.

* RMSE: This is a measure of the error of the models where alwayes we should use the model with the lowest values. In this case we see that the model with the highest error is BaselineOnly and SVD++ and SVD have lower values.
Something to consider is that during the process of this project the lowest value was switching sides between SVD++ and SVD.
* MAE: This measure refers to how the prediction of an observation is similar to real value of the observation. In this case we can se that BaseLineOnly has the major value.
* RECALL: This measure refers to the identification to the true positives. In our case all the three models have the same value, which mean that all three models identify 100% the true positives.
* PRECISION: This measure reflects the quality of the positive prediction of the the three models. In this case all of them have the same value.
* F1: This measure is the weighted average of Precision and Recall. In our case all three models have the same score.


Now lets use the train set provided to the create the predictions for the Kaggle competition

### 1) Singular Vector Decomposition (SVD)

The SVD is a matrix factorisation technique, which reduces the number of features of a dataset by reducing the space dimension from N-dimension to K-dimension (where K<N). In this technique each row represents a user, and each column represents an item. The elements of this matrix are the ratings that are given to items by users.

This methods generalizes the eigendencomposition of a square matrix (n x n) to any matrix (n x m).

Now lets prepare the data for the Kaggle competition 1st submittion

In [70]:
# Surprise training and test set
df_train1 = Dataset.load_from_df(train, reader).build_full_trainset()

# Here I am renaming the test dataset to make it consistent with the file example to submit it Kaggle
test_s=pd.DataFrame()
test_s['uid']=test['userID']
test_s['iid']=test['asin']
test_s['r_ui']=test['ID']
df_test = list(test_s.itertuples(index=False, name=None))

#### PREDICTION SVD

In [71]:
svd=SVD()

# SVD train data frame fit
svd.fit(df_train1)

# Now 
prediction_1=svd.test(df_test)

In [72]:
# Visualization of the prediction_1 as data frame
Prediction_1=pd.DataFrame(prediction_1)
Prediction_1

Unnamed: 0,uid,iid,r_ui,est,details
0,21069,B00BFK2B24,21069B00BFK2B24,3.453793,{'was_impossible': False}
1,3506,B00ZK0Y7R2,3506B00ZK0Y7R2,3.406495,{'was_impossible': False}
2,21907,B0002AQPA2,21907B0002AQPA2,4.147731,{'was_impossible': False}
3,14092,B0002DHXX2,14092B0002DHXX2,4.646227,{'was_impossible': False}
4,3085,B0006VB3SQ,3085B0006VB3SQ,3.976079,{'was_impossible': False}
...,...,...,...,...,...
76038,9343,B004GFN2ZA,9343B004GFN2ZA,4.283108,{'was_impossible': False}
76039,17932,B000JZOQO2,17932B000JZOQO2,4.239743,{'was_impossible': False}
76040,14272,B005440HLO,14272B005440HLO,3.920995,{'was_impossible': False}
76041,11151,B0002VAZSY,11151B0002VAZSY,4.378849,{'was_impossible': False}


In [None]:
### Exporting to CSV file for Kaggle first submition.
final_1=pd.DataFrame()
final_1['ID']=Prediction_1['r_ui']
final_1['overall']=Prediction_1['est']
final_1.to_csv('Omar_1.csv',index=False)

### 2)  Singular Vector Decomposition plus plus (SVDpp)

The SVDpp algorithm is an extension of SVD that takes into account implicit ratings

Li, S., 2022. Building and Testing Recommender Systems With Surprise, Step-By-Step. [online] Medium. Available at: <https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b> [Accessed 6 March 2022].

#### Prediction SVDpp()

In [73]:
svdpp=SVDpp()

# SVDpp train data frame fit
svdpp.fit(df_train1)

# Now
prediction_2=svdpp.test(df_test)

In [74]:
# Visualization of the prediction_2 as data frame
Prediction_2=pd.DataFrame(prediction_2)
Prediction_2

Unnamed: 0,uid,iid,r_ui,est,details
0,21069,B00BFK2B24,21069B00BFK2B24,3.310369,{'was_impossible': False}
1,3506,B00ZK0Y7R2,3506B00ZK0Y7R2,3.409923,{'was_impossible': False}
2,21907,B0002AQPA2,21907B0002AQPA2,3.943403,{'was_impossible': False}
3,14092,B0002DHXX2,14092B0002DHXX2,4.938923,{'was_impossible': False}
4,3085,B0006VB3SQ,3085B0006VB3SQ,4.219306,{'was_impossible': False}
...,...,...,...,...,...
76038,9343,B004GFN2ZA,9343B004GFN2ZA,4.430889,{'was_impossible': False}
76039,17932,B000JZOQO2,17932B000JZOQO2,4.500984,{'was_impossible': False}
76040,14272,B005440HLO,14272B005440HLO,3.936457,{'was_impossible': False}
76041,11151,B0002VAZSY,11151B0002VAZSY,3.961824,{'was_impossible': False}


In [None]:
# Exporting to CSV file for Kaggle second submition
final_2=pd.DataFrame()
final_2['ID']=Prediction_2['r_ui']
final_2['overall']=Prediction_2['est']
final_2.to_csv('Omar_2.csv',index=False)

### 3) BaselineOnly

This algorithms is used to predict the baseline estimate for given user and item. This algorithms could be used to compare how well perform the other two algorithms.

Li, S., 2022. Building and Testing Recommender Systems With Surprise, Step-By-Step. [online] Medium. Available at: <https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b> [Accessed 6 March 2022].

#### Prediction BaselineOnly()

In [75]:
BaselineOnly=BaselineOnly()

# BaselineOnly train data frame fit
BaselineOnly.fit(df_train1)

# Now
prediction_3=BaselineOnly.test(df_test)

Estimating biases using als...


In [76]:
# Visualization of the prediction_3 as data frame
Prediction_3=pd.DataFrame(prediction_3)
Prediction_3

Unnamed: 0,uid,iid,r_ui,est,details
0,21069,B00BFK2B24,21069B00BFK2B24,3.631746,{'was_impossible': False}
1,3506,B00ZK0Y7R2,3506B00ZK0Y7R2,3.690602,{'was_impossible': False}
2,21907,B0002AQPA2,21907B0002AQPA2,4.182187,{'was_impossible': False}
3,14092,B0002DHXX2,14092B0002DHXX2,4.429364,{'was_impossible': False}
4,3085,B0006VB3SQ,3085B0006VB3SQ,4.305398,{'was_impossible': False}
...,...,...,...,...,...
76038,9343,B004GFN2ZA,9343B004GFN2ZA,4.344516,{'was_impossible': False}
76039,17932,B000JZOQO2,17932B000JZOQO2,4.278848,{'was_impossible': False}
76040,14272,B005440HLO,14272B005440HLO,3.912084,{'was_impossible': False}
76041,11151,B0002VAZSY,11151B0002VAZSY,4.201135,{'was_impossible': False}


In [None]:
# Exporting to CSV file for Kaggle third submition
final_3=pd.DataFrame()
final_3['ID']=Prediction_3['r_ui']
final_3['overall']=Prediction_3['est']
final_3.to_csv('Omar_3.csv',index=False)

## Cross-Validation Algorithms

### Tunning Hyperparameters for SVD()
Function GridSearchCV()

In this step I am setting the parameters combination so GridSearch() can do an exhaustive try of combinations to determine and provide the best parametter for SVD() which was the algorithms with the best evaluation measures.


Parameters:

* n_factors – The number of factors. Default is 100.
* n_epochs – The number of iteration of the SGD procedure. Default is 20.
* biased (bool) – Whether to use baselines (or biases). See note above. Default is True.
* init_mean – The mean of the normal distribution for factor vectors initialization. Default is 0.
* init_std_dev – The standard deviation of the normal distribution for factor vectors initialization. Default is 0.1.
* lr_all – The learning rate for all parameters. Default is 0.005.
* reg_all – The regularization term for all parameters. Default is 0.02.
* lr_bu – The learning rate for 𝑏𝑢. Takes precedence over lr_all if set. Default is None.
* lr_bi – The learning rate for 𝑏𝑖. Takes precedence over lr_all if set. Default is None.
* lr_pu – The learning rate for 𝑝𝑢. Takes precedence over lr_all if set. Default is None.
* lr_qi – The learning rate for 𝑞𝑖. Takes precedence over lr_all if set. Default is None.
* reg_bu – The regularization term for 𝑏𝑢. Takes precedence over reg_all if set. Default is None.
* reg_bi – The regularization term for 𝑏𝑖. Takes precedence over reg_all if set. Default is None.
* reg_pu – The regularization term for 𝑝𝑢. Takes precedence over reg_all if set. Default is None.
* reg_qi – The regularization term for 𝑞𝑖. Takes precedence over reg_all if set. Default is None.
* random_state (int, RandomState instance from numpy, or None) – Determines the RNG that will be used for initialization. If int, random_state will be used as a seed for a new RNG. This is useful to get the same initialization over multiple calls to fit(). If RandomState instance, this same instance is used as RNG. If None, the current RNG from numpy is used. Default is None.
* verbose – If True, prints the current epoch. Default is False.

In my case I am not mentioning/typing in the code the parameters that are None by default (Not need to type) and only playing with n_factors, n_epochs, lr_all and reg_all specifically.

In [77]:
# Preparing my data.
data = Dataset.load_from_df(train, reader)

In [88]:
param_grid = {'n_factors':[5,10,15,20,50,100,150],'n_epochs':[5,10,15,20,50,100,150],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)


In [None]:
# Fit the GridSearch() best parameters to the data
gs.fit(data)
params = gs.best_params['rmse']

In [37]:
# These are the parameters suggested after GridSearchCV() which were used for the svdtuned prediction
print(params)

{'n_factors': 150, 'n_epochs': 150, 'lr_all': 0.01, 'reg_all': 0.1}


This are the best parameters done during the Cross-Validation for SVD which is my best score in Kaggle.
* n_factors: 150
* n_epochs: 150
* lr_all: 0.01
* reg_all: 0.1

In [36]:
# Apply the new best parameters to the SVP() Algorithm.
svdtuned = SVD(n_factors=params['n_factors'], n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])
svdtuned.fit(df_train1)
prediction_4=svdtuned.test(df_test)
prediction_4=pd.DataFrame(prediction_4)
prediction_4


Unnamed: 0,uid,iid,r_ui,est,details
0,21069,B00BFK2B24,21069B00BFK2B24,2.636915,{'was_impossible': False}
1,3506,B00ZK0Y7R2,3506B00ZK0Y7R2,3.912990,{'was_impossible': False}
2,21907,B0002AQPA2,21907B0002AQPA2,3.470442,{'was_impossible': False}
3,14092,B0002DHXX2,14092B0002DHXX2,4.585183,{'was_impossible': False}
4,3085,B0006VB3SQ,3085B0006VB3SQ,4.547547,{'was_impossible': False}
...,...,...,...,...,...
76038,9343,B004GFN2ZA,9343B004GFN2ZA,4.396721,{'was_impossible': False}
76039,17932,B000JZOQO2,17932B000JZOQO2,4.868898,{'was_impossible': False}
76040,14272,B005440HLO,14272B005440HLO,4.298381,{'was_impossible': False}
76041,11151,B0002VAZSY,11151B0002VAZSY,4.396025,{'was_impossible': False}


In [38]:
# Exporting for Kaggle Submition
final_4=pd.DataFrame()
final_4['ID']=prediction_4['r_ui']
final_4['overall']=prediction_4['est']
final_4.to_csv('Omar_4.csv',index=False)

## Conclutions:

* We can see that the RMSE values can variate between SDV and SDVpp, which in my case I toke SDV which was in the majority of the time the lowest value.
* Across all the algorithms tested, SVD and SVD++ were the ones with the best perfomance. This could be related to the factorization technique which reduces the number of features. 
* The the cross-validation through GridSearchCV is a process that takes time, depending of the quatinty of parameters to evaluate.
* The test dataset provide has a value called UID which was a mix of integers and string data values. This could not be used straight to the eval procedure due the RMSE and MAE computations are done based on the r_ui which in our case was the UID (after prediction process).

## References:

https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

https://towardsdatascience.com/svd-where-model-tuning-goes-wrong-61c269402919

https://buomsoo-kim.github.io/recommender%20systems/2020/10/22/Recommender-systems-collab-filtering-14.md/

https://pypi.org/project/scikit-surprise/

https://analyticsindiamag.com/singular-value-decomposition-svd-application-recommender-system/

https://towardsdatascience.com/simple-svd-algorithms-13291ad2eef2

https://scholarworks.calstate.edu/downloads/1n79h8686

https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/#:~:text=F1%20score%20%2D%20F1%20Score%20is,have%20an%20uneven%20class%20distribution.

https://surprise.readthedocs.io/en/stable/matrix_factorization.html