# Supervised user based collaborative filtering

## Import Libraries

In [1]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# visualization
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

#Preparing folder variables

main_folder = os.path.abspath(os.path.join(os.pardir))
data_folder = (main_folder + "/" +"data")
saved_models_folder = (data_folder + "/" + "saved_models")
raw_data = (data_folder + "/" + "_raw")
processed_data = (data_folder + "/" + "processed")



## Loading and cleaning data

In [2]:
# CSV file called "anime.csv" from a directory called raw_data and returns the contents as a Pandas DataFrame
anime = pd.read_csv(raw_data + "/" + "anime.csv") 

# CSV file called "rating.csv.zip" from a directory called raw_data and returns the contents as a Pandas DataFrame
rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

In [3]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
rating.shape

(7813737, 3)

In [5]:
rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [None]:
print(supervised_rating_cleaning.__doc__)

Steps of supervised_rating_cleaning:
- selects only those rows from the 'rating' DataFrame where the value of the 'rating' column is greater than 0. The resulting DataFrame is assigned to the variable 'ratingdf'.
- resets the index of the 'ratingdf' DataFrame. This means that the current index is replaced with a sequential index starting from 0, and a new column called 'index' is added to the DataFrame to store the old index values.
- drops the 'index' column from the 'ratingdf' DataFrame. The 'axis=1' argument specifies that the column should be dropped, and 'inplace=True' means that the changes should be made to the DataFrame in place (i.e., the DataFrame is modified directly rather than creating a copy).
- returns the modified 'ratingdf' DataFrame as the output of the function.

In [6]:
# Cleaning the data
ratingdf = supervised_rating_cleaning(rating)

In [7]:
ratingdf.shape

(6337241, 3)

## Preparing the data for baseline

In here we are goin to prepare de data to be used in the model baseline.

To do a baseline we will use a smaller dataset and then use the whole dataset in the selected mode. Because using a smaller dataset for prototyping and testing can be a good way to quickly iterate and experiment with different algorithms and hyperparameters before scaling up to the full dataset.

In [48]:
def prepare_for_different_models():
    '''
    The code reads two CSV files (anime.csv and rating.csv.zip) and loads them into dataframes. 
    Then it creates a subset of the rating dataframe containing only rows where the rating is 
    greater than 0 and removes the index column. Next, it samples a subset of the data with 
    a specified size, grouped by the rating column.
    '''
    # Load 'anime.csv' file into a pandas DataFrame object called 'anime'
    anime = pd.read_csv(raw_data + "/" + "anime.csv")

    # Load 'rating.csv.zip' file into a pandas DataFrame object called 'rating'
    rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

    # Create a new DataFrame 'anime_mapping' that is a copy of the 'anime' DataFrame and remove the 'episodes', 'members', and 'rating' columns
    anime_mapping = anime.copy()
    anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

    # Filter out all ratings less than or equal to 0 and reset the index of the DataFrame
    ratingdf = rating[rating.rating>0]
    ratingdf = ratingdf.reset_index()

    # Drop the 'index' column and update the DataFrame in-place
    ratingdf.drop('index', axis=1, inplace=True)

    # Get the shape of the DataFrame 'ratingdf'
    ratingdf.shape

    # Set the size to 100,000 and sample from the 'ratingdf' DataFrame based on the proportion of ratings for each score
    size = 100000

    # This will make sure that the sampled data has roughly the same proportion of ratings for each score as the original data
    ratingdf_sample = ratingdf.groupby("rating", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(ratingdf))))).sample(frac=1).reset_index(drop=True)

    # Create a new 'Reader' object with the rating scale set to a range between 1 and 10
    reader = Reader(rating_scale=(1,10))

    # Load the sampled data into a 'Dataset' object using the 'load_from_df' method and the 'reader' object
    data = Dataset.load_from_df(ratingdf_sample[['user_id', 'anime_id', 'rating']], reader)

    # Saving the table to pickle
    joblib.dump(data,processed_data + "/" + "data_reader_for_different_models.pkl")

    return data
data = prepare_for_different_models()

## Model Baseline

The next function that performs cross-validation for several collaborative filtering algorithms using the Surprise library and returns the results in a pandas DataFrame. The function takes a Surprise dataset object as input and outputs two DataFrames, one with the results for each individual algorithm and another with the results for all algorithms.

The algorithms used in this function are SVD, SVD++, SlopeOne, NMF, NormalPredictor, BaselineOnly, and CoClustering. For each algorithm, the function performs 5-fold cross-validation and computes the RMSE, MSE, MAE, and FCP metrics. The results are then stored in a DataFrame and saved to a file.

Note that the function only runs the SVD algorithm by default. To run other algorithms, you need to uncomment the relevant lines in the for loop. Also, the function assumes that the saved_models_folder variable has been defined elsewhere in the code.

Overall, this function provides a basic implementation of collaborative filtering algorithms using the Surprise library and can be used as a starting point for building more sophisticated recommendation systems

In [51]:
from surprise.model_selection import KFold
'''
Function that runs several collaborative filtering algorithms on an input dataset using cross-validation 
and computes several evaluation metrics. The function loops through a list of algorithms, 
runs cross-validation with each algorithm, computes the mean evaluation results across all folds, 
and appends the results to an overall list of results. It then saves the evaluation results 
of each algorithm in a Parquet file and saves the overall evaluation results of all algorithms 
in another Parquet file. The function returns the overall evaluation results.
'''
def baseline_all(data):
    
    # create an empty list to hold the benchmark results for all algorithms
    benchmark = []

    # instantiate the collaborative filtering algorithms we want to evaluate
    svd = SVD()
    svdp = SVDpp()
    slpo = SlopeOne()
    nm  = NMF()
    nmlp = NormalPredictor()
    baseonly = BaselineOnly()
    coclus = CoClustering()

    # loop through each algorithm and evaluate it using 5-fold cross-validation
    for algorithm in [svd,svdp,slpo,nm,nmlp,baseonly,coclus]:

        # create an empty list to hold the benchmark results for this algorithm
        benchmark_inndividual = []
        
        # print a message to indicate which algorithm is being evaluated
        print(algorithm,"started")

        folds = KFold(n_splits=5, shuffle=True, random_state=42)

        # run 5-fold cross-validation and compute RMSE, MSE, MAE, and FCP metrics
        results = cross_validate(algorithm, data, measures=['RMSE','MSE','MAE','FCP'], cv=folds, verbose=True)

        # print a message to indicate that the algorithm has finished evaluating
        print(algorithm,"finished")

        # calculate the mean of each metric over the 5 folds and store the results in a DataFrame
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)

        # extract the name of the algorithm from the object and append it to the DataFrame
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))

        # add the results for this algorithm to the list of individual benchmark results
        benchmark_inndividual.append(tmp)

        # add the results for this algorithm to the list of global benchmark results
        benchmark.append(tmp)

        # convert the list of results for this algorithm to a DataFrame and save it to a file
        dfscores_individual = pd.DataFrame(benchmark_inndividual).set_index('Algorithm').sort_values('test_rmse')
        write(saved_models_folder + "/" + name + "_results.parq", dfscores_individual)
    
    # convert the list of benchmark results for all algorithms to a DataFrame and save it to a file
    dfscores = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    write(saved_models_folder + "/" + "Others_Models_results.parq", dfscores)

    # return the DataFrame with the benchmark results for all algorithms
    return dfscores

In [52]:
baseline_all(data)

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001B9C9710760> started
Evaluating RMSE, MSE, MAE, FCP of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1336  1.1320  1.1339  1.1326  1.1328  1.1330  0.0007  
MSE (testset)     1.2850  1.2814  1.2857  1.2827  1.2832  1.2836  0.0016  
MAE (testset)     0.8447  0.8450  0.8448  0.8446  0.8446  0.8447  0.0001  
FCP (testset)     0.7373  0.7360  0.7372  0.7370  0.7381  0.7371  0.0007  
Fit time          132.24  123.20  125.49  134.83  130.43  129.24  4.29    
Test time         57.05   44.47   41.61   37.90   44.47   45.10   6.44    
<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001B9C9710760> finished
<surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001B9C97106D0> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Evaluating RMSE, MSE, MAE, FCP of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1905  1.1902  1.1933  1.1947  1.1906  1.1919  0.0018  
MSE (testset)     1.4173  1.4166  1.4239  1.4274  1.4176  1.4206  0.0043  
MAE (testset)     0.8855  0.8863  0.8873  0.8894  0.8855  0.8868  0.0014  
FCP (testset)     0.7319  0.7302  0.7301  0.7303  0.7314  0.7308  0.0007  
Fit time          3544.44 3473.66 3421.99 3368.28 7073.26 4176.33 1449.64 
Test time         847.75  880.16  797.56  877.29  883.70  857.29  32.50   
<surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x000001B9C97106D0> finished
<surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001B9C97107C0> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Evaluating RMSE, MSE, MAE, FCP of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1971  1.1951  1.1978  1.1960  1.1967  1.1965  0.0009  
MSE (testset)     1.4330  1.4283  1.4346  1.4303  1.4322  1.4317  0.0022  
MAE (testset)     0.9044  0.9031  0.9040  0.9037  0.9037  0.9038  0.0004  
FCP (testset)     0.7118  0.7116  0.7120  0.7120  0.7118  0.7118  0.0002  
Fit time          188.06  193.80  180.81  180.25  178.85  184.35  5.71    
Test time         491.60  476.12  462.69  494.10  473.05  479.51  11.79   
<surprise.prediction_algorithms.slope_one.SlopeOne object at 0x000001B9C97107C0> finished
<surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001B9C9710640> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Evaluating RMSE, MSE, MAE, FCP of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.2375  2.2343  2.2268  2.2285  2.2243  2.2303  0.0049  
MSE (testset)     5.0063  4.9921  4.9587  4.9661  4.9473  4.9741  0.0218  
MAE (testset)     1.9860  1.9834  1.9756  1.9770  1.9727  1.9789  0.0050  
FCP (testset)     0.6937  0.6924  0.6938  0.6932  0.6934  0.6933  0.0005  
Fit time          224.69  225.17  228.94  230.22  227.73  227.35  2.13    
Test time         58.46   46.38   56.34   56.19   55.78   54.63   4.23    
<surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001B9C9710640> finished
<surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001B9C9710520> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Evaluating RMSE, MSE, MAE, FCP of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.1492  2.1462  2.1485  2.1482  2.1491  2.1482  0.0011  
MSE (testset)     4.6192  4.6063  4.6160  4.6147  4.6185  4.6150  0.0046  
MAE (testset)     1.7055  1.7029  1.7050  1.7042  1.7047  1.7045  0.0009  
FCP (testset)     0.4977  0.4961  0.4978  0.4966  0.4981  0.4973  0.0008  
Fit time          15.29   18.28   17.95   18.00   17.57   17.42   1.09    
Test time         58.97   58.55   60.35   58.39   46.78   56.61   4.96    
<surprise.prediction_algorithms.random_pred.NormalPredictor object at 0x000001B9C9710520> finished
<surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001B9C9710730> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MSE, MAE, FCP of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2064  1.2036  1.2071  1.2056  1.2061  1.2058  0.0012  
MSE (testset)     1.4553  1.4486  1.4571  1.4534  1.4548  1.4539  0.0029  
MAE (testset)     0.9173  0.9157  0.9172  0.9170  0.9169  0.9168  0.0006  
FCP (testset)     0.7071  0.7070  0.7072  0.7070  0.7070  0.7071  0.0001  
Fit time          28.92   31.63   32.12   31.98   32.96   31.52   1.37    
Test time         50.63   50.88   49.84   40.42   37.89   45.93   5.60    
<surprise.prediction_algorithms.baseline_only.BaselineOnly object at 0x000001B9C9710730> finished
<surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001B9C9710880> started


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Evaluating RMSE, MSE, MAE, FCP of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2144  1.2127  1.2099  1.2097  1.2083  1.2110  0.0022  
MSE (testset)     1.4749  1.4707  1.4638  1.4634  1.4600  1.4666  0.0054  
MAE (testset)     0.9180  0.9170  0.9138  0.9145  0.9131  0.9153  0.0019  
FCP (testset)     0.7097  0.7104  0.7117  0.7122  0.7118  0.7112  0.0009  
Fit time          355.75  338.44  373.09  370.90  355.04  358.64  12.56   
Test time         41.79   68.12   56.65   60.35   41.96   53.78   10.40   
<surprise.prediction_algorithms.co_clustering.CoClustering object at 0x000001B9C9710880> finished


  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.132956,1.283591,0.844734,0.737092,129.236504,45.10105
SVDpp,1.19188,1.42058,0.886783,0.730786,4176.3256,857.291792
SlopeOne,1.19653,1.431685,0.90378,0.711841,184.353579,479.512936
BaselineOnly,1.205758,1.453853,0.916848,0.707061,31.520678,45.932104
CoClustering,1.211017,1.466567,0.915281,0.711172,358.642635,53.776164
NormalPredictor,2.148245,4.614956,1.704465,0.497267,17.417936,56.607932
NMF,2.230265,4.974104,1.978937,0.693325,227.350313,54.632843


### Merging results from Model Baseline

In [None]:
df_others_results = pd.read_parquet(saved_models_folder + "/" + "Others_Models_results.parq", engine='fastparquet')
df_others_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785


In [None]:
df_KNNBasic_results = pd.read_parquet(saved_models_folder + "/" + "KNNBasic_results.parq", engine='fastparquet')
df_KNNBasic_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444


In [None]:
df_KNNBaseline_results = pd.read_parquet(saved_models_folder + "/" + "KNNBaseline_results.parq", engine='fastparquet')
df_KNNBaseline_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859


In [None]:
df_knn_results = pd.read_parquet(saved_models_folder + "/" + "KNN_Models_results.parq", engine='fastparquet')
df_knn_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209
KNNWithZScore,1.666752,2.778087,1.267275,0.468789,36.883558,2.03843


In [None]:
vertical_concat = pd.concat([df_others_results, df_KNNBasic_results,df_KNNBaseline_results,df_knn_results], axis=0)

In [None]:
vertical_concat.head(20)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209


In [None]:
listatests =  ["test_rmse","test_mse","test_mae","test_fcp"]
for i in listatests:
    print ("the best result in",i,"is",vertical_concat.iloc[vertical_concat[i].argmin(), 0:1])

the best result in test_rmse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mae is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_fcp is test_rmse    1.701391
Name: SlopeOne, dtype: float64


## Evaluating Selected models

We have selected the 4 models with best results in the baseline. Now lets evaluate them doing GridSearchCV and training them to get the results.

- svd = SVD()
- svdp = SVDpp()
- baseonly = BaselineOnly()
- knnbase = KNNBaseline()

### SVD

Singular Value Decomposition (SVD) is a matrix factorization technique used in recommendation systems to reduce the dimensionality of a user-item matrix and identify latent factors that drive user-item interactions. In essence, SVD represents the original matrix as the product of three matrices: a user matrix, a singular value matrix, and an item matrix. The resulting factors can be used to make personalized recommendations by predicting a user's preference for an item based on their past behavior and the behavior of other similar users.

### SVDpp

SVD++ is an extension of Singular Value Decomposition (SVD) that takes into account implicit feedback in recommendation systems. In addition to the user-item matrix used in SVD, SVD++ also considers a matrix of implicit feedback such as user interactions with items, item attributes, and user preferences. This additional matrix helps capture the influence of user and item biases on the recommendation process, resulting in more accurate and personalized recommendations. SVD++ also includes a regularization term to avoid overfitting and improve generalization. Overall, SVD++ is a more advanced and sophisticated technique for recommendation systems than SVD.

### BaselineOnly

BaselineOnly is a simple but effective collaborative filtering algorithm used in recommendation systems. It predicts a user's rating for an item by taking into account the overall average rating of all items, the average rating of the user, and the average rating of the item. These three values are used as baseline estimates, and the difference between the actual rating and the baseline estimate is used as the prediction error. BaselineOnly then learns user and item biases that can improve the accuracy of the baseline estimates. The algorithm is simple and computationally efficient, making it a popular choice for recommendation systems with large datasets. However, it may not capture more complex relationships between users and items compared to more advanced techniques such as matrix factorization.BaselineOnly is a simple but effective collaborative filtering algorithm used in recommendation systems. It predicts a user's rating for an item by taking into account the overall average rating of all items, the average rating of the user, and the average rating of the item. These three values are used as baseline estimates, and the difference between the actual rating and the baseline estimate is used as the prediction error. BaselineOnly then learns user and item biases that can improve the accuracy of the baseline estimates. The algorithm is simple and computationally efficient, making it a popular choice for recommendation systems with large datasets. However, it may not capture more complex relationships between users and items compared to more advanced techniques such as matrix factorization.

### KNNBaseline

KNNBaseline is a collaborative filtering algorithm used in recommendation systems that predicts a user's rating for an item based on the ratings of similar users or similar items. The algorithm is based on k-nearest neighbors, where k is the number of most similar users or items used to make the prediction. KNNBaseline uses a baseline estimate, which is similar to BaselineOnly, to normalize the ratings and improve the accuracy of the predictions. Additionally, KNNBaseline uses a similarity metric, such as cosine similarity or Pearson correlation, to measure the similarity between users or items. The algorithm can be used for both user-based and item-based recommendation systems and has been shown to produce accurate and effective recommendations for a wide range of datasets.

## Evaluation and training the final selected SVD model

In [2]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import sys # to manipulate different parts of the Python runtime environment

# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *



In roder to evaluate, select and train the desired model, we will use 3 different funcions from clening.py, testing.py and testing.py in utils folder:
- supervised_prepare_training
- find_best_svd
- train_test_svd

In [None]:
print(supervised_prepare_training.__doc__)

This function is to:
- Load 'anime.csv' file into a pandas DataFrame object called 'anime'
- Load 'rating.csv.zip' file into a pandas DataFrame object called 'rating'
- Create a new DataFrame 'anime_mapping' that is a copy of the 'anime' DataFrame and remove the 'episodes', 'members', and 'rating' columns
- Filter out all ratings less than or equal to 0 and reset the index of the DataFrame
- Drop the 'index' column and update the DataFrame in-place
- Get the shape of the DataFrame 'ratingdf'
- Set the size to 100,000 and sample from the 'ratingdf' DataFrame based on the proportion of ratings for each score
- This will make sure that the sampled data has roughly the same proportion of ratings for each score as the original data
- Create a new 'Reader' object with the rating scale set to a range between 1 and 10
- Load the sampled data into a 'Dataset' object using the 'load_from_df' method and the 'reader' object
- Saving the table to pickle

In [None]:
supervised_prepare_training() #utils.cleaning

<surprise.dataset.DatasetAutoFolds at 0x2cafb97a640>

In [9]:
print(find_best_svd.__doc__)

None


Steps of find_best_svd:
- Define parameter grid for grid search
- Create GridSearchCV object with SVD algorithm
- Fit GridSearchCV object to data
- Print best RMSE and MAE scores, as well as corresponding parameters
- Save model with best parameters
- Save best parameters

In [3]:
find_best_svd() #utils.testing

Best RMSE score: 1.3805614676765472
Best MAE score: 1.0649921646188643
Best parameters for RMSE: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
Best parameters for MAE: {'n_factors': 50, 'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.1}


<surprise.model_selection.search.GridSearchCV at 0x2cafb917ac0>

In [None]:
print(train_test_svd.__doc__)

Steps of train_test_svd:
- Loads the best hyperparameters for the SVD algorithm that were obtained from grid search
- Loads the dataset from a pickle file using joblib
- Splits the data into training and testing sets with a 80:20 ratio
- Creates an instance of the SVD algorithm with the best hyperparameters obtained from grid search
- Trains the SVD algorithm on the training set using the fit() method
- Generates predictions for the test set using the trained model
- Calculates the RMSE and MAE for the predictions
- Saves the trained model as a pickle file using joblib

In [3]:
train_test_svd() #utils.training

RMSE: 1.3757
MAE:  1.0561
RMSE: 1.375737289908081
MAE: 1.056142242807008


## Getting recommendations

To get the recommendations we will use the next functions from recommend.py in utils folder:
- sort_it
- create_dict_su
- filtering_and
- filtering_or

In [None]:
print(sort_it.__doc__)

Steps of sort_it:
- Load the pre-trained SVD model   
- Load the anime dataframe  
- Apply the SVD model to estimate the score for each anime
- Sort the dataframe by the estimated score in descending order and drop the anime_id column
- Create a blank index for the dataframe
- Set the blank index to the dataframe
- Return the sorted dataframe

In [None]:
print(create_dict_su.__doc__)

Steps of create_dict_su:
- get the final dataframe and the parameters for filtering and number of recommendations to show
- check which method was used to filter the recommendations, 'or' or 'and'
	- filter the dataframe using the OR logic and the given genres and types
	- filter the dataframe using the AND logic and the given genres and types
	- raise an error if an invalid filter type was given     
- select the top n recommendations from the filtered dataframe
- if the filtered dataframe is empty, print a message
	- convert the filtered dataframe to a dictionary
	- return the dictionary of recommendations

In [None]:
print(filtering_and.__doc__)


    This function takes a DataFrame df, a list of genres, and a list of types as input arguments. 
    The function first creates a boolean mask genre_mask by applying a lambda function to 
    the 'genre' column of the DataFrame. The lambda function checks if the value is a 
    string using isinstance(x, str) and if all genres in the genres list are present 
    in the string, which is split by comma and space using x.split(', '). 
    The all() function returns True if all genres in the genres list are present 
    in the string. The resulting genre_mask will be True for rows where the genre 
    column contains all of the genres in the genres list.

    Then the function creates another boolean mask type_mask by using the isin() 
    method to check if each value in the 'type' column of the DataFrame is in the types list.

    Finally, the function applies both masks to the DataFrame df using the & operator 
    to create a new DataFrame filtered_df that includes only rows where b

Steps of filtering_and:
- This function takes a DataFrame `df`, a list of `genres`, and a list of `types` as input arguments.
- Create a boolean mask that filters rows where the genre column contains all of the genres in the `genres` list.
- Create a boolean mask that filters rows where the type column is in the `types` list.
- Apply both masks to the DataFrame `df` and create a new DataFrame `filtered_df` that includes only rows where both masks are True.
- Return the filtered DataFrame.

In [None]:
print(filtering_or.__doc__)


    The code defines a function "filtering_or" that filters a pandas dataframe based on user-defined 
    genres and types using an "OR" method. The function allows the user to select one or all possible 
    genres and types and returns a filtered dataframe with the selected genres and types. 
    The function also splits the genre and type columns and explodes them to account for multiple entries.
    


Steps of filtering_or:
- Make a copy of the input DataFrame
- Split the genre column into a list of genres
- Explode the genre column to create a new row for each genre in the list
- If genres are specified and 'ALL' is not one of them, filter the DataFrame to keep only rows where the genre is in the specified list  
- If types are specified and 'ALL' is not one of them, filter the DataFrame to keep only rows where the type is in the specified list
- If both genres and types are specified
- If 'ALL' is in the genres list, set genres to be all the unique genres in the filtered DataFrame
- If 'ALL' is in the types list, set types to be all the unique types in the filtered DataFrame
- Filter the DataFrame to keep only rows where the genre is in the genres list AND the type is in the types list
- Return the filtered DataFrame

In [4]:
# We can get the recommendation as a dictionary
# We input the user ID for we want the recommendations
# Then the genre we want (or write "All" if we shoose "or" filter)
# Then the type we want (or write "All" if we shoose "or" filter)
# We must select a type or filtering, "or"/"and" 
# Then the number of suggestions we have(we might get less if there not so many o none if there is no matches)

create_dict_su(sort_it(25000),["Shounen"],["TV"],"or",10)

[{'name': 'Fullmetal Alchemist: Brotherhood',
  'english_title': 'Fullmetal Alchemist: Brotherhood',
  'japanses_title': '鋼の錬金術師 FULLMETAL ALCHEMIST',
  'genre': 'Shounen',
  'type': 'TV',
  'source': 'Manga',
  'duration': '24 min per ep',
  'episodes': '64',
  'rating': 'R - 17+ (violence & profanity)',
  'score': 9.11,
  'rank': 1.0,
  'members': 793665,
  'synopsis': 'After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.\r\n\r\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering 