# Supervised user based collaborative filtering

## Import Libraries

In [1]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# visualization
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

#Preparing folder variables

main_folder = os.path.abspath(os.path.join(os.pardir))
data_folder = (main_folder + "/" +"data")
saved_models_folder = (data_folder + "/" + "saved_models")
raw_data = (data_folder + "/" + "_raw")
processed_data = (data_folder + "/" + "processed")



## Loading and cleaning data

In [17]:
# loading the data
anime = anime()
rating = rating()

In [18]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [19]:
rating.shape

(7813737, 3)

In [20]:
rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [21]:
# Cleaning the data
ratingdf = supervised_rating_cleaning(rating)

## Preparing the data to try different models

In [22]:
data_sample = supervised_prepare_training(ratingdf)

## Metrics all together

In [None]:
'''
Function that runs several collaborative filtering algorithms on an input dataset using cross-validation 
and computes several evaluation metrics. The function loops through a list of algorithms, 
runs cross-validation with each algorithm, computes the mean evaluation results across all folds, 
and appends the results to an overall list of results. It then saves the evaluation results 
of each algorithm in a Parquet file and saves the overall evaluation results of all algorithms 
in another Parquet file. The function returns the overall evaluation results.
'''
def baseline_all(data):
    
    benchmark = []  # Create an empty list to store the results of each algorithm's evaluation
    # Define the algorithms to be used for collaborative filtering
    svd = SVD()
    svdp = SVDpp()
    slpo = SlopeOne()
    nm  = NMF()
    nmlp = NormalPredictor()
    baseonly = BaselineOnly()
    coclus = CoClustering()

    # Loop through the defined algorithms and run cross-validation
    for algorithm in [svd,svdp,slpo,nm,nmlp,baseonly,coclus]:

        benchmark_inndividual = []  # Create an empty list to store the evaluation results of each fold

        print(algorithm,"started")  # Print the name of the algorithm being evaluated

        # Perform cross validation with the current algorithm and the input dataset
        results = cross_validate(algorithm, data, measures=['RMSE','MSE','MAE','FCP'], cv=3, verbose=False)

        print(algorithm,"finished")  # Print the name of the algorithm after it finishes evaluation
        # Compute the mean of the evaluation results across all folds

        tmp = pd.DataFrame.from_dict(results).mean(axis=0)

        # Extract the name of the algorithm from its string representation and add it to the results
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))

        benchmark_inndividual.append(tmp)  # Add the current algorithm's evaluation results to the list

        benchmark.append(tmp)  # Add the current algorithm's evaluation results to the overall list of results
        
        # Save the evaluation results of the current algorithm in a Parquet file
        dfscores_individual = pd.DataFrame(benchmark_inndividual).set_index('Algorithm').sort_values('test_rmse')
        write(saved_models_folder + "/" + name + "_results.parq", dfscores_individual)
    
    # Save the overall evaluation results of all algorithms in a Parquet file
    dfscores = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    write(saved_models_folder + "/" + "Others_Models_results.parq", dfscores)

    return dfscores  # Return the overall evaluation results of all algorithms

In [None]:
baseline_all(data_sample)

## Merge df resutls

In [None]:
df_others_results = pd.read_parquet(saved_models_folder + "/" + "Others_Models_results.parq", engine='fastparquet')
df_others_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785


In [None]:
df_KNNBasic_results = pd.read_parquet(saved_models_folder + "/" + "KNNBasic_results.parq", engine='fastparquet')
df_KNNBasic_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444


In [None]:
df_KNNBaseline_results = pd.read_parquet(saved_models_folder + "/" + "KNNBaseline_results.parq", engine='fastparquet')
df_KNNBaseline_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859


In [None]:
df_knn_results = pd.read_parquet(saved_models_folder + "/" + "KNN_Models_results.parq", engine='fastparquet')
df_knn_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209
KNNWithZScore,1.666752,2.778087,1.267275,0.468789,36.883558,2.03843


In [None]:
vertical_concat = pd.concat([df_others_results, df_KNNBasic_results,df_KNNBaseline_results,df_knn_results], axis=0)

In [None]:
vertical_concat.head(20)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209


In [None]:
listatests =  ["test_rmse","test_mse","test_mae","test_fcp"]
for i in listatests:
    print ("the best result in",i,"is",vertical_concat.iloc[vertical_concat[i].argmin(), 0:1])

the best result in test_rmse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mae is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_fcp is test_rmse    1.701391
Name: SlopeOne, dtype: float64


## Evaluation and training selected SVD model

In [2]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import sys # to manipulate different parts of the Python runtime environment

# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *



In [2]:
'''
The code reads two CSV files (anime.csv and rating.csv.zip) and loads them into dataframes. 
Then it creates a subset of the rating dataframe containing only rows where the rating is 
greater than 0 and removes the index column. Next, it samples a subset of the data with 
a specified size, grouped by the rating column.
'''
supervised_prepare_training() #utils.cleaning

<surprise.dataset.DatasetAutoFolds at 0x2cafb97a640>

In [3]:
'''
Defining grid parameters for hyperparameter tuning in a collaborative filtering algorithm.
Then create a GridSearchCV object with the SVD algorithm and a parameter grid consisting 
of a range of hyperparameters. The GridSearchCV function then performs a grid search on 
yhe parameter grid to find the best combination of hyperparameters that minimizes the 
RMSE and MAE scores. The best RMSE and MAE scores and the corresponding parameters 
are printed out.
'''
find_best_svd() #utils.testing

Best RMSE score: 1.3805614676765472
Best MAE score: 1.0649921646188643
Best parameters for RMSE: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
Best parameters for MAE: {'n_factors': 50, 'n_epochs': 40, 'lr_all': 0.01, 'reg_all': 0.1}


<surprise.model_selection.search.GridSearchCV at 0x2cafb917ac0>

In [3]:
'''
In this code, the data is split into training and testing sets using 
the train_test_split() function from surprise library. Then, an instance 
of the SVD algorithm is created with the best parameters obtained 
from the grid search, and it is trained on the training set using the fit() method.
'''
train_test_svd() #utils.training

RMSE: 1.3757
MAE:  1.0561
RMSE: 1.375737289908081
MAE: 1.056142242807008


## Getting recommendations

In [4]:
# We can get the recommendation as a dictionary
# We input the user ID for we want the recommendations
# Then the number of suggestions we have(we might get less if there not so many o none if there is no matches)
# Then the genre we want (or write "All")
# Then the type we want (or write "All")

create_dict_su(sort_it(25000),["Shounen"],["TV"],"or",10)

[{'name': 'Fullmetal Alchemist: Brotherhood',
  'english_title': 'Fullmetal Alchemist: Brotherhood',
  'japanses_title': '鋼の錬金術師 FULLMETAL ALCHEMIST',
  'genre': 'Shounen',
  'type': 'TV',
  'source': 'Manga',
  'duration': '24 min per ep',
  'episodes': '64',
  'rating': 'R - 17+ (violence & profanity)',
  'score': 9.11,
  'rank': 1.0,
  'members': 793665,
  'synopsis': 'After a horrific alchemy experiment goes wrong in the Elric household, brothers Edward and Alphonse are left in a catastrophic new reality. Ignoring the alchemical principle banning human transmutation, the boys attempted to bring their recently deceased mother back to life. Instead, they suffered brutal personal loss: Alphonse\'s body disintegrated while Edward lost a leg and then sacrificed an arm to keep Alphonse\'s soul in the physical realm by binding it to a hulking suit of armor.\r\n\r\nThe brothers are rescued by their neighbor Pinako Rockbell and her granddaughter Winry. Known as a bio-mechanical engineering 