# Supervised user based collaborative filtering

## Import Libraries

In [2]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# visualization
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

#Preparing folder variables

main_folder = os.path.abspath(os.path.join(os.pardir))
data_folder = (main_folder + "/" +"data")
saved_models_folder = (data_folder + "/" + "saved_models")
raw_data = (data_folder + "/" + "_raw")
processed_data = (data_folder + "/" + "processed")



## Loading and cleaning data

In [17]:
# loading the data
anime = anime()
rating = rating()

In [18]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [19]:
rating.shape

(7813737, 3)

In [20]:
rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [21]:
# Cleaning the data
ratingdf = supervised_rating_cleaning(rating)

## Preparing the data to try different models

In [22]:
data_sample = supervised_prepare_training(ratingdf)

## Metrics all together

In [None]:
def baseline_all(data):
    
    benchmark = []
    # Iterate over all algorithms
    svd = SVD()
    svdp = SVDpp()
    slpo = SlopeOne()
    nm  = NMF()
    nmlp = NormalPredictor()
    #knnbase = KNNBaseline()
    #knnb = KNNBasic()
    #knnmean = KNNWithMeans()
    #knnzs = KNNWithZScore()
    baseonly = BaselineOnly()
    coclus = CoClustering()

    for algorithm in [svd,svdp,slpo,nm,nmlp,baseonly,coclus]:
        benchmark_inndividual = []
        print(algorithm,"started")
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE','MSE','MAE','FCP'], cv=3, verbose=False)
        print(algorithm,"finished")
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark_inndividual.append(tmp)
        benchmark.append(tmp)
        
        dfscores_individual = pd.DataFrame(benchmark_inndividual).set_index('Algorithm').sort_values('test_rmse')
        write(saved_models_folder + "/" + name + "_results.parq", dfscores_individual)
    dfscores = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    write(saved_models_folder + "/" + "Others_Models_results.parq", dfscores)

    return dfscores

In [None]:
baseline_all(data_sample)

## Merge df resutls

In [None]:
df_others_results = pd.read_parquet(saved_models_folder + "/" + "Others_Models_results.parq", engine='fastparquet')
df_others_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785


In [None]:
df_KNNBasic_results = pd.read_parquet(saved_models_folder + "/" + "KNNBasic_results.parq", engine='fastparquet')
df_KNNBasic_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444


In [None]:
df_KNNBaseline_results = pd.read_parquet(saved_models_folder + "/" + "KNNBaseline_results.parq", engine='fastparquet')
df_KNNBaseline_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859


In [None]:
df_knn_results = pd.read_parquet(saved_models_folder + "/" + "KNN_Models_results.parq", engine='fastparquet')
df_knn_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209
KNNWithZScore,1.666752,2.778087,1.267275,0.468789,36.883558,2.03843


In [None]:
vertical_concat = pd.concat([df_others_results, df_KNNBasic_results,df_KNNBaseline_results,df_knn_results], axis=0)

In [None]:
vertical_concat.head(20)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209


In [None]:
listatests =  ["test_rmse","test_mse","test_mae","test_fcp"]
for i in listatests:
    print ("the best result in",i,"is",vertical_concat.iloc[vertical_concat[i].argmin(), 0:1])

the best result in test_rmse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mae is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_fcp is test_rmse    1.701391
Name: SlopeOne, dtype: float64


## Evaluation selected SVD model

In [7]:
anime = anime = pd.read_csv(raw_data + "/" + "anime.csv")
rating = rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

size = 100000
ratingdf = ratingdf.groupby("rating", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(ratingdf))))).sample(frac=1).reset_index(drop=True)

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratingdf[['user_id', 'anime_id', 'rating']], reader)


In [13]:
from fastparquet import write 
from surprise import SVD,NormalPredictor
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)


gs.fit(data)
best_params  = gs.best_params['rmse']

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

import pandas as pd  # noqa

results_df = pd.DataFrame.from_dict(gs.cv_results)

write(saved_models_folder + "/" + "results_SVD_sample_fit.parq", results_df)

#1.1341632727982356
#{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}

# # Serialización del modelo
import pickle
joblib.dump(algo,saved_models_folder + "/" + "SVD_sample_fit.pkl")



1.3850769451177865
{'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


['d:\\Github\\Anime_recommendation_system\\src/data/saved_models/SVD_samople_fit.pkl']

## Recommendation building phase SVD model using best_params

In [14]:
# Create a new SVD model using the best parameters
best_svd = SVD(n_factors=best_params['n_factors'], n_epochs=best_params['n_epochs'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])

best_svd.fit(data)

# Fit the new SVD model to the dataset
trainset = data.build_full_trainset()
best_svd.fit(data)

# # Serialización del modelo
import pickle
joblib.dump(best_svd,saved_models_folder + "/" + "SVD_final_fit.pkl")


AttributeError: 'DatasetAutoFolds' object has no attribute 'n_users'

## Getting recommendations

In [None]:
# We can get the recommendation as a dictionary
# We input the user ID for we want the recommendations
# Then the number of suggestions we have(we might get less if there not so many o none if there is no matches)
# Then the genre we want (or write "All")
# Then the type we want (or write "All")

create_dict_su(sort_it(25000),["Shounen"],["TV"],"or",10)

In [17]:
import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy

anime = anime = pd.read_csv(raw_data + "/" + "anime.csv")
rating = rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

size = 100000
ratingdf_sample = ratingdf.groupby("rating", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(ratingdf))))).sample(frac=1).reset_index(drop=True)

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratingdf_sample[['user_id', 'anime_id', 'rating']], reader)

# Define parameter grid for grid search
param_grid = {'n_factors': [50, 100, 150], 
              'n_epochs': [20, 30, 40], 
              'lr_all': [0.002, 0.005, 0.01],
              'reg_all': [0.02, 0.05, 0.1]}

In [18]:
# Create GridSearchCV object with SVD algorithm
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit GridSearchCV object to data
gs.fit(data)

# Print best RMSE and MAE scores, as well as corresponding parameters
print("Best RMSE score:", gs.best_score['rmse'])
print("Best MAE score:", gs.best_score['mae'])
print("Best parameters for RMSE:", gs.best_params['rmse'])
print("Best parameters for MAE:", gs.best_params['mae'])

Best RMSE score: 1.3821686121611811
Best MAE score: 1.0672782540321568
Best parameters for RMSE: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}
Best parameters for MAE: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [19]:
# Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD algorithm on training set with best parameters
best_params = SVD(n_factors=gs.best_params['rmse']['n_factors'], 
           n_epochs=gs.best_params['rmse']['n_epochs'], 
           lr_all=gs.best_params['rmse']['lr_all'], 
           reg_all=gs.best_params['rmse']['reg_all'])
best_params.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2e77c174340>

In [20]:
# # Serialización del modelo
import pickle
joblib.dump(best_params,saved_models_folder + "/" + "SVD_sample_fit.pkl")

['d:\\Github\\Anime_recommendation_system\\src/data/saved_models/SVD_sample_fit.pkl']

In [21]:
# Make predictions on testing set
predictions = algo.test(testset)

# Calculate RMSE and MAE
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

print("RMSE:", rmse)
print("MAE:", mae)

RMSE: 1.3433
MAE:  1.0321
RMSE: 1.3433194514421487
MAE: 1.0320873653580798


In [3]:
create_dict_su(sort_it(25000),["Shounen"],["TV"],"or",10)

[{'name': 'Gintama',
  'english_title': 'Gintama',
  'japanses_title': '銀魂',
  'genre': 'Shounen',
  'type': 'TV',
  'source': 'Manga',
  'duration': '24 min per ep',
  'episodes': '201',
  'rating': 'PG-13 - Teens 13 or older',
  'score': 8.94,
  'rank': 17.0,
  'members': 336376,
  'synopsis': 'Edo is a city that was home to the vigor and ambition of samurai across the country. However, following feudal Japan\'s surrender to powerful aliens known as the "Amanto," those aspirations now seem unachievable. With the once-influential shogunate rebuilt as a puppet government, a new law is passed that promptly prohibits all swords in public. \r\n\r\nEnter Gintoki Sakata, an eccentric silver-haired man who always carries around a wooden sword and maintains his stature as a samurai despite the ban. As the founder of Yorozuya, a small business for odd jobs, Gintoki often embarks on endeavors to help other people—though usually in rather strange and unforeseen ways. \r\n\r\nAssisted by Shinpach