# Supervised user based collaborative filtering

## Import Libraries

In [2]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# visualization
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

# Surprise libraries
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Utils libraries
from utils import cleaning
from utils import recommend
from utils import testing
from utils import training

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

## Loading and cleaning data

In [3]:
# loading the data
anime = anime = pd.read_csv(raw_data + "/" + "anime.csv")
rating = rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

In [4]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
# Cleaning the data
#def supervised_rating_cleaning(rating):
#    ratingdf = rating[rating.rating>0]
#    ratingdf = ratingdf.reset_index()
#    ratingdf.drop('index', axis=1,inplace=True)
#    return ratingdf
ratingdf = cleaning.supervised_rating_cleaning(rating)

## Preparing the data to try different models

In [6]:
def supervised_prepare_training(ratingdf):
    # using groupby and some fancy logic
    reader = Reader(rating_scale=(1,10))
    data = Dataset.load_from_df(ratingdf[['user_id', 'anime_id', 'rating']], reader)
    
    size = 100000
    rating_sample = ratingdf.groupby("rating", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(ratingdf))))).sample(frac=1).reset_index(drop=True)
    
    # Saving the table to pickle
    joblib.dump(data,content_based_supervised_data + "/" + "rating_sample.pkl")

    reader = Reader(rating_scale=(1,10))
    data_sample = Dataset.load_from_df(rating_sample[['user_id', 'anime_id', 'rating']], reader)

    # Saving the table to pickle
    joblib.dump(data,content_based_supervised_data + "/" + "data_sample.pkl")

    return data_sample
data_sample = supervised_prepare_training(ratingdf)

In [None]:
data_sample = training.supervised_prepare_training(ratingdf)

## Metrics all together

In [None]:
def baseline_all():
    
    benchmark = []
    # Iterate over all algorithms
    svd = SVD()
    svdp = SVDpp()
    slpo = SlopeOne()
    nm  = NMF()
    nmlp = NormalPredictor()
    #knnbase = KNNBaseline()
    #knnb = KNNBasic()
    #knnmean = KNNWithMeans()
    #knnzs = KNNWithZScore()
    baseonly = BaselineOnly()
    coclus = CoClustering()

    for algorithm in [svd,svdp,slpo,nm,nmlp,baseonly,coclus]:
        benchmark_inndividual = []
        print(algorithm,"started")
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE','MSE','MAE','FCP'], cv=3, verbose=False)
        print(algorithm,"finished")
        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        name = str(algorithm).split(' ')[0].split('.')[-1]
        tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
        benchmark_inndividual.append(tmp)
        benchmark.append(tmp)
        
        dfscores_individual = pd.DataFrame(benchmark_inndividual).set_index('Algorithm').sort_values('test_rmse')
        write(saved_models_folder + "/" + name + "_results.parq", dfscores_individual)
    dfscores = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    write(saved_models_folder + "/" + "Others_Models_results.parq", dfscores)

    return dfscores


## Merge df resutls

In [None]:
df_others_results = pd.read_parquet(saved_models_folder + "/" + "Others_Models_results.parq", engine='fastparquet')
df_others_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785


In [None]:
df_KNNBasic_results = pd.read_parquet(saved_models_folder + "/" + "KNNBasic_results.parq", engine='fastparquet')
df_KNNBasic_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444


In [None]:
df_KNNBaseline_results = pd.read_parquet(saved_models_folder + "/" + "KNNBaseline_results.parq", engine='fastparquet')
df_KNNBaseline_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859


In [None]:
df_knn_results = pd.read_parquet(saved_models_folder + "/" + "KNN_Models_results.parq", engine='fastparquet')
df_knn_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209
KNNWithZScore,1.666752,2.778087,1.267275,0.468789,36.883558,2.03843


In [None]:
vertical_concat = pd.concat([df_others_results, df_KNNBasic_results,df_KNNBaseline_results,df_knn_results], axis=0)

In [None]:
vertical_concat.head(20)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209


In [None]:
listatests =  ["test_rmse","test_mse","test_mae","test_fcp"]
for i in listatests:
    print ("the best result in",i,"is",vertical_concat.iloc[vertical_concat[i].argmin(), 0:1])

the best result in test_rmse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mae is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_fcp is test_rmse    1.701391
Name: SlopeOne, dtype: float64


## Evaluation selected model SVD

In [16]:
def evaluate_svd_model(data_sample):
    param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
    gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

    gs.fit(data_sample)
    params = gs.best_params['rmse']

    # best RMSE score
    print(gs.best_score["rmse"])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params["rmse"])

    # We can now use the algorithm that yields the best rmse:
    algo = gs.best_estimator["rmse"]
    algo.fit(data_sample.build_full_trainset())

    # # Serialización del modelo
    joblib.dump(algo,saved_models_folder + "/" + "SVD_samople_fit.pkl")

    results_df = pd.DataFrame.from_dict(gs.cv_results)
    
    return results_df


In [None]:
testing.evaluate_svd_model(data_sample)

In [16]:
anime = pd.read_csv(data_folder + "/" + "anime.csv")
anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)
## Recomendacion by user Id and how many results
def reco_by_user(id,n):
    que_user = id
    chosen_user = anime_mapping.copy()
    chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: algo.predict(que_user, x).est)

    chosen_user = chosen_user.drop('anime_id', axis = 1)

    chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


    to_return = chosen_user.head(n)

    return to_return

reco_by_user(208,20)

Unnamed: 0,name,genre,type,Estimate_Score
9,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.725608
7,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,9.676801
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.605772
17,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",TV,9.569785
3,Steins;Gate,"Sci-Fi, Thriller",TV,9.564523
38,Monster,"Drama, Horror, Mystery, Police, Psychological, Seinen, Thriller",TV,9.492835
8,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",Movie,9.490081
6,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,9.444717
12,Gintama,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.419202
18,Ookami Kodomo no Ame to Yuki,"Fantasy, Slice of Life",Movie,9.367766


In [21]:
def anime():
    anime = pd.read_csv(raw_data + "/" + "anime.csv")
    return anime

In [2]:
from utils import recommend
recommend.reco_by_user(675,50,"Supernatural","All") #reco_by_user(1000,50,"Supernatural","All")

AttributeError: 'NoneType' object has no attribute 'copy'

In [7]:
import pandas as pd
def map_it():
    anime = pd.read_csv(raw_data + "/" + "anime.csv")
    anime_mapping = anime.copy()
    anime_mapping = anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)
    ## Recomendacion by user Id, how many results and gender
    return anime_mapping
map_it()

In [6]:
## Recomendacion by user Id, how many results and gender
'''
Create a df of the anime matches with the filters selected
'''
def df_recommendation(id,n,gen,typ):
    final_df = reco_by_user(id,n,gen,typ)
    to_return = final_df
    blankIndex=[''] * len(final_df)
    final_df.index=blankIndex
    if final_df.empty:
        sentence = print('WOW!!!! Sorry, there is no matches for the anime and options selected! \n Try again, you might have mroe luck')
        return sentence
    else:
        return to_return

def dict_recommendation(id,n,gen,typ):
    final_df = reco_by_user(id,n,gen,typ)
    to_return = final_df
    blankIndex=[''] * len(final_df)
    final_df.index=blankIndex
    if final_df.empty:
        sentence = print('WOW!!!! Sorry, there is no matches for the anime and options selected! \n Try again, you might have mroe luck')
        return sentence
    else:
        final_dict = final_df.to_dict('records')
        return final_dict

def sort_it(que_user,df,n):
    algo = joblib.load(saved_models_folder + "/" + "SVD_samople_fit.pkl")

    df['Estimate_Score'] = df['anime_id'].apply(lambda x: algo.predict(que_user, x).est)
    df = df.sort_values('Estimate_Score', ascending=False).drop(['anime_id'], axis = 1)
    blankIndex=[''] * len(df)
    df.index=blankIndex 
    return df.head(n)

def reco_by_user(id,n,gen,typ):
    anime = pd.read_csv(raw_data + "/" + "anime.csv")
    anime_map = anime.copy()
    anime_map.drop(['episodes','members','rating'],axis=1, inplace=True)
    chosen_user = anime_map.copy()

    if (gen != "All") and (typ != "All"):
        filtered = chosen_user[chosen_user['genre'].str.contains(gen, regex=False, case=False, na=False)]
        filtered = filtered[filtered['type'].str.contains(typ, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (gen == "All") and (typ != "All"):
        filtered = chosen_user[chosen_user['type'].str.contains(typ, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (typ == "All") and (gen != "All"):
        filtered = chosen_user[chosen_user['genre'].str.contains(gen, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (typ == "All") and (gen == "All"):
        return sort_it(id,chosen_user,n)

df_recommendation(675,50,"Supernatural","All")

Unnamed: 0,name,genre,type,Estimate_Score
,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.30818
,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supernatural",TV,9.015971
,Hellsing Ultimate,"Action, Horror, Military, Seinen, Supernatural, Vampire",OVA,8.962239
,Monogatari Series: Second Season,"Comedy, Mystery, Romance, Supernatural, Vampire",TV,8.796475
,Natsume Yuujinchou San,"Drama, Fantasy, Shoujo, Slice of Life, Supernatural",TV,8.774697
,Suzumiya Haruhi no Shoushitsu,"Comedy, Mystery, Romance, School, Sci-Fi, Supernatural",Movie,8.757496
,Fate/Zero,"Action, Fantasy, Supernatural",TV,8.749244
,Shinsekai yori,"Drama, Horror, Mystery, Sci-Fi, Supernatural",TV,8.744212
,Natsume Yuujinchou Shi,"Drama, Fantasy, Shoujo, Slice of Life, Supernatural",TV,8.706689
,Mushishi,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",TV,8.705345


## Evaluation selected SVD model (all dataset)

In [18]:
anime = pd.read_csv(data_folder + "/" + "anime.csv")
rating = pd.read_csv(data_folder + "/" + "rating.csv.zip")

anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratingdf[['user_id', 'anime_id', 'rating']], reader)


In [19]:
from fastparquet import write 
from surprise import SVD,NormalPredictor
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)


gs.fit(data)
params = gs.best_params['rmse']

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

import pandas as pd  # noqa

results_df = pd.DataFrame.from_dict(gs.cv_results)

write(saved_models_folder + "/" + "SVD_full_results.parq", results_df)

#1.1341632727982356
#{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}

# # Serialización del modelo
import pickle
joblib.dump(algo,saved_models_folder + "/" + "SVD_full_fit.pkl")



## Recommendation building phase SVD model using best_params

In [None]:
algo = joblib.load(saved_models_folder + "/" + "SVD_full_fit.pkl")

In [None]:
import joblib
anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)
#anime_mapping.set_index('anime_id',inplace=True) 


from surprise import SVD
algo = 
trainset = data.build_full_trainset()
algo.fit(trainset)

NameError: name 'algo' is not defined

## Getting recommendations

In [None]:
que_user = 208
chosen_user = anime_mapping.copy()
chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: algo.predict(que_user, x).est)

chosen_user = chosen_user.drop('anime_id', axis = 1)

chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


chosen_user.head(10)

Unnamed: 0,name,genre,type,Estimate_Score
11,Koe no Katachi,"Drama, School, Shounen",Movie,9.364396
480,Mahou Shoujo Lyrical Nanoha: The Movie 1st,"Action, Comedy, Drama, Magic, Super Power",Movie,9.350239
10,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supernatural",TV,9.344733
42,Ansatsu Kyoushitsu (TV) 2nd Season,"Action, Comedy, School, Shounen",TV,9.246364
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.150686
373,K-On!!,"Comedy, Music, School, Slice of Life",TV,9.150479
16,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,9.121163
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.118466
5,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,"Comedy, Drama, School, Shounen, Sports",TV,9.112615
194,Mahou Shoujo Lyrical Nanoha: The Movie 2nd A&#039;s,"Action, Comedy, Drama, Magic, Super Power",Movie,9.089225


In [3]:
que_user = 58145
chosen_user = anime_mapping.copy()
chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: algo.predict(que_user, x).est)

chosen_user = chosen_user.drop('anime_id', axis = 1)

chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


chosen_user.head(10)

NameError: name 'anime_mapping' is not defined