# All libraries

In [1]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# visualization
import seaborn as sns # data visualization library based on matplotlib.
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB, MultinomialNB

## scikit modeling libraries
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV # scikit Cross validation iterators libraries

## Load metrics for predictive modeling
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

## scikit Pairwise metrics libraries
#implements utilities to evaluate pairwise distances or affinity of sets of samples.
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel 

# pip install git+https://github.com/NicolasHug/surprise.git

# Surprise libraries
from surprise import Dataset, Reader, accuracy

from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Utils libraries
from utils import utils
from utils import cleaning
from utils import recommend
from utils import testing
from utils import training

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")




# Collaborative filtering recommendation using SVD in Surprise framework. The recommendation is based on user preference data

## libraries and data

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import os
import re
import sys

import warnings


# pip install git+https://github.com/NicolasHug/surprise.git

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")

from surprise.model_selection import cross_validate
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

from surprise import Dataset,Reader,SVD,SVDpp,SlopeOne,NMF,NormalPredictor,KNNBaseline,KNNBasic,KNNWithMeans,KNNWithZScore,BaselineOnly,CoClustering
from playsound import playsound
#Preparing folder variables

from playsound import playsound
playsound("src\sounds\Finished.mp3")

In [40]:
anime = pd.read_csv(data_folder + "/" + "anime.csv")
rating = pd.read_csv(data_folder + "/" + "rating.csv.zip")

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

# performing Stratified Sampling With Pandas and Numpy to have a representative sample of the main df

import pandas as pd
import numpy as np

# lets say we wanted to shrink our data frame down to 125 rows, 
# with the same
# target class distribution

size = 100000


# using groupby and some fancy logic

rating_sample = ratingdf.groupby("rating", group_keys=False)\
                        .apply(lambda x: \
                         x.sample(int(np.rint(size*len(x)/len(ratingdf)))))\
                        .sample(frac=1).reset_index(drop=True)



# we can see that our sample is 199999 rows, 
# while our distribution follows as close as possible

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(rating_sample[['user_id', 'anime_id', 'rating']], reader)


## Metrics all together

In [None]:
from fastparquet import write 

benchmark = []
# Iterate over all algorithms
svd = SVD()
svdp = SVDpp()
slpo = SlopeOne()
nm  = NMF()
nmlp = NormalPredictor()
knnbase = KNNBaseline()
knnb = KNNBasic()
knnmean = KNNWithMeans()
knnzs = KNNWithZScore()
baseonly = BaselineOnly()
coclus = CoClustering()

for algorithm in [svd,svdp,slpo,nm,nmlp,knnbase,knnb,knnmean,knnzs,baseonly,coclus]:
    benchmark_inndividual = []
    print(algorithm,"started")
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE','MSE','MAE','FCP'], cv=3, verbose=False)
    print(algorithm,"finished")
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    name = str(algorithm).split(' ')[0].split('.')[-1]
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark_inndividual.append(tmp)
    benchmark.append(tmp)
    
    dfscores_individual = pd.DataFrame(benchmark_inndividual).set_index('Algorithm').sort_values('test_rmse')
    write(saved_models_folder + "/" + name + "_results.parq", dfscores_individual)
dfscores = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
write(saved_models_folder + "/" + "Others_Models_results.parq", dfscores)
playsound("src\sounds\Finished.mp3")

## Merge df resutls

In [16]:
df_others_results = pd.read_parquet(saved_models_folder + "/" + "Others_Models_results.parq", engine='fastparquet')
df_others_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785


In [17]:
df_KNNBasic_results = pd.read_parquet(saved_models_folder + "/" + "KNNBasic_results.parq", engine='fastparquet')
df_KNNBasic_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444


In [18]:
df_KNNBaseline_results = pd.read_parquet(saved_models_folder + "/" + "KNNBaseline_results.parq", engine='fastparquet')
df_KNNBaseline_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859


In [19]:
df_knn_results = pd.read_parquet(saved_models_folder + "/" + "KNN_Models_results.parq", engine='fastparquet')
df_knn_results.head(10)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209
KNNWithZScore,1.666752,2.778087,1.267275,0.468789,36.883558,2.03843


In [20]:
vertical_concat = pd.concat([df_others_results, df_KNNBasic_results,df_KNNBaseline_results,df_knn_results], axis=0)

In [21]:
vertical_concat.head(20)

Unnamed: 0_level_0,test_rmse,test_mse,test_mae,test_fcp,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SVD,1.410689,1.990173,1.091799,0.54983,1.127742,0.291177
SVDpp,1.413802,1.998886,1.098086,0.548991,1.091193,0.738526
BaselineOnly,1.423057,2.025098,1.101095,0.555097,0.32587,0.144077
CoClustering,1.582529,2.504417,1.207506,0.566946,4.946316,0.224878
SlopeOne,1.701391,2.894736,1.294262,0.458307,0.578457,0.31063
NormalPredictor,2.139694,4.578297,1.696845,0.4974,0.116917,0.156717
NMF,2.499246,6.246258,2.119292,0.553087,3.48473,0.298785
KNNBasic,1.643655,2.701632,1.275451,0.462612,35.4829,1.732444
KNNBaseline,1.498532,2.245621,1.156121,0.53585,32.409142,1.71859
KNNWithMeans,1.65163,2.727919,1.25824,0.466116,31.944902,1.796209


In [22]:
listatests =  ["test_rmse","test_mse","test_mae","test_fcp"]
for i in listatests:
    print ("the best result in",i,"is",vertical_concat.iloc[vertical_concat[i].argmin(), 0:1])

the best result in test_rmse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mse is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_mae is test_rmse    1.410689
Name: SVD, dtype: float64
the best result in test_fcp is test_rmse    1.701391
Name: SlopeOne, dtype: float64


## Evaluation selected model SVD

In [2]:
anime = pd.read_csv(data_folder + "/" + "anime.csv")
rating = pd.read_csv(data_folder + "/" + "rating.csv.zip")

anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

# performing Stratified Sampling With Pandas and Numpy to have a representative sample of the main df

import pandas as pd
import numpy as np

# lets say we wanted to shrink our data frame down to 125 rows, 
# with the same
# target class distribution

size = 100000


# using groupby and some fancy logic

rating_sample = ratingdf.groupby("rating", group_keys=False)\
                        .apply(lambda x: \
                         x.sample(int(np.rint(size*len(x)/len(ratingdf)))))\
                        .sample(frac=1).reset_index(drop=True)



# we can see that our sample is 199999 rows, 
# while our distribution follows as close as possible

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(rating_sample[['user_id', 'anime_id', 'rating']], reader)


In [11]:
from fastparquet import write 
from surprise import SVD,NormalPredictor
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)


gs.fit(data)
params = gs.best_params['rmse']

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

import pandas as pd  # noqa

results_df = pd.DataFrame.from_dict(gs.cv_results)


1.3770430362065416
{'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [13]:
# # Serialización del modelo
import pickle
joblib.dump(algo,saved_models_folder + "/" + "SVD_fit.pkl")

['c:\\Users\\christiandda\\Documents\\GitHub\\Anime_recommendation_systems-1\\src\\data\\saved_models/SVD_fit.pkl']

In [3]:
# Loading modelo
algo = joblib.load(saved_models_folder + "/" + "SVD_fit.pkl")

In [None]:
# import joblib
# anime_mapping = anime.copy()
# anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)
# #anime_mapping.set_index('anime_id',inplace=True) 
# 
# 
# from surprise import SVD
# trainset = data.build_full_trainset()
# svd.fit(trainset)
# 
# # Serialización del modelo
# joblib.dump(svd,saved_models_folder + "/" + "svd_fit_trained.pkl")

In [104]:
## Recomendacion by user Id and how many results
def reco_by_user(id,n):
    que_user = id
    chosen_user = anime_mapping.copy()
    chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: svd.predict(que_user, x).est)

    chosen_user = chosen_user.drop('anime_id', axis = 1)

    chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


    chosen_user.head(n)

reco_by_user(208,20)

Unnamed: 0,name,genre,type,Estimate_Score
73,Major S5,"Comedy, Drama, Romance, Sports",TV,10.0
720,The Law of Ueki,"Adventure, Comedy, Drama, Shounen, Supernatural",TV,10.0
250,Zankyou no Terror,"Psychological, Thriller",TV,10.0
518,Kokoro Connect,"Comedy, Drama, Romance, School, Slice of Life, Supernatural",TV,10.0
118,No Game No Life,"Adventure, Comedy, Ecchi, Fantasy, Game, Supernatural",TV,10.0
251,Darker than Black: Kuro no Keiyakusha,"Action, Mystery, Sci-Fi, Super Power",TV,10.0
738,Soukou Kihei Votoms,"Action, Drama, Mecha, Military, Sci-Fi, Space",TV,10.0
350,Major Movie: Yuujou no Winning Shot,"Action, Shounen, Sports",Movie,10.0
12,Gintama,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,10.0
777,Change!! Getter Robo: Sekai Saigo no Hi,"Action, Adventure, Horror, Mecha, Psychological, Sci-Fi, Shounen",OVA,10.0


In [60]:
filtered.columns

NameError: name 'filtered' is not defined

In [9]:
## Recomendacion by user Id, how many results and gender
def sort_it(que_user,df,n):
    df['Estimate_Score'] = df['anime_id'].apply(lambda x: algo.predict(que_user, x).est)
    df = df.sort_values('Estimate_Score', ascending=False).drop(['anime_id'], axis = 1)
    blankIndex=[''] * len(df)
    df.index=blankIndex 
    return df.head(n)

def reco_by_user(id,n,gen,typ):
    chosen_user = anime_mapping.copy()

    if (gen != "All") and (typ != "All"):
        filtered = chosen_user[chosen_user['genre'].str.contains(gen, regex=False, case=False, na=False)]
        filtered = filtered[filtered['type'].str.contains(typ, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (gen == "All") and (typ != "All"):
        filtered = chosen_user[chosen_user['type'].str.contains(typ, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (typ == "All") and (gen != "All"):
        filtered = chosen_user[chosen_user['genre'].str.contains(gen, regex=False, case=False, na=False)]
        return sort_it(id,filtered,n)

    elif  (typ == "All") and (gen == "All"):
        return sort_it(id,chosen_user,n)


reco_by_user(675,50,"Supernatural","All") #reco_by_user(1000,50,"Supernatural","All")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Estimate_Score'] = df['anime_id'].apply(lambda x: algo.predict(que_user, x).est)


Unnamed: 0,name,genre,type,Estimate_Score
,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.696857
,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supernatural",TV,9.544883
,Mushishi,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",TV,9.301881
,Kara no Kyoukai 7: Satsujin Kousatsu (Kou),"Action, Mystery, Romance, Supernatural, Thriller",Movie,9.250269
,Monogatari Series: Second Season,"Comedy, Mystery, Romance, Supernatural, Vampire",TV,9.104399
,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",TV,9.096138
,Suzumiya Haruhi no Shoushitsu,"Comedy, Mystery, Romance, School, Sci-Fi, Supernatural",Movie,9.086084
,Mushishi Zoku Shou: Suzu no Shizuku,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",Movie,9.081327
,JoJo no Kimyou na Bouken (TV),"Action, Adventure, Shounen, Supernatural, Vampire",TV,9.061229
,Fate/Zero 2nd Season,"Action, Fantasy, Supernatural, Thriller",TV,9.056614


## Evaluation selected SVD model (all dataset)

In [8]:
anime = pd.read_csv(data_folder + "/" + "anime.csv")
rating = pd.read_csv(data_folder + "/" + "rating.csv.zip")

anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)

ratingdf = rating[rating.rating>0]
ratingdf = ratingdf.reset_index()
ratingdf.drop('index', axis=1,inplace=True)
ratingdf.shape

#print(rating_sample["rating"].value_counts(normalize=True),f'\n\nlength of data {rating_sample.shape}')

reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(ratingdf[['user_id', 'anime_id', 'rating']], reader)


In [107]:
from fastparquet import write 
from surprise import SVD,NormalPredictor
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)


gs.fit(data)
params = gs.best_params['rmse']

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator["rmse"]
algo.fit(data.build_full_trainset())

import pandas as pd  # noqa

results_df = pd.DataFrame.from_dict(gs.cv_results)

write(saved_models_folder + "/" + "SVD_full_results.parq", results_df)
playsound("src\sounds\Finished.mp3")

#1.1341632727982356
#{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}

# # Serialización del modelo
import pickle
joblib.dump(algo,saved_models_folder + "/" + "SVD_full_fit.pkl")



1.1341632727982356
{'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [None]:
algo = joblib.load(saved_models_folder + "/" + "SVD_full_fit.pkl")

In [9]:
import joblib
anime_mapping = anime.copy()
anime_mapping.drop(['episodes','members','rating'],axis=1, inplace=True)
#anime_mapping.set_index('anime_id',inplace=True) 


from surprise import SVD
algo = 
trainset = data.build_full_trainset()
algo.fit(trainset)

NameError: name 'algo' is not defined

In [113]:
que_user = 208
chosen_user = anime_mapping.copy()
chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: algo.predict(que_user, x).est)

chosen_user = chosen_user.drop('anime_id', axis = 1)

chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


chosen_user.head(10)

Unnamed: 0,name,genre,type,Estimate_Score
11,Koe no Katachi,"Drama, School, Shounen",Movie,9.364396
480,Mahou Shoujo Lyrical Nanoha: The Movie 1st,"Action, Comedy, Drama, Magic, Super Power",Movie,9.350239
10,Clannad: After Story,"Drama, Fantasy, Romance, Slice of Life, Supernatural",TV,9.344733
42,Ansatsu Kyoushitsu (TV) 2nd Season,"Action, Comedy, School, Shounen",TV,9.246364
0,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,9.150686
373,K-On!!,"Comedy, Music, School, Slice of Life",TV,9.150479
16,Shigatsu wa Kimi no Uso,"Drama, Music, Romance, School, Shounen",TV,9.121163
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.118466
5,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,"Comedy, Drama, School, Shounen, Sports",TV,9.112615
194,Mahou Shoujo Lyrical Nanoha: The Movie 2nd A&#039;s,"Action, Comedy, Drama, Magic, Super Power",Movie,9.089225


In [114]:
que_user = 58145
chosen_user = anime_mapping.copy()
chosen_user['Estimate_Score'] = chosen_user['anime_id'].apply(lambda x: algo.predict(que_user, x).est)

chosen_user = chosen_user.drop('anime_id', axis = 1)

chosen_user = chosen_user.sort_values('Estimate_Score', ascending=False)


chosen_user.head(10)

Unnamed: 0,name,genre,type,Estimate_Score
7,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,9.588039
38,Monster,"Drama, Horror, Mystery, Police, Psychological, Seinen, Thriller",TV,9.154705
4,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.154511
2,Gintama°,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.086198
12,Gintama,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.045307
17,Mushishi Zoku Shou 2nd Season,"Adventure, Fantasy, Historical, Mystery, Seinen, Slice of Life, Supernatural",TV,9.02412
6,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,9.020174
9,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,9.009297
50,Yojouhan Shinwa Taikei,"Mystery, Psychological, Romance",TV,8.973086
111,Ashita no Joe 2,"Drama, Sports",TV,8.958461


## Opening svd_fit_trained.pkl

In [None]:
fichero = open(saved_models_folder + "/" + "svd_fit_trained.pkl","rb")
algo = pickle.load(fichero)fichero.close()

UnpicklingError: invalid load key, '\x02'.

### ‘Did you mean…?’ Trick

We often misspell a movie title. When we make misspellings while using Google to search something, Google asks us, ‘Did you mean…?’ in order to help our search. I apply Levenshtein Distance in order to implement this trick to the recommendation engine. This is a technique to calculate the distance between words. The fuzz class in fuzzywuzzy library can be used to implement the Levenshtein Distance in Python.

fuzz.ratio(a,b) calculates the Levenshtein Distance between a and b, and returns the score for the distance. If the two words, a and b, are exactly the same, the score becomes 100. As the distance between the words increases, the score falls.

The function find_closest_title() is supposed to return the most similar title in the data to the words a user types. Without this, the recommender only works when a user enters the exact title which the data has.