# Unsupervised content based recommendation system

## Import Libraries

In [25]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# deal with sparse data libraries
from scipy.sparse import csr_matrix # Returns a copy of column i of the matrix, as a (m x 1) CSR matrix (column vector).

# visualization
#import seaborn as sns # data visualization library based on matplotlib.
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

## scikit Preprocessing data libraries
from sklearn.preprocessing import MinMaxScaler # Transform features by scaling each feature to a given range.

## scikit Feature Extraction libraries
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts.

## scikit Pairwise metrics libraries
#implements utilities to evaluate pairwise distances or affinity of sets of samples.
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel 

## scikit Cross validation iterators libraries
from sklearn.model_selection import GridSearchCV

# Unsupervised learner for implementing neighbor searches.
from sklearn.neighbors import NearestNeighbors

# setting display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)


# Get the current working directory
cwd = os.getcwd()

# Add the path of the utils directory to sys.path
utils_path = os.path.abspath(os.path.join(cwd, '..', 'utils'))
sys.path.append(utils_path)

# Utils libraries
from cleaning import *
from recommend import *
from testing import *
from training import *

#Preparing folder variables

main_folder = os.path.abspath(os.path.join(os.pardir))
data_folder = (main_folder + "/" +"data")
saved_models_folder = (data_folder + "/" + "saved_models")
raw_data = (data_folder + "/" + "_raw")
processed_data = (data_folder + "/" + "processed")
content_based_supervised_data = (main_folder + "/" + "processed" + "/" + "content_based_supervised")

## Cleaning and preparing the data

In [26]:
anime = pd.read_csv(raw_data + "/" + "anime.csv")
rating = pd.read_csv(raw_data + "/" + "rating.csv.zip")

In [27]:
print(clean_anime_df.__doc__)

The function clean_anime_df() takes an anime dataframe as input and performs several 
    cleaning and preprocessing steps, such as removing special characters from anime names, 
    converting all names to lowercase, filling missing values for "episodes" and "score" 
    columns with their median, dropping rows with null values for "genre" or "type" columns, 
    and saving the cleaned dataframe to a CSV file. The cleaned dataframe is also returned as output.


# Cleand data

Steps:
- Cambiamos a minúsculas todos los nombre de animes

In [28]:
# let's check the result of this cleaning process.
print(((anime.isnull().sum() / len(anime))*100).sort_values(ascending = False))
print(f"Total number of records: {len(anime)}")

score             14.048029
rank               9.572986
synopsis           2.475207
rating             1.614622
genre              0.499959
type               0.172117
japanses_title     0.155725
anime_id           0.000000
name               0.000000
english_title      0.000000
source             0.000000
duration           0.000000
episodes           0.000000
members            0.000000
cover              0.000000
dtype: float64
Total number of records: 12201


The columns rank, synopsis and japanses_title will only be use for showing the results of the recommendation, so we do not care about this 3 columns.

We will deal with the others

In [29]:
# Checking unique values in rating column
anime['rating'].str.split(',').explode().value_counts()

PG-13 - Teens 13 or older         4530
G - All Ages                      3415
Rx - Hentai                       1122
PG - Children                     1109
R+ - Mild Nudity                   930
R - 17+ (violence & profanity)     898
Name: rating, dtype: int64

In [30]:
# Checking unique values in rating column
anime['source'].str.split(',').explode().value_counts()

Manga           3106
Original        2953
Unknown         2777
Visual novel     934
Light novel      538
Game             522
Other            343
Novel            314
4-koma manga     207
Music            163
Web manga         96
Book              89
Picture book      86
Card game         40
Mixed media       23
Radio             10
Name: source, dtype: int64

In [31]:
# Checking unique values in rating column
anime['genre'].str.split(',').explode().value_counts()

Comedy            3172
Action            2828
 Sci-Fi           1969
 Fantasy          1800
 Shounen          1645
 Comedy           1439
Adventure         1438
 Romance          1364
 Kids             1198
 School           1165
 Slice of Life    1112
 Drama            1022
 Supernatural     1020
Drama              979
Hentai             945
 Adventure         885
 Mecha             815
 Magic             714
 Historical        633
 Shoujo            588
 Ecchi             570
 Seinen            532
 Sports            496
Fantasy            492
 Super Power       462
 Music             453
 Mystery           420
 Military          405
Music              402
Kids               394
 Parody            388
 Space             377
 Horror            294
 Harem             275
 Martial Arts      260
 Demons            209
 Psychological     206
Dementia           203
 Police            195
 Hentai            194
Historical         169
 Samurai           145
 Game              143
Mecha      

In [32]:
# Checking unique values in rating column
anime['source'].str.split(',').explode().value_counts()

Manga           3106
Original        2953
Unknown         2777
Visual novel     934
Light novel      538
Game             522
Other            343
Novel            314
4-koma manga     207
Music            163
Web manga         96
Book              89
Picture book      86
Card game         40
Mixed media       23
Radio             10
Name: source, dtype: int64

In [33]:
anime_cleaned = clean_anime_df(anime)# from cleaning.py
anime_cleaned.head(1)

Unnamed: 0,anime_id,name,english_title,japanses_title,genre,source,duration,episodes,rating,score,rank,members,synopsis,cover,type
0,1,cowboy bebop,Cowboy Bebop,カウボーイビバップ,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Original,24 min per ep,26,R - 17+ (violence & profanity),8.75,40.0,486824.0,"Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\r\n\r\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.\r\n\r\nWhile developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past. As a rival's maniacal plot continues to unravel, Spike must choose between life with his newfound family or revenge for his old wounds.\r\n\r\n[Written by MAL Rewrite]",https://cdn.myanimelist.net/images/anime/4/19644l.jpg,TV


In [34]:
# let's check the result of this cleaning process.
print(((anime_cleaned.isnull().sum() / len(anime_cleaned))*100).sort_values(ascending = False))
print(f"Total number of records: {len(anime_cleaned)}")

rank              9.572986
synopsis          2.475207
rating            1.614622
japanses_title    0.155725
anime_id          0.000000
name              0.000000
english_title     0.000000
genre             0.000000
source            0.000000
duration          0.000000
episodes          0.000000
score             0.000000
members           0.000000
cover             0.000000
type              0.000000
dtype: float64
Total number of records: 12201


In [35]:
anime_cleaned.shape

(12201, 15)

In [36]:
# Since the columns Sources is categorical, then we should use DecisionTreeClassifier. 
# This algorithm will create a decision tree model that can predict the categorical target variable.

In [37]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import accuracy_score
# 
# anime_cleaned = predict_source(anime_cleaned)

In [38]:
# Checking unique values in rating column
anime_cleaned['source'].str.split(',').explode().value_counts()

Manga           4436
Original        4215
Visual novel    1055
Light novel      560
Game             535
Other            347
Novel            320
4-koma manga     212
Music            163
Web manga        101
Book              96
Picture book      88
Card game         40
Mixed media       23
Radio             10
Name: source, dtype: int64

In [39]:
anime_cleaned.head(1)

Unnamed: 0,anime_id,name,english_title,japanses_title,genre,source,duration,episodes,rating,score,rank,members,synopsis,cover,type
0,1,cowboy bebop,Cowboy Bebop,カウボーイビバップ,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Original,24 min per ep,26,R - 17+ (violence & profanity),8.75,40.0,486824.0,"Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\r\n\r\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward Wong Hau Pepelu Tivrusky IV, and Ein, a bioengineered Welsh Corgi.\r\n\r\nWhile developing bonds and working to catch a colorful cast of criminals, the Bebop crew's lives are disrupted by a menace from Spike's past. As a rival's maniacal plot continues to unravel, Spike must choose between life with his newfound family or revenge for his old wounds.\r\n\r\n[Written by MAL Rewrite]",https://cdn.myanimelist.net/images/anime/4/19644l.jpg,TV


In [40]:
anime_features = prepare_supervised_content_based(anime_cleaned) # from cleaning.py

In [41]:
anime_features.head(1)

Unnamed: 0,episodes,score,members,Action,Adventure,Comedy,Drama,Dementia,Mecha,Historical,School,Hentai,Horror,Demons,Ecchi,Fantasy,Shounen,Game,Mystery,Cars,Magic,Romance,Sci-Fi,Harem,Kids,Shoujo,Military,Super Power,Martial Arts,Music,Slice of Life,Sports,Supernatural,Parody,Vampire,Psychological,Samurai,Yaoi,Seinen,Josei,Thriller,Space,Shounen Ai,Police,Yuri,Shoujo Ai,Movie,Music.1,ONA,OVA,Special,TV
0,26,8.75,486824.0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [42]:
anime_features.shape

(12201, 52)

In [43]:
min_max = MinMaxScaler()
min_max_features = min_max.fit_transform(anime_features)

In [44]:
min_max_features.shape

(12201, 52)

In [45]:
np.round(min_max_features,2)

array([[0.01, 0.95, 0.48, ..., 0.  , 0.  , 1.  ],
       [0.  , 0.9 , 0.14, ..., 0.  , 0.  , 0.  ],
       [0.01, 0.88, 0.28, ..., 0.  , 0.  , 1.  ],
       ...,
       [0.  , 0.64, 0.  , ..., 0.  , 0.  , 1.  ],
       [0.  , 0.63, 0.  , ..., 0.  , 0.  , 1.  ],
       [0.  , 0.63, 0.  , ..., 0.  , 0.  , 0.  ]])

## Finding the best parameters for NearestNeighbors model

In [46]:
param_NearestNeighbors(min_max_features)



{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'n_neighbors': 1,
 'p': 1,
 'radius': 0.0}

## Building model with the best parameters

In [47]:
model_NearestNeighbors(min_max_features)

array([[    0,   376,  2016, ...,  9644,  1281,    15],
       [    1,  7607,  3154, ...,  1190,  6662, 11631],
       [    2,  3940,  3409, ...,  7301,   209,  7156],
       ...,
       [12198,   626,  3292, ...,   926,  2040,  2027],
       [12199,  3603,  9892, ...,  7371,  4439,  7063],
       [12200,  8938,  9210, ...,  4030,  9857, 10645]], dtype=int64)

## Get recommendations

In [48]:
# We can get the recommendation as a dictionary
# We selec the name of the anime we want to find similitudes
# Then the genre we want (or write "All")
# Then the type we want (or write "All")
# Then the number of suggestions we have(we might get less if there not so many o none if there is no matches)

create_dict(print_similar_animes("Naruto"),["Shounen"],["TV"],"or",20)

I guess you misspelled the name
 Are you looking similitudes for the anime named [1mnaruto[0m? 
Here are the recommendations:
or


[{'name': 'eyeshield 21',
  'english_title': 'Eyeshield 21',
  'japanses_title': 'アイシールド21',
  'genre': 'Shounen',
  'type': 'TV',
  'source': 'Manga',
  'duration': '23 min per ep',
  'episodes': 145.0,
  'rating': 'PG-13 - Teens 13 or older',
  'score': 7.92,
  'rank': 702.0,
  'synopsis': 'Shy, reserved, and small-statured, Deimon High School student Sena Kobayakawa is the perfect target for bullies. However, as a result of running errands throughout his life, Sena has become agile and developed a skill for avoiding crowds of people. After the cunning Youichi Hiruma—captain of the Deimon Devil Bats football team—witnesses Sena\'s rapid legs in motion, he coerces the timid boy into joining his squad.\r\n\r\nAs Hiruma wants to conceal Sena\'s identity from other clubs, Sena is forced to hide under the visored helmet of "Eyeshield 21," a mysterious running back wearing the number 21 jersey. The legendary Eyeshield 21 can supposedly run at the speed of light and has achieved remarkable 