In [1]:
# Standard library imports
import os # allows access to OS-dependent functionalities
import re #  regular expression matching operations similar to those found in Perl
import sys # to manipulate different parts of the Python runtime environment
import warnings # is used to display the message Warning
import pickle # serializing and deserializing a Python object structure.

# Third party libraries
from fastparquet import write # parquet format, aiming integrate into python-based big data work-flows
from fuzzywuzzy import fuzz # used for string matching

import numpy as np # functions for working in domain of linear algebra, fourier transform, matrices and arrays
import pandas as pd # data analysis and manipulation tool
import joblib # set of tools to provide lightweight pipelining in Python

# deal with sparse data libraries
from scipy.sparse import csr_matrix # Returns a copy of column i of the matrix, as a (m x 1) CSR matrix (column vector).

# visualization
#import seaborn as sns # data visualization library based on matplotlib.
import matplotlib.pyplot as plt # collection of functions that make matplotlib work like MATLAB.

## scikit Preprocessing data libraries
from sklearn.preprocessing import MinMaxScaler # Transform features by scaling each feature to a given range.

## scikit Feature Extraction libraries
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts.

## scikit Pairwise metrics libraries
#implements utilities to evaluate pairwise distances or affinity of sets of samples.
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.metrics.pairwise import linear_kernel 

## scikit Cross validation iterators libraries
from sklearn.model_selection import GridSearchCV

# Unsupervised learner for implementing neighbor searches.
from sklearn.neighbors import NearestNeighbors

# setting display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Utils libraries

from utils import recommend


#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
raw_data = (data_folder + "\_raw")
processed_data = (data_folder + "\processed")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")



In [2]:
'''
Function to return the anime name that mtches de index number
'''
def from_index_to_title(index,df):
    anime = df
    return anime[anime.index == index]['name'].values[0]


'''
Function to return the matched index number of the anime name
'''
def from_title_to_index(title,df):
    anime = df
    return anime[anime["name"]==title].index.values[0]


'''
Function to find the closest title, It uses Levenshtein Distance to calculate the differences between sequences
'''
def match_the_score(a,b):
   return fuzz.ratio(a,b)


'''
Function to return the most similar title to the name a user typed
'''
def finding_the_closest_title(title,df):
    anime = df
    levenshtein_scores = list(enumerate(anime['name'].apply(match_the_score, b=title))) # Create a list wuth the matchin fuzz.ratio puntuation
    sorted_levenshtein_scores = sorted(levenshtein_scores, key=lambda x: x[1], reverse=True) # Sort from higher to lower the scores
    closest_title = from_index_to_title(sorted_levenshtein_scores[0][0],anime) # Getting the closest anime name/title
    distance_score = sorted_levenshtein_scores[0][1] # Getting the score
    return closest_title, distance_score

'''
This version of the function takes two lists as inputs: genres and types. 
If both lists have at least one value, the function filters the DataFrame 
to include only rows where the genre column matches one of the genres 
in the list and the type column matches one of the types in the list.
'''
def filtering(df, genres, types):
    all = df
    df['genre'] = df['genre'].str.split(', ')
    df = df.explode('genre')
    if genres and types:
        if "ALL" in genres and "ALL" in types:
            return all.drop_duplicates()
        elif "ALL" in genres:
            filtered = df[df['type'].isin(types)]
        elif "ALL" in types:
            filtered = df[df['genre'].isin(genres)]
        else:
            filtered = df[df['genre'].isin(genres) | df['type'].isin(types)]
        return filtered.drop_duplicates()
    elif genres:
        if "ALL" in genres:
            return all
        else:
            filtered = df[df['genre'].isin(genres)]
            return filtered.drop_duplicates()
    elif types:
        if "ALL" in types:
            return all
        else:
            filtered = df[df['type'].isin(types)]
            return filtered.drop_duplicates()
    else:
        return all

'''
Create dict of records with the filters selected - each row becomes a dictionary where key is column name and value is the data in the cell.
'''
def create_dict(names,gen,typ,n=200):
    #anime = joblib.load(processed_data  + "/" +  "_anime_to_compare_with_name.pkl")
    anime = pd.read_csv(processed_data + "/" + "_anime_to_compare_with_name.csv")# load anime df
    final_df = anime[anime['name'].isin(names)]
    final_df = final_df.drop(columns=["anime_id", "members"])
    blankIndex=[''] * len(final_df)
    final_df.index=blankIndex
    final_df = filtering(final_df,gen,typ)
    final_df = final_df.drop_duplicates(subset=["name"])
    final_df = final_df.drop_duplicates().head(n)
    if final_df.empty:
        sentence = print('WOW!!!! Sorry, there is no matches for the anime and options selected! \n Try again, you might have mroe luck')
        return sentence
    else:
        final_dict = final_df.to_dict('records')

        return final_dict

'''
A function that returns the names of the similar animes
for Unsupervised User content based recommendation system
'''
def print_similar_animes(query):
    ind = joblib.load(saved_models_folder + "/" + "model_based_content.pkl") # Load the trained model
    #anime = joblib.load(processed_data + "/" + "_anime_to_compare_with_name.pkl")
    anime = pd.read_csv(processed_data + "/" + "_anime_to_compare_with_name.csv")# load anime df
    closest_title, distance_score = finding_the_closest_title(query,anime) # find the closest title
       
    if distance_score == 100: # When a user does not make misspellings
        names = []
        errors = []
        print('These are the recommendations for similar animes to '+'\033[1m'+str(query)+'\033[0m'+'','\n')
        found_id = from_title_to_index(query,anime) # return the matched index number of the anime name
        array = ind[found_id][1:] # return the matched index number of the anime name that user did input
        indi = np.where(array==found_id) # return the position of the anime index that user did input (if it is in the list)
        array = np.delete(array, indi) # erase the anime index that matches the anime name that used did input
        #array = array[0:n] # print the number of anime recommendations that userd chosed
        for id in array:
            try :
                names.append(anime[anime.index == id]['name'].values[0])
            except IndexError :
                errors.append(id)
        return names

   # When a user makes misspellings    
    else:
        names = []
        errors = []
        print('I guess you misspelled the name\n Are you looking similitudes for the anime named '+'\033[1m'+str(closest_title)+'\033[0m'+'?','\n' + 'Here are the recommendations:')
        found_id = from_title_to_index(closest_title,anime) # return the matched index number of the anime name that user did input
        array = ind[found_id][1:] # create and array with anime indexes to recoomend according to the anime 
        indi = np.where(array==found_id) # return the position of the anime index that user did input (if it is in the list)
        array = np.delete(array, indi) # erase the anime index that matches the anime name that user did input
        #array = array[0:n] # print the number of anime recommendations that userd chosed
        for id in array:
            try :
                names.append(anime[anime.index == id]['name'].values[0])
            except IndexError :
                errors.append(id)
        return names

# Define the options for the multiselects
option_genre = ["ALL",'Drama', 'Romance', 'School', 'Supernatural', 'Action',
   'Adventure', 'Fantasy', 'Magic', 'Military', 'Shounen', 'Comedy',
   'Historical', 'Parody', 'Samurai', 'Sci-Fi', 'Thriller', 'Sports',
   'Super Power', 'Space', 'Slice of Life', 'Mecha', 'Music',
   'Mystery', 'Seinen', 'Martial Arts', 'Vampire', 'Shoujo', 'Horror',
   'Police', 'Psychological', 'Demons', 'Ecchi', 'Josei',
   'Shounen Ai', 'Game', 'Dementia', 'Harem', 'Cars', 'Kids',
   'Shoujo Ai', 'Hentai', 'Yaoi', 'Yuri']
option_type = ["ALL",'Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA']

gen = ["Shounen"]
typ = ['TV','Movie', 'OVA']

create_dict(print_similar_animes("Naruto"),gen,typ,30)

I guess you misspelled the name
 Are you looking similitudes for the anime named [1mnaruto[0m? 
Here are the recommendations:


[{'name': 'yakitate   japan',
  'english_title': 'Yakitate!! Japan',
  'japanses_title': '焼きたて!! ジャぱん',
  'genre': 'Comedy',
  'type': 'TV',
  'source': 'Manga',
  'duration': '24 min per ep',
  'episodes': 69.0,
  'rating': 'PG-13 - Teens 13 or older',
  'score': 7.92,
  'rank': 687.0,
  'synopsis': "While countries such as France, England, and Germany all have their own internationally celebrated bread, Japan simply does not have one that can match in reputation.\n\nThus after discovering the wonders of breadmaking at a young age, Kazuma Azuma embarks on a quest to create Japan's own unique national bread. And being blessed with unusually warm hands that allow dough to ferment faster, Azuma is able to bring his baking innovations to another level.\n\nAs he begins working at the prestigious Japanese bakery chain, Pantasia, Azuma encounters other talented bakers and experiences firsthand the competitive world of baking. Along with his newfound friends and rivals, Azuma strives to creat