In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel, linear_kernel
import string

In [2]:
# Load the anime table from CSV; the path is relative to the notebook's working directory.
DATASET_PATH = "datasets/clean_dataset.csv"
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first five rows to check the columns that were loaded.
data.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,kimi no na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,fullmetal alchemist brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,steinsgate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Peek at 30 random genre strings. The fixed seed keeps the displayed sample
# identical across "Restart & Run All" executions.
data['genre'].sample(30, random_state=42)

349      Action, Adventure, Comedy, Fantasy, Shounen, S...
5190                                 Comedy, Slice of Life
7471                                       Fantasy, Horror
5046                        Comedy, Fantasy, Magic, Parody
1027                               Romance, School, Shoujo
9534                                    Adventure, Fantasy
1417                       Action, Adventure, Martial Arts
352                            Action, Sci-Fi, Super Power
3224     Action, Adventure, Comedy, Sci-Fi, Shounen, Sp...
520                                Comedy, Romance, School
2964                      Action, Adventure, Sci-Fi, Space
10712                                               Hentai
7364           Action, Adventure, Drama, Military, Romance
7837                                         Comedy, Music
9999                                                Hentai
8890                                     Drama, Historical
606      Action, Comedy, Martial Arts, Shounen, Super P.

# Content-based Recommender

#### Creating a TF-IDF vectorizer instance and transforming the genre feature

In [5]:
# TF-IDF configuration for the comma-separated genre strings.
vectorizer = TfidfVectorizer(
    stop_words='english',      # drop common English words from the token stream
    min_df=2,                  # ignore terms that occur in fewer than two rows
    ngram_range=(1, 3),        # use unigrams, bigrams and trigrams as features
    strip_accents='unicode',   # fold accented characters before tokenising
    token_pattern=r"\w+",      # any run of word characters is a token (single letters included)
)

In [6]:
# Learn the vocabulary from the genre column and encode every row as a TF-IDF vector.
vec  = vectorizer.fit_transform(data['genre'])

In [7]:
# (number of anime rows, number of TF-IDF features)
vec.shape

(10881, 1848)

#### Creating the pairwise similarity matrices (cosine and sigmoid) for the anime

In [8]:
# linear_kernel returns plain dot products. They equal cosine similarities here
# because TfidfVectorizer L2-normalises each row by default and `norm` is not
# overridden when `vectorizer` is created.
# NOTE(review): the result is a dense n_rows x n_rows float matrix (roughly
# 0.9 GB for 10881 rows at 8 bytes per value) - confirm memory is acceptable.
cosine_sim_matrix = linear_kernel(vec,vec)

In [9]:
# Sigmoid kernel: tanh(gamma * <x, y> + coef0) with scikit-learn's default
# gamma and coef0. This is the matrix `recommend` uses by default.
# NOTE(review): tanh is monotonic, so this should rank neighbours in the same
# order as the dot-product matrix; keeping both doubles the dense-matrix
# memory - confirm both are needed.
sigmoid_sim_matrix = sigmoid_kernel(vec,vec)

In [10]:
# Map each anime title to its row label in `data`.
# Series.drop_duplicates() compares *values*, and the values here are the
# unique row labels, so it never removed repeated titles (e.g. two "gintama"
# rows). De-duplicate on the index instead and keep the first row of each
# title, so that `indices[title]` always yields a single integer.
indices = pd.Series(data.index, index=data['name'])
indices = indices[~indices.index.duplicated(keep='first')]

#### Creating the recommend function:

In [13]:
def clean(text):
    """Normalise an anime title so it can be looked up in ``indices``.

    Every ASCII punctuation character, the degree sign and the digits
    ``0``, ``3`` and ``9`` are deleted, then the result is lower-cased.
    HTML entities are not decoded: only their punctuation is stripped, so
    ``&amp;`` becomes ``amp``.

    This is exactly what the previous implementation produced. Its
    ``re.sub`` calls could never match, because the characters they looked
    for had already been removed, and they required an ``re`` import that
    this notebook does not have.

    NOTE(review): deleting 0/3/9 looks accidental (the original membership
    string ended in ``'039'``). The sample output of ``recommend`` shows
    titles such as ``toriko jump super anime tour 2 special``, which
    suggests the names in the dataset were normalised with the same rule.
    Confirm before changing it, otherwise lookups will stop matching.

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    str
        The normalised title.
    """
    # Same character set as the original per-character loop, removed in one pass.
    removed = string.punctuation + u'\N{DEGREE SIGN}' + '039'
    return text.translate(str.maketrans('', '', removed)).lower()

def recommend(name, sim=sigmoid_sim_matrix):
    """Return up to ten anime whose genre profile is most similar to ``name``.

    Parameters
    ----------
    name : str
        Anime title. It is normalised with ``clean`` before being looked up
        in ``indices``.
    sim : 2-D array-like, optional
        Pairwise similarity matrix addressed by row position in ``data``.
        Defaults to ``sigmoid_sim_matrix`` (bound when the function is defined).

    Returns
    -------
    pandas.DataFrame or int
        A frame with the columns ``Anime name``, ``Episodes`` and ``Rating``,
        ordered from most to least similar. ``0`` is returned when ``name``
        is not a string or the cleaned title is not present in ``indices``.
        Any other error now propagates instead of being turned into ``0``.
    """
    # Keep the historical "0 means no result" contract for unusable input.
    if not isinstance(name, str):
        return 0

    try:
        index = indices[clean(name)]
    except KeyError:
        # Unknown title.
        return 0

    # When several rows share a title the lookup yields a Series; use the
    # first matching row instead of failing later in the sort.
    if isinstance(index, pd.Series):
        if index.empty:
            return 0
        index = index.iloc[0]
    index = int(index)

    # Rank every row by its similarity to the requested anime (highest first).
    ranked = sorted(enumerate(sim[index]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried row by position rather than assuming it sorts first:
    # rows with tied scores may precede it in the stable sort.
    anime_indices = [row for row, _ in ranked if row != index][:10]

    # Top 10 most similar anime
    return pd.DataFrame({'Anime name': data['name'].iloc[anime_indices].values,
                         'Episodes': data['episodes'].iloc[anime_indices].values,
                         'Rating': data['rating'].iloc[anime_indices].values})

In [16]:
# Query recommendations for one title; `recommend` returns 0 instead of a
# DataFrame when the lookup fails.
results = recommend("Bleach")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# Display the recommendations computed in the previous cell.
results

Unnamed: 0,Anime name,Episodes,Rating
0,bleach movie 4 jigokuhen,1,7.75
1,bleach movie fade to black kimi no na wo yobu,1,7.66
2,katekyo hitman reborn,203,8.37
3,toriko jump super anime tour 2 special,1,7.13
4,toriko barbarian ivy wo hokaku seyo,1,6.85
5,naruto takigakure no shitou ore ga eiyuu datt...,1,6.83
6,codebreaker,13,7.03
7,tokyo esp,12,6.64
8,katekyo hitman reborn x ēldlive special,1,7.1
9,tokyo ravens,24,7.8
