In [3]:
#importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
#loading the dataset
anime_data = pd.read_csv('DATA/Anime_data.csv')

In [5]:
anime_data = anime_data.rename(columns={'Anime_id': 'anime_id', 'Title': 'title','Genre':'genres','Synopsis':'synopsis',
                                       'Type':'anime_type','Producer':'producer','Studio':'studio','Rating':'rating','ScoredBy':'scored_by','Popularity':'popularity',
                                       'Members':'members','Episodes':'num_of_episodes','Source':'anime_source'})

In [6]:
#creating the function to remove the characters
import re
def clean_up(s):
    return re.sub(r"[^\sa-zA-Z0-9\.\,\;]","",str(s))

In [7]:
anime_data['title'] = anime_data['title'].apply(clean_up)
anime_data['genres'] = anime_data['genres'].apply(clean_up)
anime_data['producer'] = anime_data['producer'].apply(clean_up)
anime_data['studio'] = anime_data['studio'].apply(clean_up)
anime_data['synopsis'] = anime_data['synopsis'].apply(clean_up)

In [8]:
anime_data.head(2)

Unnamed: 0,anime_id,title,genres,synopsis,anime_type,producer,studio,rating,scored_by,popularity,members,num_of_episodes,anime_source,Aired,Link
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...",TV,Bandai Visual,Sunrise,8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop Tengoku no Tobira,"Action, Space, Drama, Mystery, SciFi","Another day, another bountysuch is the life of...",Movie,"Sunrise, Bandai Visual",Bones,8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...


# Content based using genre, producer, source and type

In [9]:
#dropping the null rows
anime_data.dropna(axis = 0,how = 'any',inplace=True)

In [10]:
anime_data.isnull().sum()

anime_id           0
title              0
genres             0
synopsis           0
anime_type         0
producer           0
studio             0
rating             0
scored_by          0
popularity         0
members            0
num_of_episodes    0
anime_source       0
Aired              0
Link               0
dtype: int64

In [11]:
#selecting the features for this recommendation
features = ['genres','producer','studio','anime_source','anime_type']

In [12]:
# creating a function to split the values in the genre and producer column
def split_column(x):
    return x.split(', ')

In [13]:
#applying the function
anime_data['genres'] = anime_data['genres'].apply(split_column)
anime_data['producer'] = anime_data['producer'].apply(split_column)

In [12]:
anime_data.head(2)

Unnamed: 0,anime_id,title,genres,synopsis,anime_type,producer,studio,rating,scored_by,popularity,members,num_of_episodes,anime_source,Aired,Link
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, SciFi, Space]","In the year 2071, humanity has colonized sever...",TV,[Bandai Visual],Sunrise,8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop Tengoku no Tobira,"[Action, Space, Drama, Mystery, SciFi]","Another day, another bountysuch is the life of...",Movie,"[Sunrise, Bandai Visual]",Bones,8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...


In [13]:
#taking only the features we would be making use of
features = ['genres','producer','studio','anime_source','anime_type']
anime_rec1 = anime_data.copy()
anime_rec1 = anime_rec1[['title','genres','producer','studio','anime_source','anime_type']]
anime_rec1.head(2)

Unnamed: 0,title,genres,producer,studio,anime_source,anime_type
0,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, SciFi, Space]",[Bandai Visual],Sunrise,Original,TV
1,Cowboy Bebop Tengoku no Tobira,"[Action, Space, Drama, Mystery, SciFi]","[Sunrise, Bandai Visual]",Bones,Original,Movie


Striping spaces between the words in the feature so that the vectorizer would not count the Chris in 'Chris Hemsworth' and
'chris Evans' as the same. After the clean up they will be represemted as 'chrishemsworth' and 'chrisevans'.

In [14]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if nothing exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [15]:
# Apply clean_data function to your features.
for feature in features:
    anime_rec1[feature] = anime_rec1[feature].apply(clean_data)

In [16]:
anime_rec1.head(2)

Unnamed: 0,title,genres,producer,studio,anime_source,anime_type
0,Cowboy Bebop,"[action, adventure, comedy, drama, scifi, space]",[bandaivisual],sunrise,original,tv
1,Cowboy Bebop Tengoku no Tobira,"[action, space, drama, mystery, scifi]","[sunrise, bandaivisual]",bones,original,movie


Creating a soup which is a string that will contain all the features to be fed to the vectorizer

In [17]:
def create_soup(x):
    return ' '.join(x['genres']) + ' ' +' '.join(x['producer']) + ' ' + ' '.join(x['studio']) + ' '+\
                ' '.join(x['anime_source']) + ' '  + ' '.join(x['anime_type'])

In [18]:
# Create a new soup feature
anime_rec1['soup'] = anime_rec1.apply(create_soup, axis=1)

In [19]:
anime_rec1.head(2)

Unnamed: 0,title,genres,producer,studio,anime_source,anime_type,soup
0,Cowboy Bebop,"[action, adventure, comedy, drama, scifi, space]",[bandaivisual],sunrise,original,tv,action adventure comedy drama scifi space band...
1,Cowboy Bebop Tengoku no Tobira,"[action, space, drama, mystery, scifi]","[sunrise, bandaivisual]",bones,original,movie,action space drama mystery scifi sunrise banda...


We make use of the CountVectorizer library. This is to ensure there is no down weighting of any of the elemnets. Even if appeared more than once it will be considered. This will give us a matrix where each column represents a word in the overview vocabulary (all the words that appear in at least one document), and each row represents an anime, as before.

In [21]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime_rec1['soup'])

In [22]:
count_matrix.shape

(12706, 1120)

From the above, there are 1120 vocabularies in the metadata that you fed to it.
Next, we will use the cosine_similarity to measure the distance between the embeddings.

In [23]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim1 = cosine_similarity(count_matrix, count_matrix)

We create a function to get top 10 similar animes, based on it's features

In [24]:
#Construct a reverse map of indices and anime titles
indices = pd.Series(anime_rec1.index, index=anime_rec1['title']).drop_duplicates()

In [25]:
indices.head()

title
Cowboy Bebop                      0
Cowboy Bebop Tengoku no Tobira    1
Trigun                            2
Witch Hunter Robin                3
Bouken Ou Beet                    4
dtype: int64

In [26]:
# Function that takes in anime title as input and outputs most similar anime
def get_recommendations(title, cosine_sim=cosine_sim1):
    # Get the index of the anime that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar anime
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar anime
    return anime_rec1['title'].iloc[anime_indices]

# Testing with Naruto

In [27]:
recommendations = get_recommendations('Naruto')
print(recommendations)

1434                                     Naruto Shippuuden
5551               Naruto Shippuuden Movie 5  Blood Prison
5585     Naruto Honoo no Chuunin Shiken Naruto vs. Kono...
10145                                     D.Grayman Hallow
245                                                 Bleach
6483                             The Last Naruto the Movie
8671                               Boruto Naruto the Movie
11643                       Boruto Jump Festa 2016 Special
8483                                         Nano Invaders
2100                             Naruto Shippuuden Movie 1
Name: title, dtype: object


# Recommendation based on storyline

In [28]:
#making a copy of the data
anime_rec2 = anime_data.copy()
anime_rec2 = anime_rec2[['title','synopsis']]

In [29]:
anime_rec2.head(2)

Unnamed: 0,title,synopsis
0,Cowboy Bebop,"In the year 2071, humanity has colonized sever..."
1,Cowboy Bebop Tengoku no Tobira,"Another day, another bountysuch is the life of..."


In [30]:
#converting the synopsis column to word vector
from sklearn.feature_extraction.text import TfidfVectorizer

# Using Abhishek Thakur's arguments for TF-IDF
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Filling NaNs with empty string
anime_rec2['synopsis'] = anime_rec2['synopsis'].fillna('')

# Fitting the TF-IDF on the 'overview' text
tfv_matrix = tfv.fit_transform(anime_rec2['synopsis'])

tfv_matrix.shape

(12706, 29021)

So about 29,000 unique words were used in the plot summaries to describe our 12,000 movies

In [31]:
#calculating similarity scores
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix using linear kernel
cos_sim2 = linear_kernel(tfv_matrix, tfv_matrix)

In [32]:
cos_sim2.shape

(12706, 12706)

We define a function that takes in an anime title as an input and outputs a list of the 10 most similar animes. Firstly, for this, you need a reverse mapping of anime titles and DataFrame 

In [33]:
# Reverse mapping of indices and movie titles
indices = pd.Series(anime_rec2.index, index=anime_rec2['title']).drop_duplicates()

In [34]:
# Credit to Ibtesam Ahmed for the skeleton code
def get_recommendations(title, sig=cos_sim2):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the movies 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:11]

    # Movie indices
    movie_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    return anime_rec2['title'].iloc[movie_indices]

# Testing with Naruto

In [35]:
recommendations = get_recommendations('Naruto')
print(recommendations)

1434                                    Naruto Shippuuden
8671                              Boruto Naruto the Movie
6003       Naruto SD Rock Lee no Seishun FullPower Ninden
504     Naruto Takigakure no Shitou  Ore ga Eiyuu Datt...
6483                            The Last Naruto the Movie
3240         Naruto Shippuuden  Shippuu Konoha Gakuen Den
5551              Naruto Shippuuden Movie 5  Blood Prison
5585    Naruto Honoo no Chuunin Shiken Naruto vs. Kono...
2100                            Naruto Shippuuden Movie 1
4139    Naruto Shippuuden Movie 3  Hi no Ishi wo Tsugu...
Name: title, dtype: object


In [36]:
#writing a function to get recommendations using both storyline and features based on 50% for both
def get_recommendations_both(title, cos_sim1, cos_sim2):
  # Get the index of the movie that matches the title
  idx = indices[title]

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores1 = list(enumerate(cos_sim1[idx]))

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores2 = list(enumerate(cos_sim2[idx]))

  #Getting the average of both similarity scores
  sim_scores_avg = [(sim_scores1[i][0],(sim_scores1[i][1] + sim_scores2[i][1])/2) for i in range(len(sim_scores1))]

  # Sort the movies based on the similarity scores
  sim_scores_avg = sorted(sim_scores_avg, key=lambda x: x[1], reverse=True)

  #Get the scores of the 10 most similar movies
  sim_scores_avg = sim_scores_avg[1:11]

  #Get the movie indices
  anime_indices = [i[0] for i in sim_scores_avg]

  #Return the top 10 most similar movies
  return anime_data['title'].iloc[anime_indices]

In [38]:
get_recommendations_both('Naruto',cosine_sim1,cos_sim2)

1434                                     Naruto Shippuuden
8671                               Boruto Naruto the Movie
5551               Naruto Shippuuden Movie 5  Blood Prison
5585     Naruto Honoo no Chuunin Shiken Naruto vs. Kono...
6483                             The Last Naruto the Movie
504      Naruto Takigakure no Shitou  Ore ga Eiyuu Datt...
2100                             Naruto Shippuuden Movie 1
6003        Naruto SD Rock Lee no Seishun FullPower Ninden
10145                                     D.Grayman Hallow
4139     Naruto Shippuuden Movie 3  Hi no Ishi wo Tsugu...
Name: title, dtype: object