### Content Based with new Anime Dataset

Importing neccessary libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
anime_new = pd.read_csv('/content/drive/My Drive/anime data/Anime_data.csv')
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [3]:
len(anime_new)

17002

Feature columns have already been split into a list containing them

In [4]:
anime_new['Synopsis'].head()

0    In the year 2071, humanity has colonized sever...
1    Another day, another bounty—such is the life o...
2    Vash the Stampede is the man with a $$60,000,0...
3    Witches are individuals with special powers li...
4    It is the dark century and the people are suff...
Name: Synopsis, dtype: object

Applying vectorization to the plot summary to get other similar animes, using tf-idf vectorizer

In [5]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
anime_new['Synopsis'] = anime_new['Synopsis'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(anime_new['Synopsis'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(17002, 43913)

With this matrix in hand, we compute similarity scores using cosine similarity

In [6]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

We define a function that takes in an anime title as an input and outputs a list of the 10 most similar animes. Firstly, for this, you need a reverse mapping of anime titles and DataFrame 

In [7]:
#Construct a reverse map of indices and anime titles
indices = pd.Series(anime_new.index, index=anime_new['Title']).drop_duplicates()

In [8]:
indices.head()

Title
Cowboy Bebop                       0
Cowboy Bebop: Tengoku no Tobira    1
Trigun                             2
Witch Hunter Robin                 3
Bouken Ou Beet                     4
dtype: int64

In [9]:
# Function that takes in anime title as input and outputs most similar anime
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar anime
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar anime
    return anime_new['Title'].iloc[anime_indices]

In [10]:
get_recommendations('Death Note')

2525              Death Note: Rewrite
2915                       Soul Eater
645              Shinigami no Ballad.
12227              Dia Horizon (Kabu)
15808      YAT Anshin! Uchuu Ryokou 2
16618                  Kite Liberator
4229      Ayatsuri Haramase DreamNote
466                   Yami no Matsuei
630      Bleach: Memories in the Rain
1082           Yami no Shihosha Judge
Name: Title, dtype: object

Similarity using genre, producer, studio, source 




In [11]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [12]:
anime_new.dtypes

Anime_id        int64
Title          object
Genre          object
Synopsis       object
Type           object
Producer       object
Studio         object
Rating        float64
ScoredBy      float64
Popularity    float64
Members       float64
Episodes      float64
Source         object
Aired          object
Link           object
dtype: object

In [13]:
#converting necessary columns to strings for processing
anime_new['Genre'] = anime_new["Genre"].astype('str')
anime_new['Producer'] = anime_new["Producer"].astype('str')
anime_new['Studio'] = anime_new["Studio"].astype('str')

In [14]:
# Function to convert all lists into strings for splitting into lists
import re 
def clean_data_lists(s):
    return re.sub(r"[^\sa-zA-Z0-9\.\,\;]","",str(s))

In [15]:
# Apply clean_data function to your features.
features = ['Genre', 'Producer', 'Studio', 'Source']

for feature in features:
    anime_new[feature] = anime_new[feature].apply(clean_data_lists)

NB: we ignore the spaces between producer names and studio names so that the vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same. That is just an example

In [16]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...",TV,Bandai Visual,Sunrise,8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Space, Drama, Mystery, SciFi","Another day, another bounty—such is the life o...",Movie,"Sunrise, Bandai Visual",Bones,8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",TV,Victor Entertainment,Madhouse,8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [17]:
#creating a function to split the particular columns into lists again
def split_columns(x):
  return x.split(',')

In [18]:
#applying the function to the same set of columns
for feature in features:
    anime_new[feature] = anime_new[feature].apply(split_columns)

In [19]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, SciFi, ...","In the year 2071, humanity has colonized sever...",TV,[Bandai Visual],[Sunrise],8.81,363889.0,39.0,704490.0,26.0,[Original],"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Space, Drama, Mystery, SciFi]","Another day, another bounty—such is the life o...",Movie,"[Sunrise, Bandai Visual]",[Bones],8.41,111187.0,475.0,179899.0,1.0,[Original],"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"[Action, SciFi, Adventure, Comedy, Drama, ...","Vash the Stampede is the man with a $$60,000,0...",TV,[Victor Entertainment],[Madhouse],8.31,197451.0,158.0,372709.0,26.0,[Manga],"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [20]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if source exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [21]:
#applying the clean_data function
for feature in features:
    anime_new[feature] = anime_new[feature].apply(clean_data)

In [22]:
def create_soup(x):
    return ' '.join(x['Genre']) + ' ' + ' '.join(x['Producer']) + ' ' + ' '.join(x['Studio']) + ' ' + ' '.join(x['Source'])

In [23]:
# Create a new soup feature
anime_new['soup'] = anime_new.apply(create_soup, axis=1)

In [24]:
anime_new['soup'].head()

0    action adventure comedy drama scifi space band...
1    action space drama mystery scifi sunrise banda...
2    action scifi adventure comedy drama shounen vi...
3    action magic police supernatural drama mystery...
4    adventure fantasy shounen supernatural nan toe...
Name: soup, dtype: object

In [25]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime_new['soup'])

In [26]:
count_matrix.shape

(17002, 1382)

In [27]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

Using the get_recommendations function for the new cosine similarity

In [28]:
get_recommendations('Death Note', cosine_sim2) #based on features

2525                          Death Note: Rewrite
2516                   Majin Tantei Nougami Neuro
9                                         Monster
3541                              Mouryou no Hako
2531      Gyakkyou Burai Kaiji: Ultimate Survivor
3067                   Top Secret: The Revelation
5426          Gyakkyou Burai Kaiji: Hakairoku-hen
5725                       Hunter x Hunter (2011)
7119    Hunter x Hunter Movie 2: The Last Mission
8552                                 Death Parade
Name: Title, dtype: object

In [29]:
get_recommendations('Death Note', cosine_sim) #based on storyline

2525              Death Note: Rewrite
2915                       Soul Eater
645              Shinigami no Ballad.
12227              Dia Horizon (Kabu)
15808      YAT Anshin! Uchuu Ryokou 2
16618                  Kite Liberator
4229      Ayatsuri Haramase DreamNote
466                   Yami no Matsuei
630      Bleach: Memories in the Rain
1082           Yami no Shihosha Judge
Name: Title, dtype: object

In [30]:
#writing a function to get recommendations using both storyline and features based on 50% for both
def get_recommendations_both(title, cos_sim1, cos_sim2):
  # Get the index of the movie that matches the title
  idx = indices[title]

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores1 = list(enumerate(cos_sim1[idx]))

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores2 = list(enumerate(cos_sim2[idx]))

  #Getting the average of both similarity scores
  sim_scores_avg = [(sim_scores1[i][0],(sim_scores1[i][1] + sim_scores2[i][1])/2) for i in range(len(sim_scores1))]

  # Sort the movies based on the similarity scores
  sim_scores_avg = sorted(sim_scores_avg, key=lambda x: x[1], reverse=True)

  #Get the scores of the 10 most similar movies
  sim_scores_avg = sim_scores_avg[1:11]

  #Get the movie indices
  anime_indices = [i[0] for i in sim_scores_avg]

  #Return the top 10 most similar movies
  return anime_new['Title'].iloc[anime_indices]


In [31]:
get_recommendations_both('Death Note',cosine_sim,cosine_sim2)

2525                          Death Note: Rewrite
2516                   Majin Tantei Nougami Neuro
9                                         Monster
8552                                 Death Parade
2531      Gyakkyou Burai Kaiji: Ultimate Survivor
3541                              Mouryou no Hako
5426          Gyakkyou Burai Kaiji: Hakairoku-hen
5725                       Hunter x Hunter (2011)
3067                   Top Secret: The Revelation
7119    Hunter x Hunter Movie 2: The Last Mission
Name: Title, dtype: object