### Content Based Recommender

Importing neccessary libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
anime = pd.read_csv('/content/drive/My Drive/anime data/anime.csv')
ratings = pd.read_csv('/content/drive/My Drive/anime data/rating.csv')

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Cleaning up the name column from non-alphanumeric characters


In [None]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [None]:
#converting the name column into string datatype
anime['name'] = anime['name'].astype('str')

In [None]:
#creating the function to remove the characters
import re
def clean_up(s):
  new_s = re.sub(r"[^\sa-zA-Z0-9\.\,\;]","",str(s))
  return new_s


In [None]:
anime['name'] = anime['name'].apply(clean_up)

In [None]:
#renaming rating in anime df rating to average rating
anime.rename(columns={'rating':'average_rating'}, inplace=True)

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,average_rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [None]:
len(ratings)

7813737

In [None]:
#dropping all rows with rating value of -1. ie. the user_id did not rate the anime
ratings = ratings[ratings['rating'] != -1]
len(ratings)

6337241

In [None]:
#renaming rating in anime df rating to average rating
anime.rename(columns={'rating':'average_rating'}, inplace=True)

### Logic for Content Based

**Future Findings**
1. Getting the anime overview description. ie summary of the plot description
2. Getting 3 major casts of the anime
3. Getting directors of the anime

In [None]:
# creating a function to split the values in the genre column
def split_genre(x):
  return x.split(', ')


In [None]:
#converting the name column into string datatype
anime['genre'] = anime['genre'].astype('str')

In [None]:
#converting the name column into string datatype
anime['type'] = anime['type'].astype('str')

In [None]:
#applying the function
anime['genre'] = anime['genre'].apply(split_genre)

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,average_rating,members
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665
2,28977,Gintama,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572
4,9969,Gintama039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266


In [None]:
#taking only the features we would be making use of
features = ['name','genre','type']
anime = anime[features]

In [None]:
anime.head(3)

Unnamed: 0,name,genre,type
0,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie
1,Fullmetal Alchemist Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV
2,Gintama,"[Action, Comedy, Historical, Parody, Samurai, ...",TV


The next step would be to convert the names and keyword instances into lowercase and strip all the spaces between them.

In [None]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if nothing exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [None]:
# Apply clean_data function to your features.
features = ['genre', 'type']

for feature in features:
    anime[feature] = anime[feature].apply(clean_data)

In [None]:
anime.head(3)

Unnamed: 0,name,genre,type
0,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie
1,Fullmetal Alchemist Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv
2,Gintama,"[action, comedy, historical, parody, samurai, ...",tv


Finally, we create a metadata soup, in which we feed the vectorizer. This metadata are strings separated by whitespace

In [None]:
def create_soup(x):
    return ' '.join(x['genre']) + ' ' + ' '.join(x['type'])

In [None]:
# Create a new soup feature
anime['soup'] = anime.apply(create_soup, axis=1)

In [None]:
anime.head()

Unnamed: 0,name,genre,type,soup
0,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,drama romance school supernatural m o v i e
1,Fullmetal Alchemist Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,action adventure drama fantasy magic military ...
2,Gintama,"[action, comedy, historical, parody, samurai, ...",tv,action comedy historical parody samurai sci-fi...
3,Steins;Gate,"[sci-fi, thriller]",tv,sci-fi thriller t v
4,Gintama039;,"[action, comedy, historical, parody, samurai, ...",tv,action comedy historical parody samurai sci-fi...


We make use of the CountVectorizer library. This will give us a matrix where each column represents a word in the overview vocabulary (all the words that appear in at least one document), and each row represents an anime, as before.

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime['soup'])

In [None]:
count_matrix.shape

(12294, 45)

From the above, there are 73,881 vocabularies in the metadata that you fed to it.
Next, we will use the cosine_similarity to measure the distance between the embeddings.

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

We create a function to get top 10 similar animes, based on it's features

In [None]:
#Construct a reverse map of indices and anime titles
indices = pd.Series(anime.index, index=anime['name']).drop_duplicates()

In [None]:
indices.head(3)

name
Kimi no Na wa.                     0
Fullmetal Alchemist Brotherhood    1
Gintama                            2
dtype: int64

In [None]:
# Function that takes in anime title as input and outputs most similar anime
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar anime
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar anime
    return anime['name'].iloc[anime_indices]

### Testing with Death Note

In [None]:
recommendations = get_recommendations('Death Note')
print(recommendations)

778                                     Death Note Rewrite
981                                        Mousou Dairinin
144                          Higurashi no Naku Koro ni Kai
334                              Higurashi no Naku Koro ni
1383                         Higurashi no Naku Koro ni Rei
833                               Jigoku Shoujo Mitsuganae
2691                     Yakushiji Ryouko no Kaiki Jikenbo
6323                               Saint Luminous Jogakuin
10785    Yakushiji Ryouko no Kaiki Jikenbo Hamachou, Vo...
445                                         Mirai Nikki TV
Name: name, dtype: object


### Content Based with new Anime Dataset

In [1]:
import pandas as pd
import numpy as np

In [3]:
anime_new = pd.read_csv('/content/drive/My Drive/anime data/Anime_data.csv')
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [4]:
len(anime_new)

17002

### Following the exact same steps for the new data

Feature columns have already been split into a list containing them

In [5]:
anime_new['Synopsis'].head()

0    In the year 2071, humanity has colonized sever...
1    Another day, another bounty—such is the life o...
2    Vash the Stampede is the man with a $$60,000,0...
3    Witches are individuals with special powers li...
4    It is the dark century and the people are suff...
Name: Synopsis, dtype: object

Applying vectorization to the plot summary to get other similar animes, using tf-idf vwctorizer

In [6]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
anime_new['Synopsis'] = anime_new['Synopsis'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(anime_new['Synopsis'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(17002, 43913)

With this matrix in hand, we compute similarity scores using cosine similarity

In [7]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
cosine_sim.shape

(17002, 17002)

We define a function that takes in an anime title as an input and outputs a list of the 10 most similar animes. Firstly, for this, you need a reverse mapping of anime titles and DataFrame 

In [9]:
#Construct a reverse map of indices and anime titles
indices = pd.Series(anime_new.index, index=anime_new['Title']).drop_duplicates()

In [10]:
indices.head()

Title
Cowboy Bebop                       0
Cowboy Bebop: Tengoku no Tobira    1
Trigun                             2
Witch Hunter Robin                 3
Bouken Ou Beet                     4
dtype: int64

In [11]:
# Function that takes in anime title as input and outputs most similar anime
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the anime that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all anime with that anime
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the anime based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar anime
    sim_scores = sim_scores[1:11]

    # Get the anime indices
    anime_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar anime
    return anime_new['Title'].iloc[anime_indices]

In [12]:
get_recommendations('Death Note')

2525              Death Note: Rewrite
2915                       Soul Eater
645              Shinigami no Ballad.
12227              Dia Horizon (Kabu)
15808      YAT Anshin! Uchuu Ryokou 2
16618                  Kite Liberator
4229      Ayatsuri Haramase DreamNote
466                   Yami no Matsuei
630      Bleach: Memories in the Rain
1082           Yami no Shihosha Judge
Name: Title, dtype: object

Similarity using genre, producer, studio, source 




In [13]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"['Action', 'Space', 'Drama', 'Mystery', 'Sci-Fi']","Another day, another bounty—such is the life o...",Movie,"['Sunrise', 'Bandai Visual']",['Bones'],8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...","Vash the Stampede is the man with a $$60,000,0...",TV,['Victor Entertainment'],['Madhouse'],8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [14]:
anime_new.dtypes

Anime_id        int64
Title          object
Genre          object
Synopsis       object
Type           object
Producer       object
Studio         object
Rating        float64
ScoredBy      float64
Popularity    float64
Members       float64
Episodes      float64
Source         object
Aired          object
Link           object
dtype: object

In [15]:
#converting necessary columns to strings for processing
anime_new['Genre'] = anime_new["Genre"].astype('str')
anime_new['Producer'] = anime_new["Producer"].astype('str')
anime_new['Studio'] = anime_new["Studio"].astype('str')

In [16]:
# Function to convert all lists into strings for splitting into lists
import re 
def clean_data_lists(s):
    return re.sub(r"[^\sa-zA-Z0-9\.\,\;]","",str(s))

In [17]:
# Apply clean_data function to your features.
features = ['Genre', 'Producer', 'Studio', 'Source']

for feature in features:
    anime_new[feature] = anime_new[feature].apply(clean_data_lists)

NB: we ignore the spaces between producer names and studio names so that the vectorizer doesn't count the Johnny of "Johnny Depp" and "Johnny Galecki" as the same. That is just an example

In [18]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, SciFi, Space","In the year 2071, humanity has colonized sever...",TV,Bandai Visual,Sunrise,8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Space, Drama, Mystery, SciFi","Another day, another bounty—such is the life o...",Movie,"Sunrise, Bandai Visual",Bones,8.41,111187.0,475.0,179899.0,1.0,Original,"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"Action, SciFi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",TV,Victor Entertainment,Madhouse,8.31,197451.0,158.0,372709.0,26.0,Manga,"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [19]:
#creating a function to split the particular columns into lists again
def split_columns(x):
  return x.split(',')

In [20]:
#applying the function to the same set of columns
for feature in features:
    anime_new[feature] = anime_new[feature].apply(split_columns)

In [21]:
anime_new.head(3)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"[Action, Adventure, Comedy, Drama, SciFi, ...","In the year 2071, humanity has colonized sever...",TV,[Bandai Visual],[Sunrise],8.81,363889.0,39.0,704490.0,26.0,[Original],"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,"[Action, Space, Drama, Mystery, SciFi]","Another day, another bounty—such is the life o...",Movie,"[Sunrise, Bandai Visual]",[Bones],8.41,111187.0,475.0,179899.0,1.0,[Original],"Sep 1, 2001",https://myanimelist.net/anime/5/Cowboy_Bebop__...
2,6,Trigun,"[Action, SciFi, Adventure, Comedy, Drama, ...","Vash the Stampede is the man with a $$60,000,0...",TV,[Victor Entertainment],[Madhouse],8.31,197451.0,158.0,372709.0,26.0,[Manga],"Apr 1, 1998 to Sep 30, 1998",https://myanimelist.net/anime/6/Trigun


In [22]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if source exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [23]:
#applying the clean_data function
for feature in features:
    anime_new[feature] = anime_new[feature].apply(clean_data)

In [24]:
def create_soup(x):
    return ' '.join(x['Genre']) + ' ' + ' '.join(x['Producer']) + ' ' + ' '.join(x['Studio']) + ' ' + ' '.join(x['Source'])

In [25]:
# Create a new soup feature
anime_new['soup'] = anime_new.apply(create_soup, axis=1)

In [26]:
anime_new['soup'].head()

0    action adventure comedy drama scifi space band...
1    action space drama mystery scifi sunrise banda...
2    action scifi adventure comedy drama shounen vi...
3    action magic police supernatural drama mystery...
4    adventure fantasy shounen supernatural nan toe...
Name: soup, dtype: object

In [27]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime_new['soup'])

In [28]:
count_matrix.shape

(17002, 1382)

In [29]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

Using the get_recommendations function for the new cosine similarity

In [30]:
get_recommendations('Death Note', cosine_sim2) #based on features

2525                          Death Note: Rewrite
2516                   Majin Tantei Nougami Neuro
9                                         Monster
3541                              Mouryou no Hako
2531      Gyakkyou Burai Kaiji: Ultimate Survivor
3067                   Top Secret: The Revelation
5426          Gyakkyou Burai Kaiji: Hakairoku-hen
5725                       Hunter x Hunter (2011)
7119    Hunter x Hunter Movie 2: The Last Mission
8552                                 Death Parade
Name: Title, dtype: object

In [31]:
get_recommendations('Death Note', cosine_sim) #based on storyline

2525              Death Note: Rewrite
2915                       Soul Eater
645              Shinigami no Ballad.
12227              Dia Horizon (Kabu)
15808      YAT Anshin! Uchuu Ryokou 2
16618                  Kite Liberator
4229      Ayatsuri Haramase DreamNote
466                   Yami no Matsuei
630      Bleach: Memories in the Rain
1082           Yami no Shihosha Judge
Name: Title, dtype: object

In [33]:
#writing a function to get recommendations using both storyline and features based on 50% for both
def get_recommendations_both(title, cos_sim1, cos_sim2):
  # Get the index of the movie that matches the title
  idx = indices[title]

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores1 = list(enumerate(cos_sim1[idx]))

  # Get the pairwsie similarity scores of all anime using similarity1
  sim_scores2 = list(enumerate(cos_sim2[idx]))

  #Getting the average of both similarity scores
  sim_scores_avg = [(sim_scores1[i][0],(sim_scores1[i][1] + sim_scores2[i][1])/2) for i in range(len(sim_scores1))]

  # Sort the movies based on the similarity scores
  sim_scores_avg = sorted(sim_scores_avg, key=lambda x: x[1], reverse=True)

  #Get the scores of the 10 most similar movies
  sim_scores_avg = sim_scores_avg[1:11]

  #Get the movie indices
  anime_indices = [i[0] for i in sim_scores_avg]

  #Return the top 10 most similar movies
  return anime_new['Title'].iloc[anime_indices]


In [34]:
get_recommendations_both('Death Note',cosine_sim,cosine_sim2)

2525                          Death Note: Rewrite
2516                   Majin Tantei Nougami Neuro
9                                         Monster
8552                                 Death Parade
2531      Gyakkyou Burai Kaiji: Ultimate Survivor
3541                              Mouryou no Hako
5426          Gyakkyou Burai Kaiji: Hakairoku-hen
5725                       Hunter x Hunter (2011)
3067                   Top Secret: The Revelation
7119    Hunter x Hunter Movie 2: The Last Mission
Name: Title, dtype: object