### Content Based Recommender

Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the anime catalogue and the per-user ratings table from CSV.
# NOTE(review): both paths point into a mounted Google Drive folder
# (presumably a Colab session) — adjust them when running elsewhere.
anime = pd.read_csv('/content/drive/My Drive/anime data/anime.csv')
ratings = pd.read_csv('/content/drive/My Drive/anime data/rating.csv')

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


Cleaning up the name column by removing every character that is not a letter, a digit, whitespace, or one of `.`, `,`, `;`


In [None]:
anime.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [None]:
# Convert the name column to the string datatype so that every title is a
# plain str before the character clean-up is applied.
anime['name'] = anime['name'].astype('str')

In [None]:
#creating the function to remove the characters
import re
def clean_up(s):
    """Return *s* as text with unsupported characters removed.

    HTML character references (for example ``&#039;`` or ``&amp;``) are
    decoded first. Without that step only the ``&`` and ``#`` were stripped
    and the rest of the reference stayed behind as junk, e.g.
    ``Gintama&#039;`` became ``Gintama039;``.

    Args:
        s: Value to clean; it is converted with ``str()`` first.

    Returns:
        The text restricted to whitespace, ASCII letters, digits and the
        punctuation marks ``.``, ``,`` and ``;``.
    """
    # Imported locally so the function does not depend on a new
    # top-of-file import.
    import html

    decoded = html.unescape(str(s))
    return re.sub(r"[^\sa-zA-Z0-9.,;]", "", decoded)


In [None]:
# Run clean_up over every title and store the cleaned text back in place.
anime['name'] = anime['name'].apply(clean_up)

In [None]:
# Rename the anime table's `rating` column to `average_rating`.
# NOTE(review): presumably done to tell it apart from the per-user `rating`
# column of the ratings table — confirm.
anime.rename(columns={'rating':'average_rating'}, inplace=True)

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,average_rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [None]:
len(ratings)

7813737

In [None]:
# Drop every row whose rating is -1 (i.e. the user did not rate the anime),
# then show how many rows remain.
ratings = ratings[ratings['rating'] != -1]
len(ratings)

6337241

In [None]:
# Rename the anime table's `rating` column to `average_rating`.
# NOTE(review): the same rename already appears earlier in this notebook.
# Once that cell has run there is no `rating` column left, and
# DataFrame.rename ignores missing labels by default, so this call changes
# nothing — it looks safe to remove.
anime.rename(columns={'rating':'average_rating'}, inplace=True)

### Logic for Content Based

**Future Findings**
1. Getting the anime overview description, i.e. a summary of the plot
2. Getting 3 major casts of the anime
3. Getting directors of the anime

In [None]:
# creating a function to split the values in the genre column
def split_genre(x):
    """Break a comma-and-space separated genre string into a list.

    Args:
        x: Genre text such as ``"Drama, Romance"``.

    Returns:
        A list with one entry per genre, in their original order.
    """
    separator = ', '
    genres = x.split(separator)
    return genres


In [None]:
# Convert the genre column to the string datatype so it can be split.
# NOTE(review): astype('str') turns a missing value (NaN) into the literal
# text 'nan', which would then be treated as a genre — confirm whether the
# column contains missing values.
anime['genre'] = anime['genre'].astype('str')

In [None]:
# Convert the type column to the string datatype.
# NOTE(review): astype('str') turns a missing value (NaN) into the literal
# text 'nan' — confirm whether the column contains missing values.
anime['type'] = anime['type'].astype('str')

In [None]:
# Replace each genre string with the list of genres returned by split_genre.
anime['genre'] = anime['genre'].apply(split_genre)

In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,average_rating,members
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665
2,28977,Gintama,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572
4,9969,Gintama039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266


In [None]:
# Keep only the columns the content-based model uses.
# .copy() makes the result an independent DataFrame, so the column
# assignments performed on `anime` later in the notebook modify this frame
# directly instead of triggering pandas' SettingWithCopyWarning.
features = ['name','genre','type']
anime = anime[features].copy()

In [None]:
anime.head(3)

Unnamed: 0,name,genre,type
0,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie
1,Fullmetal Alchemist Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV
2,Gintama,"[Action, Comedy, Historical, Parody, Samurai, ...",TV


The next step is to convert the genre and type values to lowercase and strip all the spaces inside them, so that each genre or type becomes a single token.

In [None]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    """Lower-case *x* and remove every space character from it.

    Args:
        x: A string, a list of strings, or any other value.

    Returns:
        A new list of normalised strings when *x* is a list, the normalised
        string when *x* is a string, and ``''`` for any other value.
    """
    def _normalise(text):
        # Drop spaces first, then fold to lower case.
        return str.lower(text.replace(" ", ""))

    if isinstance(x, list):
        return list(map(_normalise, x))
    if isinstance(x, str):
        return _normalise(x)
    # Anything that is neither a list nor a string carries no usable text.
    return ''

In [None]:
# Normalise the genre and type columns with clean_data; the name column is
# left untouched. Note that this rebinds `features`, replacing the
# three-column list used earlier in the notebook.
features = ['genre', 'type']

for feature in features:
    anime[feature] = anime[feature].apply(clean_data)

In [None]:
anime.head(3)

Unnamed: 0,name,genre,type
0,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie
1,Fullmetal Alchemist Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv
2,Gintama,"[action, comedy, historical, parody, samurai, ...",tv


Finally, we create a metadata "soup" that we feed to the vectorizer. The soup for each anime is a single string of its genre and type tokens separated by whitespace.

In [None]:
def create_soup(x):
    """Build the whitespace-separated metadata string for one anime row.

    The type value is a plain string, so it is appended as one token.
    Joining it with ``' '.join`` split it into single characters
    (``'movie'`` became ``'m o v i e'``), which meant the type never
    appeared in the soup as a word.

    Args:
        x: Row-like mapping with a ``'genre'`` entry (list of strings) and
            a ``'type'`` entry (a string, or a list of strings).

    Returns:
        All genre tokens followed by the type token(s), separated by single
        spaces.
    """
    def _as_tokens(value):
        # A bare string is one token; an empty string contributes nothing.
        if isinstance(value, str):
            return [value] if value else []
        return list(value)

    return ' '.join(_as_tokens(x['genre']) + _as_tokens(x['type']))

In [None]:
# Create a new soup feature: call create_soup on every row (axis=1) and
# store the resulting string in a new 'soup' column.
anime['soup'] = anime.apply(create_soup, axis=1)

In [None]:
anime.head()

Unnamed: 0,name,genre,type,soup
0,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,drama romance school supernatural m o v i e
1,Fullmetal Alchemist Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,action adventure drama fantasy magic military ...
2,Gintama,"[action, comedy, historical, parody, samurai, ...",tv,action comedy historical parody samurai sci-fi...
3,Steins;Gate,"[sci-fi, thriller]",tv,sci-fi thriller t v
4,Gintama039;,"[action, comedy, historical, parody, samurai, ...",tv,action comedy historical parody samurai sci-fi...


We make use of scikit-learn's CountVectorizer. This gives us a matrix in which each column represents a word in the soup vocabulary (all the words that appear in at least one soup) and each row represents an anime.

In [None]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words matrix from the soup strings: one row per anime, one
# column per distinct token, each cell holding that token's count.
# stop_words='english' drops common English words from the vocabulary.
# NOTE(review): with the default token_pattern only runs of two or more word
# characters count as tokens, so hyphenated genres are split at the hyphen
# and single-character fragments are ignored — confirm this is intended.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(anime['soup'])

In [None]:
count_matrix.shape

(12294, 45)

From the above, the count matrix has 12,294 rows (one per anime) and a vocabulary of 45 distinct tokens.
Next, we will use cosine_similarity to measure how similar the count vectors of any two anime are.

In [None]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of count_matrix;
# entry [i, j] compares anime i with anime j.
# NOTE(review): the result is a square matrix with one row and one column
# per anime, so its size grows with the square of the catalogue — confirm
# it fits in memory for the full dataset.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

We create a function to get the top 10 most similar anime, based on its features

In [None]:
# Construct a reverse map from anime title to row label.
indices = pd.Series(anime.index, index=anime['name'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. Filter on the index
# instead and keep the first row of each title, so that a title lookup
# always yields a single label rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
indices.head(3)

name
Kimi no Na wa.                     0
Fullmetal Alchemist Brotherhood    1
Gintama                            2
dtype: int64

In [None]:
# Function that takes in anime title as input and outputs most similar anime
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the names of the 10 anime most similar to *title*.

    Args:
        title: Anime name exactly as it appears in the index of ``indices``.
        cosine_sim: Square similarity matrix. Defaults to the module-level
            ``cosine_sim`` as it exists when this function is defined.
            NOTE(review): row ``i`` is assumed to describe row ``i`` of
            ``anime`` — confirm if ``anime`` is ever re-indexed.

    Returns:
        A pandas Series of up to 10 anime names, most similar first. The
        queried anime itself is never included.

    Raises:
        KeyError: If *title* is not present in ``indices``.
    """
    # Get the index of the anime that matches the title
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence so a single row is selected.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of every *other* anime. The queried
    # anime is excluded by position: dropping "the first sorted entry" is
    # wrong when another anime ties with it at the top score.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]

    # Sort by similarity, highest first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the row positions of the 10 most similar anime
    anime_indices = [i for i, _ in sim_scores[:10]]

    # Return the top 10 most similar anime
    return anime['name'].iloc[anime_indices]

### Testing with Death Note

In [None]:
# Look up the closest matches for 'Death Note' and display them.
recommendations = get_recommendations('Death Note')
print(recommendations)

778                                     Death Note Rewrite
981                                        Mousou Dairinin
144                          Higurashi no Naku Koro ni Kai
334                              Higurashi no Naku Koro ni
1383                         Higurashi no Naku Koro ni Rei
833                               Jigoku Shoujo Mitsuganae
2691                     Yakushiji Ryouko no Kaiki Jikenbo
6323                               Saint Luminous Jogakuin
10785    Yakushiji Ryouko no Kaiki Jikenbo Hamachou, Vo...
445                                         Mirai Nikki TV
Name: name, dtype: object
