In [1]:
import pandas as pd

# Load the anime dataset from the CSV file next to the notebook.
anime = pd.read_csv("anime.csv", encoding="utf8")

# Only the last expression of a cell is rendered, so this cell previews the
# genre column; shape and columns are displayed by their own cells below.
anime.genre

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12294, dtype: object

In [2]:
anime.shape # (number of rows, number of columns) of the loaded dataset

(12294, 7)

In [3]:
# Column labels available in the dataset.
anime.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

In [4]:
anime.genre # preview of the genre column: one comma-separated list of genre labels per title

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12294, dtype: object

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Term frequency-inverse document frequency (TF-IDF) is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.

In [6]:
# Create the TF-IDF vectorizer. Passing stop_words="english" makes it drop
# the words on scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words = "english")    # built-in English stop-word list

In [7]:
# Replace missing values in the genre column with the placeholder "general"
# so that every row supplies a string to the vectorizer.
# The remaining null count is displayed by the next cell.
anime["genre"] = anime["genre"].fillna("general")

In [8]:
# Count the missing genres that remain after the fill; the expected result is 0.
anime["genre"].isnull().sum()

0

In [9]:
# Learn the vocabulary and IDF weights from the genre strings and return the
# document-term matrix: one row per anime, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(anime.genre)   # fit and transform in a single pass
tfidf_matrix.shape # (n_anime, n_terms); the recorded run gave (12294, 47)

(12294, 47)

In [10]:
# From the TF-IDF matrix above we need a similarity score between titles.
# Several metrics could be used for this, such as the Euclidean distance,
# the Pearson correlation and the cosine similarity.

# We want a numeric quantity that represents the similarity between two anime.
# Cosine similarity is independent of vector magnitude and easy to calculate:
# cosine(x, y) = (x · yᵀ) / (||x|| · ||y||)

In [11]:
# calculating the dot product using sklearn's linear_kernel()
from sklearn.metrics.pairwise import linear_kernel

In [12]:
# Computing the cosine similarity on the TF-IDF matrix.
# linear_kernel returns plain dot products; these equal cosine similarities
# as long as the TF-IDF rows are L2-normalised, which is TfidfVectorizer's
# default (norm="l2") - no `norm` argument is passed where `tfidf` is created.
# NOTE: the result is a dense n x n array; for the 12294 rows recorded in this
# notebook that is roughly 1.2 GB, assuming float64 values.
cosine_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
# Map each anime name to its row index in `anime`.
anime_index = pd.Series(anime.index, index=anime["name"])

# De-duplicate on the index labels (the names), not on the values: the values
# are row indices and therefore always unique, so Series.drop_duplicates()
# would be a no-op here. Keeping the first row of any repeated name guarantees
# that anime_index[name] is a single integer rather than a Series.
anime_index = anime_index[~anime_index.index.duplicated(keep="first")]

In [14]:
# Spot-check the name -> row-index mapping for a single title.
anime_id = anime_index["Assassins (1995)"]
anime_id

22

In [21]:
def get_recommendations(Name, topN):
    """Print the titles whose genre profile is most similar to ``Name``.

    The printed table lists the queried title in its first row, followed by
    the ``topN`` other titles with the highest similarity scores, best match
    first. Titles with equal scores keep their dataset order.

    Relies on the notebook-level objects ``anime`` (the dataset),
    ``anime_index`` (name -> row index) and ``cosine_sim_matrix``
    (pairwise similarity scores).

    Parameters
    ----------
    Name : str
        Title to look up; must be a label of ``anime_index``.
    topN : int
        Number of recommendations to list in addition to the queried title.

    Raises
    ------
    KeyError
        If ``Name`` is not a label of ``anime_index``.
    ValueError
        If ``topN`` is negative.
    """
    if topN < 0:
        raise ValueError("topN must be non-negative")

    # Row index of the queried title. A name that occurs more than once in
    # the index makes the lookup return a Series; take its first entry so
    # that exactly one row of the similarity matrix is selected below.
    anime_id = anime_index[Name]
    if isinstance(anime_id, pd.Series):
        anime_id = anime_id.iloc[0]

    # Similarity of every title to the queried one.
    similarity_row = cosine_sim_matrix[anime_id]

    # (row index, score) pairs for all *other* titles. The queried title is
    # excluded explicitly: relying on it to sort first fails when another
    # title ties with it on score and sits earlier in the dataset.
    cosine_scores = [
        (idx, score)
        for idx, score in enumerate(similarity_row)
        if idx != anime_id
    ]

    # list.sort is stable, so equally scored titles stay in dataset order.
    cosine_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Queried title first, then the topN best matches.
    cosine_scores_N = [(anime_id, similarity_row[anime_id])] + cosine_scores[:topN]

    anime_idx = [idx for idx, _ in cosine_scores_N]
    anime_scores = [score for _, score in cosine_scores_N]

    # Table of names and scores; reset_index() exposes the original row
    # index as an "index" column.
    anime_similar_show = (
        anime.loc[anime_idx, ["name"]]
        .assign(Score=anime_scores)
        .reset_index()
    )
    print(anime_similar_show)
    

In [22]:
# Row index of the title that is used as the query in the next cell.
anime_index["Bad Boys (1995)"]

118

In [23]:
# Show recommendations for the chosen title, asking for 10 matches.
get_recommendations("Bad Boys (1995)", topN = 10)

    index                                               name     Score
0     118                                    Bad Boys (1995)  1.000000
1   10919                              No Game No Life Movie  1.000000
2   10436  Super Real Mahjong: Mahjong Battle Scramble - ...  0.859206
3    4290                       Raising Victor Vargas (2002)  0.827579
4    5882                     xXx: State of the Union (2005)  0.800258
5    5968           Pusher II: With Blood on My Hands (2004)  0.800258
6    6116                                  Revolution (1985)  0.800258
7    6677             World on a Wire (Welt am Draht) (1973)  0.800258
8   10435  Super Real Mahjong: Kasumi Miki Shouko no Haji...  0.800258
9    4628                            Italian Job, The (1969)  0.787476
10   6812                    Midnight Meat Train, The (2008)  0.739464
