In [99]:
import os 
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [101]:
anime_df = pd.read_csv("/kaggle/input/anime-recommendation-database-2020/anime_with_synopsis.csv")
anime_df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [102]:
#number of missing values in the dataset
anime_df.isnull().sum()

MAL_ID       0
Name         0
Score        0
Genres       0
sypnopsis    8
dtype: int64

In [103]:
anime_df.dropna(inplace=True)

In [104]:
anime_df["Score"] = anime_df["Score"].map(lambda x:np.nan if x == "Unknown" else x)

In [105]:
anime_df["Score"].fillna(anime_df["Score"].median(),inplace = True)

In [106]:
anime_df["Score"] = anime_df["Score"].astype(float)

In [107]:
anime_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   MAL_ID     16206 non-null  int64  
 1   Name       16206 non-null  object 
 2   Score      16206 non-null  float64
 3   Genres     16206 non-null  object 
 4   sypnopsis  16206 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 759.7+ KB


In [108]:
anime_df['Genres'] = anime_df['Genres'].apply(lambda x: x.split())
anime_df['sypnopsis'] = anime_df['sypnopsis'].apply(lambda x: x.split())

In [109]:
anime_df['Genres'] = anime_df['Genres'].apply(lambda x: [i.replace(" ","") for i in x])
anime_df['sypnopsis'] = anime_df['sypnopsis'].apply(lambda x: [i.replace(" ","") for i in x])

In [110]:
anime_df.head(10)

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"[Action,, Adventure,, Comedy,, Drama,, Sci-Fi,...","[In, the, year, 2071,, humanity, has, colonize..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"[Action,, Drama,, Mystery,, Sci-Fi,, Space]","[other, day,, another, bounty—such, is, the, l..."
2,6,Trigun,8.24,"[Action,, Sci-Fi,, Adventure,, Comedy,, Drama,...","[Vash, the, Stampede, is, the, man, with, a, $..."
3,7,Witch Hunter Robin,7.27,"[Action,, Mystery,, Police,, Supernatural,, Dr...","[ches, are, individuals, with, special, powers..."
4,8,Bouken Ou Beet,6.98,"[Adventure,, Fantasy,, Shounen,, Supernatural]","[It, is, the, dark, century, and, the, people,..."
5,15,Eyeshield 21,7.95,"[Action,, Sports,, Comedy,, Shounen]","[Sena, is, like, any, other, shy, kid, startin..."
6,16,Hachimitsu to Clover,8.06,"[Comedy,, Drama,, Josei,, Romance,, Slice, of,...","[Yuuta, Takemoto,, a, sophomore, at, an, arts,..."
7,17,Hungry Heart: Wild Striker,7.59,"[Slice, of, Life,, Comedy,, Sports,, Shounen]","[Kyosuke, Kano, has, lived, under, the, shadow..."
8,18,Initial D Fourth Stage,8.15,"[Action,, Cars,, Sports,, Drama,, Seinen]","[Takumi, Fujiwara, finally, joins, Ryousuke, a..."
9,19,Monster,8.76,"[Drama,, Horror,, Mystery,, Police,, Psycholog...","[Dr., Kenzou, Tenma,, an, elite, neurosurgeon,..."


In [111]:
anime_df['features'] = anime_df['Genres'] + anime_df['sypnopsis'] 
     
new_anime_df = anime_df[['Name', 'features']]

In [None]:
new_anime_df['features'] = new_anime_df['features'].apply(lambda x: " ".join(x))

In [113]:
#Stemming
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [None]:
new_anime_df['features'] = new_anime_df['features'].apply(stem)
new_anime_df['features'] = new_anime_df['features'].apply(lambda x: x.lower())

In [115]:
#Countvectorizer

cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_anime_df['features']).toarray()

In [116]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 3, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [117]:
similarity = cosine_similarity(vectors)

In [None]:
#stores every similarity index as a tuple
sorted(list(enumerate(similarity[0])),reverse = True, key = lambda x: x[1])

In [217]:
def fetch_similar_anime(anime):
    similar_anime = new_anime_df[new_anime_df['Name'].str.contains(anime, case = False)]
    
    if similar_anime.empty:
        print(f"No similar anime found for '{anime}' in the database.")
        return
    
    similar_anime_names = similar_anime['Name'].tolist()
    for i in similar_anime_names:
        print(i)

In [218]:
fetch_similar_anime("Gintama")

Gintama
Gintama: Nanigoto mo Saiyo ga Kanjin nano de Tasho Senobisuru Kurai ga Choudoyoi
Gintama: Shiroyasha Koutan
Gintama Movie 1: Shinyaku Benizakura-hen
Gintama: Shinyaku Benizakura-hen
Gintama'
Gintama: Dai Hanseikai
Gintama Movie 2: Kanketsu-hen - Yorozuya yo Eien Nare
Gintama': Enchousen
Gintama x Mameshiba
Gintama: Yorinuki Gintama-san on Theater 2D
Gintama': Futon ni Haitte kara Buki Nokoshi ni Kizuite Neru ni Nerenai Toki mo Aru
Gintama°
Gintama°: Umai-mono wa Atomawashi ni Suru to Yokodorisareru kara Yappari Saki ni Kue
Gintama°: Aizome Kaori-hen
Gintama.
Gintama.: Porori-hen
Gintama.: Shirogane no Tamashii-hen
Gintama.: Shirogane no Tamashii-hen - Kouhan-sen
Gintama: The Final
Gintama: Monster Strike-hen
Gintama: The Semi-Final


In [194]:
def anime_recommender(anime):
    if anime not in new_anime_df['Name'].values:
        print(f"The anime '{anime}' is not found in the database.")
        return
    
    movie_index = new_anime_df[new_anime_df['Name'] == anime].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:16]
    
    for i in movies_list:
        recommended_anime_index = i[0]
        similarity_percentage = i[1] * 100
        recommended_anime_name = new_anime_df.iloc[recommended_anime_index].Name
        print(f"{recommended_anime_name : <63}{format(similarity_percentage, '.2f')}% match")

In [214]:
anime_recommender("Dragon Ball")

Dragon Ball Kai                                                41.00% match
Dragon Ball GT                                                 39.97% match
Dragon Ball Movie 3: Makafushigi Daibouken                     36.90% match
Dragon Ball: Super Saiya-jin Zetsumetsu Keikaku                34.97% match
Dragon Ball Z: Atsumare! Gokuu World                           34.22% match
Dragon Ball Z                                                  32.72% match
Pokemon Movie 06: Nanayo no Negaiboshi Jirachi                 32.20% match
Ikkitousen: Dragon Destiny Specials                            31.23% match
Souryuuden                                                     30.12% match
Dragon Ball Super Movie: Broly                                 29.77% match
Shin Hurricane Polymar                                         29.63% match
Hatenkou Yuugi                                                 28.92% match
Dragon Ball Super                                              28.48% match
Dragon Ball 