# Load the dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [4]:
anime = pd.read_csv("anime.csv")

In [12]:
anime.head()


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [13]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

# Data preprocessing 

## Episodes

Many animes have unknown number of episodes even if they have similar rating. On top of that many super popular animes such as Naruto Shippuden, Attack on Titan Season 2 were ongoing when the data was collected, thus their number of episodes was considered as "Unknown". For some of my favorite animes I've filled in the episode numbers manually. For the other anime's, I had to make some educated guesses. Changes I've made are :

Animes that are grouped under Hentai Categories generally have 1 episode in my experience. So I've filled the unknown values with 1.

Animes that are grouped are "OVA" stands for "Original Video Animation". These are generally one/two episode long animes(often the popular ones have 2/3 episodes though), but I've decided to fill the unknown numbers of episodes with 1 again.

Animes that are grouped under "Movies" are considered as '1' episode as per the dataset overview goes.

For all the other animes with unknown number of episodes, I've filled the na values with the median which is 2.


In [14]:
anime[anime['episodes']=='Unknown'].head(3)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578


In [15]:
anime.loc[(anime["genre"]=="Hentai") & (anime["episodes"]=="Unknown"),"episodes"] = "1"
anime.loc[(anime["type"]=="OVA") & (anime["episodes"]=="Unknown"),"episodes"] = "1"

anime.loc[(anime["type"] == "Movie") & (anime["episodes"] == "Unknown")] = "1"

In [16]:
known_animes = {"Naruto Shippuuden":500, "One Piece":784,"Detective Conan":854, "Dragon Ball Super":86,
                "Crayon Shin chan":942, "Yu Gi Oh Arc V":148,"Shingeki no Kyojin Season 2":25,
                "Boku no Hero Academia 2nd Season":25,"Little Witch Academia TV":25}


In [17]:
for k,v in known_animes.items():    
    anime.loc[anime["name"]==k,"episodes"] = v

In [18]:
anime["episodes"] = anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)


In [19]:
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)


### Type

In [20]:
pd.get_dummies(anime[["type"]]).head()


Unnamed: 0,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


### Rating, Members and Genre

For members feature, I Just converted the strings to float.Episode numbers, members and rating are different from categorical variables and very different in values. Rating ranges from 0-10 in the dataset while the episode number can be even 800+ episodes long when it comes to long running popular animes such as One Piece, Naruto etc. So I ended up using sklearn.preprocessing.MinMaxScaler as it scales the values from 0-1.Many animes have unknown ratings. These were filled with the median of the ratings.



In [21]:
anime["rating"] = anime["rating"].astype(float)
anime["rating"].fillna(anime["rating"].median(),inplace = True)
anime["members"] = anime["members"].astype(float)


In [22]:
# Scaling

anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime[["members"]],anime["episodes"]],axis=1)
anime["name"] = anime["name"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [23]:
anime_features.columns


Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', '1', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery',
       'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School',
       'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
       'Sports', 'Super Power', 'Supernat

In [24]:
from sklearn.preprocessing import MinMaxScaler


In [25]:
min_max_scaler = MinMaxScaler()
anime_features = min_max_scaler.fit_transform(anime_features)

In [30]:
np.round(anime_features,2)

array([[ 0.  ,  0.  ,  0.  , ...,  0.93,  0.2 ,  0.  ],
       [ 1.  ,  0.  ,  0.  , ...,  0.92,  0.78,  0.03],
       [ 0.  ,  0.  ,  1.  , ...,  0.92,  0.11,  0.03],
       ..., 
       [ 0.  ,  0.  ,  0.  , ...,  0.43,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.44,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , ...,  0.5 ,  0.  ,  0.  ]])

# Fit Nearest Neighbor To Data

In [17]:
from sklearn.neighbors import NearestNeighbors


In [18]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features)


In [19]:
distances, indices = nbrs.kneighbors(anime_features)


# Query examples and helper functions

Many anime names have not been documented properly and in many cases the names are in Japanese instead of English and the spelling is often different. For that reason I've also created another helper function get_id_from_partial_name to find out ids of the animes from part of names.

In [20]:
def get_index_from_name(name):
    return anime[anime["name"]==name].index.tolist()[0]
    

In [24]:
all_anime_names = list(anime.name.values)


In [25]:
def get_id_from_partial_name(partial):
    for name in all_anime_names:
        if partial in name:
            print(name,all_anime_names.index(name))

In [26]:
""" print_similar_query can search for similar animes both by id and by name. """

def print_similar_animes(query=None,id=None):
    if id:
        for id in indices[id][1:]:
            print(anime.ix[id]["name"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.ix[id]["name"])

# Query Examples 

In [27]:
print_similar_animes(query="Naruto")


Naruto Shippuuden
Katekyo Hitman Reborn 
Bleach
Dragon Ball Z
Boku no Hero Academia


In [28]:
print_similar_animes("Noragami")


Noragami Aragoto
JoJo no Kimyou na Bouken TV 
JoJo no Kimyou na Bouken Stardust Crusaders
JoJo no Kimyou na Bouken Stardust Crusaders 2nd Season
Yumekui Merry


In [29]:
print_similar_animes("Mushishi")


Mushishi Zoku Shou
Mushishi Zoku Shou 2nd Season
Mushishi Special Hihamukage
Mushishi Zoku Shou Odoro no Michi
Mushishi Zoku Shou Suzu no Shizuku


In [30]:
print_similar_animes("Gintama")


Gintama 039 
Gintama 
Gintama 039 Enchousen
Gintama 2017 
Gintama Movie Kanketsu hen Yorozuya yo Eien Nare


In [31]:
print_similar_animes("Fairy Tail")


Fairy Tail 2014 
Magi The Labyrinth of Magic
Magi The Kingdom of Magic
Densetsu no Yuusha no Densetsu
Magi Sinbad no Bouken TV 


In [32]:
get_id_from_partial_name("Naruto")


Boruto Naruto the Movie 486
Naruto Shippuuden 615
The Last Naruto the Movie 719
Naruto Shippuuden Movie 6 Road to Ninja 784
Naruto 841
Boruto Naruto the Movie Naruto ga Hokage ni Natta Hi 1103
Naruto Shippuuden Movie 5 Blood Prison 1237
Naruto x UT 1343
Naruto Shippuuden Movie 4 The Lost Tower 1472
Naruto Shippuuden Movie 3 Hi no Ishi wo Tsugu Mono 1573
Naruto Shippuuden Movie 1 1827
Naruto Shippuuden Movie 2 Kizuna 1828
Naruto Shippuuden Shippuu quot Konoha Gakuen quot Den 2374
Naruto Honoo no Chuunin Shiken Naruto vs Konohamaru  2416
Naruto SD Rock Lee no Seishun Full Power Ninden 2457
Naruto Shippuuden Sunny Side Battle 2458
Naruto Movie 1 Dai Katsugeki Yuki Hime Shinobu Houjou Dattebayo  2756
Naruto Soyokazeden Movie Naruto to Mashin to Mitsu no Onegai Dattebayo  2997
Naruto Movie 2 Dai Gekitotsu Maboroshi no Chiteiiseki Dattebayo  3449
Naruto Dai Katsugeki Yuki Hime Shinobu Houjou Dattebayo Special Konoha Annual Sports Festival 3529
Naruto Movie 3 Dai Koufun Mikazuki Jima no Anima

In [33]:
print_similar_animes(id=719)


Naruto Shippuuden Movie 6 Road to Ninja
Boruto Naruto the Movie
Naruto Shippuuden Movie 4 The Lost Tower
Naruto Shippuuden Movie 3 Hi no Ishi wo Tsugu Mono
Naruto Honoo no Chuunin Shiken Naruto vs Konohamaru 


In [34]:
print_similar_animes("Kimi no Na wa ")


Kokoro ga Sakebitagatterunda 
Harmonie
Air Movie
Hotarubi no Mori e
Momo e no Tegami
