In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import NearestNeighbors
import time
from sqlalchemy import create_engine

%matplotlib inline

In [4]:
# read data from database
engine = create_engine('<Database sqlalchemy>')
sql = "select * from anime"
df_anime = pd.read_sql_query(sql, engine)

In [5]:
df_anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


## Data preprocess
- 1. Deal with episodes

In [6]:
df_anime.loc[df_anime["episodes"]=="Unknown"].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,Unknown,7.73,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.72,5400


In [7]:
print('There are %d records that didnt have info about episodes'%(len(df_anime.loc[df_anime["episodes"]=="Unknown"])))

There are 340 records that didnt have info about episodes


- In this case, we will deal with that in the following ways:
    - Movies should have only one episode
    - OVA should have only one episodes
    - Anime of Hentai categories should also contain one episode
    - FOr other anime with 'Unkown' episodes, we filled them with median.

In [8]:
df_anime.loc[(df_anime["genre"]=="Hentai") & (df_anime["episodes"]=="Unknown"),"episodes"] = "1"
df_anime.loc[(df_anime["type"]=="OVA") & (df_anime["episodes"]=="Unknown"),"episodes"] = "1"

df_anime.loc[(df_anime["type"] == "Movie") & (df_anime["episodes"] == "Unknown")] = "1"

In [9]:
len(df_anime.loc[df_anime["episodes"]=="Unknown"])

286

In [10]:
known_animes = {"Naruto Shippuuden":500, "One Piece":784,"Detective Conan":854, "Dragon Ball Super":86,
                "Crayon Shin chan":942, "Yu Gi Oh Arc V":148,"Shingeki no Kyojin Season 2":25,
                "Boku no Hero Academia 2nd Season":25,"Little Witch Academia TV":25}

In [11]:
for k,v in known_animes.items():    
    df_anime.loc[df_anime["name"]==k,"episodes"] = v
    
df_anime["episodes"] = df_anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)
df_anime["episodes"] = df_anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)
df_anime["episodes"].fillna(df_anime["episodes"].median(),inplace = True)

In [12]:
assert len(df_anime.loc[df_anime["episodes"]=="Unknown"]) == 0

- 2.Rating:
    - There are also unknown ratings in some anime.
    - We also want to fill them with median as well

In [13]:
df_anime[df_anime['rating'].isnull()].head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
8968,34502,Inazma Delivery,"Action, Comedy, Sci-Fi",TV,10,,32
9657,34309,Nananin no Ayakashi: Chimi Chimi Mouryou!! Gen...,"Comedy, Supernatural",TV,2,,129
10896,34096,Gintama (2017),"Action, Comedy, Historical, Parody, Samurai, S...",TV,2,,13383
10897,34134,One Punch Man 2,"Action, Comedy, Parody, Sci-Fi, Seinen, Super ...",TV,2,,90706
10898,30484,Steins;Gate 0,"Sci-Fi, Thriller",,2,,60999


In [14]:
print('There are %d records containing unkown ratings' %(len(df_anime[df_anime['rating'].isnull()])))

There are 227 records containing unkown ratings


In [15]:
df_anime["rating"] = df_anime["rating"].astype(float)

In [16]:
df_anime["rating"].fillna(df_anime["rating"].median(),inplace = True)

In [17]:
assert len(df_anime[df_anime['rating'].isnull()]) == 0

- 3.Categorical
    - Based on the observation, we can deal with type as categorical matrixs

In [18]:
pd.get_dummies(df_anime[["type"]]).head()

Unnamed: 0,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


- 4.Convert string into float

In [19]:
df_anime["members"] = df_anime["members"].astype(float)

## Feature Engineering

In [20]:
anime_features = pd.concat([df_anime["genre"].str.get_dummies(sep=","), \
                            pd.get_dummies(df_anime[["type"]]),df_anime[["rating"]],df_anime[["members"]], \
                            df_anime["episodes"]], \
                            axis=1)

In [21]:
assert len(anime_features) == len(df_anime)

In [22]:
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [23]:
df_anime["name"] = df_anime["name"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))

In [24]:
### check features
print('There are %d features in this project' %(len(anime_features.columns)))
print(anime_features.columns)

There are 93 features in this project
Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', '1', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery',
       'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School',
       'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
   

- Normalization

In [25]:
max_abs_scaler = MaxAbsScaler()
anime_features = max_abs_scaler.fit_transform(anime_features)

In [26]:
print(anime_features)
anime_features.shape

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 9.37000000e-01
  1.97876158e-01 5.50055006e-04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 ... 9.26000000e-01
  7.82771174e-01 3.52035204e-02]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 ... 9.25000000e-01
  1.12693643e-01 2.80528053e-02]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 4.88000000e-01
  2.15994011e-04 2.20022002e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 4.98000000e-01
  1.72597954e-04 5.50055006e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 5.46000000e-01
  1.40050911e-04 5.50055006e-04]]


(12294, 93)

## Deal with similarity with KNN

In [27]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(anime_features)

In [28]:
distances, indices = nbrs.kneighbors(anime_features)

In [29]:
ind_df = pd.DataFrame(indices[:,1:], columns=['id'+str(i) for i in range(1,len(indices[0]))])

In [30]:
ind_df['id0'] = df_anime['anime_id'][indices[:,0]].values

## Save results to database

In [31]:
ind_df.to_sql('content-based', con=engine, index=False)

## Query Example

In [32]:
def get_index_from_name(name):
    return df_anime[df_anime["name"]==name].index.tolist()[0]

In [33]:
### eg:
inx_conan = get_index_from_name("Detective Conan")
print(inx_conan)

252


In [34]:
distances[inx_conan]

array([0.        , 1.11184871, 1.11347792, 1.11494572, 1.11576665,
       1.49184305])

In [35]:
indices[inx_conan]

array([ 252, 2482, 3066, 3653, 2951, 1713], dtype=int64)

In [36]:
all_anime_names = list(df_anime.name.values)

In [37]:
def get_id_from_partial_name(partial):
    ID = []
    for name in all_anime_names:
        if partial in name:
            print(name,all_anime_names.index(name))
            ID.append((name,all_anime_names.index(name)))
    return ID

In [38]:
""" print_similar_query can search for similar animes both by id and by name. """

def print_similar_animes(query=None,id=None):
    similar_result = []
    if id:
        for id in indices[id][1:]:
            print(df_anime.iloc[id]["name"])
            similar_result.append(df_anime.iloc[id]["name"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(df_anime.iloc[id]["name"])
            similar_result.append(df_anime.iloc[id]["name"])
    return similar_result

In [39]:
print_similar_animes(query="Detective Conan")

Kaitou Joker 2nd Season
Kaitou Joker 3rd Season
Kaitou Joker
Kaitou Joker 4th Season
Meitantei Holmes


['Kaitou Joker 2nd Season',
 'Kaitou Joker 3rd Season',
 'Kaitou Joker',
 'Kaitou Joker 4th Season',
 'Meitantei Holmes']

In [40]:
print_similar_animes("Noragami")

Noragami Aragoto
JoJo no Kimyou na Bouken TV 
JoJo no Kimyou na Bouken Stardust Crusaders
JoJo no Kimyou na Bouken Stardust Crusaders 2nd Season
Yumekui Merry


['Noragami Aragoto',
 'JoJo no Kimyou na Bouken TV ',
 'JoJo no Kimyou na Bouken Stardust Crusaders',
 'JoJo no Kimyou na Bouken Stardust Crusaders 2nd Season',
 'Yumekui Merry']

In [41]:
get_id_from_partial_name("Detective Conan")

Detective Conan Movie 06 The Phantom of Baker Street 137
Detective Conan Movie 13 The Raven Chaser 165
Detective Conan Movie 20 The Darkest Nightmare 205
Detective Conan Movie 14 The Lost Ship in the Sky 227
Detective Conan 252
Detective Conan Movie 05 Countdown to Heaven 253
Detective Conan Movie 08 Magician of the Silver Sky 274
Detective Conan Movie 10 Requiem of the Detectives 293
Detective Conan Movie 18 The Sniper from Another Dimension 332
Detective Conan Movie 03 The Last Wizard of the Century 344
Detective Conan Movie 04 Captured in Her Eyes 345
Detective Conan Movie 15 Quarter of Silence 378
Detective Conan OVA 09 The Stranger in 10 Years  454
Lupin III vs Detective Conan The Movie 472
Detective Conan Movie 02 The Fourteenth Target 524
Detective Conan Movie 01 The Timed Skyscraper 544
Detective Conan Movie 07 Crossroad in the Ancient Capital 545
Detective Conan Movie 08 Time Travel of the Silver Sky 586
Detective Conan OVA 10 Kid in Trap Island 587
Detective Conan Movie 09 St

[('Detective Conan Movie 06 The Phantom of Baker Street', 137),
 ('Detective Conan Movie 13 The Raven Chaser', 165),
 ('Detective Conan Movie 20 The Darkest Nightmare', 205),
 ('Detective Conan Movie 14 The Lost Ship in the Sky', 227),
 ('Detective Conan', 252),
 ('Detective Conan Movie 05 Countdown to Heaven', 253),
 ('Detective Conan Movie 08 Magician of the Silver Sky', 274),
 ('Detective Conan Movie 10 Requiem of the Detectives', 293),
 ('Detective Conan Movie 18 The Sniper from Another Dimension', 332),
 ('Detective Conan Movie 03 The Last Wizard of the Century', 344),
 ('Detective Conan Movie 04 Captured in Her Eyes', 345),
 ('Detective Conan Movie 15 Quarter of Silence', 378),
 ('Detective Conan OVA 09 The Stranger in 10 Years ', 454),
 ('Lupin III vs Detective Conan The Movie', 472),
 ('Detective Conan Movie 02 The Fourteenth Target', 524),
 ('Detective Conan Movie 01 The Timed Skyscraper', 544),
 ('Detective Conan Movie 07 Crossroad in the Ancient Capital', 545),
 ('Detective 

In [42]:
print_similar_animes("Kimi no Na wa ")

Kokoro ga Sakebitagatterunda 
Harmonie
Air Movie
Hotarubi no Mori e
Clannad Movie


['Kokoro ga Sakebitagatterunda ',
 'Harmonie',
 'Air Movie',
 'Hotarubi no Mori e',
 'Clannad Movie']