In [1]:
import numpy as np
import pandas as pd

In [2]:
# Dataset source: https://www.kaggle.com/CooperUnion/anime-recommendations-database
# Both CSV files are read from the current working directory, anime first, then ratings.
df_anime, df_rating = map(pd.read_csv, ('anime.csv', 'rating.csv'))

In [3]:
# (rows, columns) of each frame, one per line.
print(df_anime.shape, df_rating.shape, sep= '\n')

(12294, 7)
(7813737, 3)


In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
# Number of missing values in each column.
df_anime.isna().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word 1- to 3-grams of the genre text; terms must occur in at
# least 3 titles, English stop words are removed and accents are stripped.
tf_idf_vec = TfidfVectorizer(
    min_df= 3,
    token_pattern= r'\w{1,}',
    stop_words= 'english',
    ngram_range= (1, 3),
    strip_accents= 'unicode',
)

# Drop rows with any missing value, then renumber the index 0..n-1.
# Without the renumbering the index keeps gaps where rows were removed, so an
# index label no longer equals the row position in the TF-IDF matrix built
# below, and any label-based lookup into that matrix hits the wrong row.
df_anime = df_anime.dropna().reset_index(drop= True)

# The list -> str round trip only adds brackets, quotes and commas, none of
# which match the \w token pattern, so the tokens are the plain genre words.
anime_genres = df_anime['genre'].str.split(',').astype(str)
tf_idf_vec_matrix = tf_idf_vec.fit_transform(anime_genres)

In [7]:
import re

def text_cleaning(text):
    """Strip HTML-entity residue and decorative markers from an anime title.

    Replacements are literal (no regex) and applied in order:
      * ``&quot;``   -> removed
      * ``.hack//``  -> removed (only a literal leading dot matches)
      * ``A&#039;s`` -> removed
      * ``I&#039;``  -> ``I'``
      * ``&#039;``   -> removed (every remaining encoded apostrophe)
      * ``&amp;``    -> ``and``
      * ``☆``        -> removed

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    str
        The cleaned title.
    """
    # Order matters: the two specific apostrophe rules must run before the
    # catch-all '&#039;' rule, otherwise the catch-all consumes the entity
    # first and the specific rules can never match.
    replacements = (
        ('&quot;', ''),
        ('.hack//', ''),
        ('A&#039;s', ''),
        ('I&#039;', 'I\''),
        ('&#039;', ''),
        ('&amp;', 'and'),
        ('☆', ''),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text

# Clean every title element-wise.
df_anime['name'] = df_anime['name'].map(text_cleaning)

In [8]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between the TF-IDF genre vectors;
# row/column i corresponds to the i-th row of df_anime.
sigmoid = sigmoid_kernel(tf_idf_vec_matrix)

# Title -> row position in `sigmoid`. Positions (0..n-1) are stored rather than
# df_anime's index labels, because the labels stop matching matrix rows once
# rows have been dropped from the frame.
indices = pd.Series(range(len(df_anime)), index= df_anime['name'])
# De-duplicate on the title (the Series index), keeping the first occurrence.
# Series.drop_duplicates() would compare the stored values, which are already
# unique, and so would leave repeated titles in place.
indices = indices[~indices.index.duplicated(keep= 'first')]

In [9]:
def recommend(title, sig = sigmoid, top_n = 11):
    """Return the titles whose genre vectors score highest against `title`.

    Parameters
    ----------
    title : str
        Cleaned anime name; must be a key of the module-level `indices` Series.
    sig : 2-D array, optional
        Pairwise similarity matrix whose rows line up with `df_anime` rows.
        Defaults to the `sigmoid` matrix that exists when this function is defined.
    top_n : int, optional
        Number of rows to return. The queried title usually scores highest
        against itself, so it normally occupies one of these rows. Default 11.

    Returns
    -------
    pandas.DataFrame
        Columns ``Anime`` and ``Rating``, ordered by descending similarity.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    index = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first entry so `sig[index]` stays a single row.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    scores = np.asarray(sig[index])
    # Stable argsort on the negated scores gives descending similarity while
    # keeping tied rows in their original order.
    anime_indices = np.argsort(-scores, kind= 'stable')[:top_n]

    return pd.DataFrame({
        'Anime': df_anime['name'].iloc[anime_indices].values,
        'Rating': df_anime['rating'].iloc[anime_indices].values,
    })

In [14]:
# Only an unknown title (KeyError from the title lookup) means "nothing found".
# Any other exception is a real bug and is left to surface with its traceback.
try:
    print(recommend('Sword Art Online'))
except KeyError:
    print('No recommend is found')

                                          Anime  Rating
0                              Sword Art Online    7.83
1                           Sword Art Online II    7.35
2               Sword Art Online: Extra Edition    7.00
3               Sword Art Online II: Debriefing    6.56
4                         Ys IV: The Dawn of Ys    6.12
5   Bakugan Battle Brawlers: Gundalian Invaders    6.32
6     Bakugan Battle Brawlers: Mechtanium Surge    6.29
7                           Pokemon Generations    7.21
8                                   Log Horizon    8.14
9                        Log Horizon 2nd Season    7.66
10                                     Overlord    8.04
