In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functions.textprocess import process 
import pandas as pd
import pickle

In [2]:
column_names = ['user_id', 'anime_id', 'rating']
userdf = pd.read_csv('data/rating_complete.csv', sep=',', names=column_names, skiprows=[0])
userdf.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


In [3]:
userdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


In [4]:
anime_votes = userdf.groupby('anime_id').size().reset_index(name='vote_count')
anime_votes.rename(columns={"anime_id":"MAL_ID"},inplace=True)
anime_votes.head()

Unnamed: 0,MAL_ID,vote_count
0,1,79714
1,5,33613
2,6,46970
3,7,10224
4,8,1514


In [5]:

df = pd.read_csv("data/anime_with_synopsis.csv")
df.dropna(inplace=True,subset=['sypnopsis'])
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16206 entries, 0 to 16213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   MAL_ID     16206 non-null  int64 
 1   Name       16206 non-null  object
 2   Score      16206 non-null  object
 3   Genres     16206 non-null  object
 4   sypnopsis  16206 non-null  object
dtypes: int64(1), object(4)
memory usage: 759.7+ KB


In [7]:
df = pd.merge(df, anime_votes, on='MAL_ID')
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis,vote_count
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",79714
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",33613
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",46970
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,10224
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,1514


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15547 entries, 0 to 15546
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   MAL_ID      15547 non-null  int64 
 1   Name        15547 non-null  object
 2   Score       15547 non-null  object
 3   Genres      15547 non-null  object
 4   sypnopsis   15547 non-null  object
 5   vote_count  15547 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 728.9+ KB


In [10]:
df = df[df['vote_count'] > 500]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6334 entries, 0 to 15543
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   MAL_ID      6334 non-null   int64 
 1   Name        6334 non-null   object
 2   Score       6334 non-null   object
 3   Genres      6334 non-null   object
 4   sypnopsis   6334 non-null   object
 5   vote_count  6334 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 346.4+ KB


In [11]:
df['tags'] = df['sypnopsis']+df['Genres']
df['tags'] = df['tags'].apply(process)
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis,vote_count,tags
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever...",79714,in the year humanity has colonized sever...
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ...",33613,other day another bounty such is the life of ...
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0...",46970,vash the stampede is the man with a ...
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...,10224,ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...,1514,it is the dark century and the people are suff...


In [12]:
df.sort_values('vote_count',ascending=False)

Unnamed: 0,MAL_ID,Name,Score,Genres,sypnopsis,vote_count,tags
1333,1535,Death Note,8.63,"Mystery, Police, Psychological, Supernatural, ...","shinigami, as a god of death, can kill any per...",182375,shinigami as a god of death can kill any per...
6538,16498,Shingeki no Kyojin,8.48,"Action, Military, Mystery, Super Power, Drama,...","Centuries ago, mankind was slaughtered to near...",169794,centuries ago mankind was slaughtered to near...
5776,11757,Sword Art Online,7.25,"Action, Game, Adventure, Romance, Fantasy","In the year 2022, virtual reality has progress...",161192,in the year virtual reality has progress...
3974,6547,Angel Beats!,8.15,"Action, Comedy, Drama, School, Supernatural",Otonashi awakens only to learn he is dead. A r...,141127,otonashi awakens only to learn he is dead a r...
9363,30276,One Punch Man,8.57,"Action, Sci-Fi, Comedy, Parody, Super Power, S...",The seemingly ordinary and unimpressive Saitam...,138924,the seemingly ordinary and unimpressive saitam...
...,...,...,...,...,...,...,...
3177,4389,Amada Anime Series: Super Mario Brothers,4.89,"Adventure, Comedy, Shounen",The Super Mario Amada Series are a series of s...,502,the super mario amada series are a series of s...
4347,7518,Boku wa Kuma,5.74,"Music, Slice of Life, Kids",stop motion animation produced by NHK. A color...,502,stop motion animation produced by nhk a color...
2820,3602,Ginga Tetsudou 999: Eternal Fantasy,6.63,"Action, Drama, Sci-Fi, Space",Based on the story by manga master Leiji Matsu...,501,based on the story by manga master leiji matsu...
3024,3990,Kumo to Tulip,5.73,Adventure,spider attempts to lure a ladybug into his web...,501,spider attempts to lure a ladybug into his web...


In [13]:
cv=CountVectorizer(max_features=10000, stop_words='english')
vector=cv.fit_transform(df['tags'].values.astype('U')).toarray()

In [14]:
similarity=cosine_similarity(vector)

In [15]:
def recommand(anime,count):
    id = df[df['Name']==anime].index[0]
    distance = sorted(list(enumerate(similarity[id])), reverse=True, key=lambda vector:vector[1])
    for i in distance[1:count]:
        print(df.iloc[i[0]].Name)

In [16]:
#PREDICT
recommand("Sword Art Online",10)

Soukyuu no Fafner: Dead Aggressor - Exodus
No Game No Life: Zero
Gasaraki
Malice@Doll
Je T'aime
Strike Witches
Kishin Taisen Gigantic Formula
Karen Senki
Betterman


In [17]:
# Save 
pickle.dump(df, open('pickle/anime_list.pkl', 'wb'))
pickle.dump(similarity, open('pickle/similarity.pkl', 'wb'))
