In [1]:
import numpy as np
import pandas as pd

In [None]:
raw_data_anime = pd.read_csv('./anime.csv')
raw_data_rating = pd.read_csv('./rating.csv')

In [3]:
raw_data_anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


- anime_id: myanimelist.net에서 애니메이션을 식별하는 고유 ID.
- name: 애니메이션의 전체 이름.
- genre: 이 애니메이션의 장르를 쉼표로 구분한 목록.
- type: 영화, TV, OVA 등 애니메이션의 유형.
- episodes: 이 애니메이션의 에피소드 수 (영화일 경우 1).
- rating: 이 애니메이션의 평균 평점 (10점 만점).
- members: 이 애니메이션 커뮤니티에 속한 회원 수.

In [4]:
raw_data_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
raw_data_rating.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


- user_id: 식별 불가능한 무작위로 생성된 사용자 ID.
- anime_id: 사용자가 평가한 애니메이션.
- rating: 사용자가 부여한 10점 만점의 평점 (-1은 사용자가 시청했지만 평점을 부여하지 않은 경우).

In [6]:
raw_data_anime['name'] = raw_data_anime.name.str.replace('&qout;', '')
raw_data_anime['name'] = raw_data_anime.name.str.replace('&#039;', '')
raw_data_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


- 애니메이션 데이터 셋의 제목에는 특수문자가 Html 문자 형태로 섞여있는 것을 확인
- '&quot';와 #039 2개를 name에서 모두 지워줌.

In [7]:
data_user_rating = raw_data_rating[raw_data_rating['rating'] != -1]
data_user_rating.reset_index(drop=True, inplace = True)
data_user_rating

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10
...,...,...,...
6337236,73515,16512,7
6337237,73515,17187,9
6337238,73515,22145,10
6337239,73516,790,9


- rating 데이터에 유저가 평가하지 않은 정보 포함
- 유저가 평가 하지 않은 정보는 제외함.

In [8]:
rated_anime_id_list = data_user_rating["anime_id"].unique()

raw_data_anime = raw_data_anime[raw_data_anime["anime_id"].isin(rated_anime_id_list)]
raw_data_anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12288,5541,The Satisfaction,Hentai,OVA,1,4.37,166
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219


- data_user_rating에 id 중 실제 매칭이 되는 애니메이션 id값만 추출을 진행

In [9]:
type_list = raw_data_anime['type'].unique()
tag_list = type_list
tag_list

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA', nan], dtype=object)

In [10]:
type_list = raw_data_anime['type'].unique()[:-1]
tag_list = type_list
tag_list

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

- 가장 마지막에 nan이 들어가 있음

In [11]:
raw_data_anime.loc[:, tag_list] = '-'

for t in type_list:
    raw_data_anime.loc[:, t] = np.where(raw_data_anime['type'] == t, "o", "-")

raw_data_anime = raw_data_anime.drop(columns=['type', 'episodes', 'members'])

raw_data_anime

Unnamed: 0,anime_id,name,genre,rating,Movie,TV,OVA,Special,Music,ONA
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",9.37,o,-,-,-,-,-
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",9.26,-,o,-,-,-,-
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25,-,o,-,-,-,-
3,9253,Steins;Gate,"Sci-Fi, Thriller",9.17,-,o,-,-,-,-
4,9969,Gintama,"Action, Comedy, Historical, Parody, Samurai, S...",9.16,-,o,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...
12288,5541,The Satisfaction,Hentai,4.37,-,-,o,-,-,-
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,4.15,-,-,o,-,-,-
12290,5543,Under World,Hentai,4.28,-,-,o,-,-,-
12291,5621,Violence Gekiga David no Hoshi,Hentai,4.88,-,-,o,-,-,-


In [12]:
genre_tag_dict = {}
genre_list = []

for s in raw_data_anime.genre.str.split(', ') :
    # NaN은 기본적으로 float 값을 가진다.
    if type(s) == type(0.1):
        continue

    for g in s :
        genre_tag_dict[g] = 0

for key, val in genre_tag_dict.items():
    genre_list.append(key)

tag_list = tag_list.tolist()
tag_list += genre_list
genre_list

['Drama',
 'Romance',
 'School',
 'Supernatural',
 'Action',
 'Adventure',
 'Fantasy',
 'Magic',
 'Military',
 'Shounen',
 'Comedy',
 'Historical',
 'Parody',
 'Samurai',
 'Sci-Fi',
 'Thriller',
 'Sports',
 'Super Power',
 'Space',
 'Slice of Life',
 'Mecha',
 'Music',
 'Mystery',
 'Seinen',
 'Martial Arts',
 'Vampire',
 'Shoujo',
 'Horror',
 'Police',
 'Psychological',
 'Demons',
 'Ecchi',
 'Josei',
 'Shounen Ai',
 'Game',
 'Dementia',
 'Harem',
 'Cars',
 'Kids',
 'Shoujo Ai',
 'Hentai',
 'Yaoi',
 'Yuri']

In [13]:
for g in genre_list :
    raw_data_anime[g] = np.where(raw_data_anime.genre.str.find(g) != -1, "o", "-")

data_ani = raw_data_anime.drop(columns=['genre'])
data_ani = data_ani.set_index("anime_id")
data_ani

Unnamed: 0_level_0,name,rating,Movie,TV,OVA,Special,Music,ONA,Drama,Romance,...,Shounen Ai,Game,Dementia,Harem,Cars,Kids,Shoujo Ai,Hentai,Yaoi,Yuri
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,Kimi no Na wa.,9.37,o,-,-,-,-,-,o,o,...,-,-,-,-,-,-,-,-,-,-
5114,Fullmetal Alchemist: Brotherhood,9.26,-,o,-,-,-,-,o,-,...,-,-,-,-,-,-,-,-,-,-
28977,Gintama°,9.25,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
9253,Steins;Gate,9.17,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
9969,Gintama,9.16,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5541,The Satisfaction,4.37,-,-,o,-,-,-,-,-,...,-,-,-,-,-,-,-,o,-,-
9316,Toushindai My Lover: Minami tai Mecha-Minami,4.15,-,-,o,-,-,-,-,-,...,-,-,-,-,-,-,-,o,-,-
5543,Under World,4.28,-,-,o,-,-,-,-,-,...,-,-,-,-,-,-,-,o,-,-
5621,Violence Gekiga David no Hoshi,4.88,-,-,o,-,-,-,-,-,...,-,-,-,-,-,-,-,o,-,-


In [14]:
data_merge = pd.merge(data_user_rating, data_ani['name'], on = "anime_id")
data_merge

Unnamed: 0,user_id,anime_id,rating,name
0,1,8074,10,Highschool of the Dead
1,1,11617,10,High School DxD
2,1,11757,10,Sword Art Online
3,1,15451,10,High School DxD New
4,2,11771,10,Kuroko no Basket
...,...,...,...,...
6337234,73515,16512,7,Devil Survivor 2 The Animation
6337235,73515,17187,9,Ghost in the Shell: Arise - Border:1 Ghost Pain
6337236,73515,22145,10,Kuroshitsuji: Book of Circus
6337237,73516,790,9,Ergo Proxy


In [15]:
rating_matrix = data_merge.pivot_table(values='rating', index = 'user_id', columns='name')

rating_matrix

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,,,,,,,,,,,...,,,,,,,,,,
73513,,,,,,,,,,,...,,,,,,,,,,
73514,,,,,,,,,,,...,,,,,,,,,,
73515,,,,,,,,,,,...,,,9.0,,,,,,,


In [16]:
rating_matrix = rating_matrix.fillna(0)
rating_matrix

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,...,lilac (bombs Jun Togawa),makemagic,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
rating_matrix_numpy = rating_matrix.values.T
rating_matrix_numpy.shape

(9922, 69600)

In [19]:
usabser_count, ani_count = rating_matrix_numpy.shape

from sklearn.decomposition import TruncatedSVD
SVD = TruncatedSVD(n_components =12)
matrix = SVD.fit_transform(rating_matrix_numpy)
matrix.shape

(9922, 12)

In [20]:
corr = np.corrcoef(matrix)
corr.shape

(9922, 9922)

In [21]:
ani_title_list = list(rating_matrix.columns)
#ani_title_list

['&quot;0&quot;',
 '&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu',
 '&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi',
 '&quot;Bungaku Shoujo&quot; Memoire',
 '&quot;Bungaku Shoujo&quot; Movie',
 '&quot;Eiji&quot;',
 '.hack//G.U. Returner',
 '.hack//G.U. Trilogy',
 '.hack//G.U. Trilogy: Parody Mode',
 '.hack//Gift',
 '.hack//Intermezzo',
 '.hack//Liminality',
 '.hack//Quantum',
 '.hack//Quantum: Sore ike! Bokura no Chimuchimu-chan!!',
 '.hack//Roots',
 '.hack//Sign',
 '.hack//Tasogare no Udewa Densetsu',
 '.hack//Tasogare no Udewa Densetsu: Offline de Aimashou',
 '.hack//The Movie: Sekai no Mukou ni',
 '.hack//Unison',
 '.hack//Versus: The Thanatos Report',
 '001',
 '009 Re:Cyborg',
 '009-1',
 '009-1: R&amp;B',
 '00:08',
 '07-Ghost',
 '1+2=Paradise',
 '100%',
 '100-man-nen Chikyuu no Tabi: Bander Book',
 '1000-nen Joou: Queen Millennia',
 '1001 Nights',
 '11-nin Iru!',
 '11eyes',
 '11eyes Picture Drama',
 '11eyes: Momoiro Genmutan',
 '12-gatsu no Uta',
 '12-sai.

In [24]:
selected_title = input("Input Animation Title :")

idx = ani_title_list.index(selected_title)
similarity = corr[idx]

idx = 0
idx_dict = {}
for val in similarity :
    if val > 0.9 :
        idx_dict[val] = ani_title_list[idx]
    idx += 1

ani_names = []
limit = 10
idx = 0
for key, val in sorted(idx_dict.items(), reverse=True):
    if limit < idx:
        break

    if idx == 0:
        idx += 1
        continue

    ani_names.append(val)
    idx += 1

answer_table = data_ani[data_ani['name'].isin(ani_names)]
answer_table

Input Animation Title : Haikyuu!!


Unnamed: 0_level_0,name,rating,Movie,TV,OVA,Special,Music,ONA,Drama,Romance,...,Shounen Ai,Game,Dementia,Harem,Cars,Kids,Shoujo Ai,Hentai,Yaoi,Yuri
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28891,Haikyuu!! Second Season,8.93,-,o,-,-,-,-,o,-,...,-,-,-,-,-,-,-,-,-,-
24415,Kuroko no Basket 3rd Season,8.62,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
16894,Kuroko no Basket 2nd Season,8.58,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
22789,Barakamon,8.5,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
30230,Diamond no Ace: Second Season,8.5,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
11771,Kuroko no Basket,8.46,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
24277,Yowamushi Pedal: Grande Road,8.28,-,o,-,-,-,-,o,-,...,-,-,-,-,-,-,-,-,-,-
18689,Diamond no Ace,8.25,-,o,-,-,-,-,-,-,...,-,-,-,-,-,-,-,-,-,-
18179,Yowamushi Pedal,8.16,-,o,-,-,-,-,o,-,...,-,-,-,-,-,-,-,-,-,-
21185,Baby Steps,7.96,-,o,-,-,-,-,-,o,...,-,-,-,-,-,-,-,-,-,-
