In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
reviews1 = pd.read_csv('animerevs.csv')
reviews2 = pd.read_csv('animerevs2.csv')
# We scraped the reviews into two files. 

In [3]:
reviews2.tail()

Unnamed: 0.1,Unnamed: 0,Animation,Character,Enjoyment,Overall,Sound,Story,anime_id,text,unkey,user
40147,40147,7,1,4,3,4,2,25283,Spoilers will be...,252832LongDidntReview,2LongDidntReview
40148,40148,2,1,1,1,2,1,25283,This anime was d...,25283MachoBro,MachoBro
40149,40149,9,9,6,7,9,3,25285,Story: not much ...,25285AnimeFan48,AnimeFan48
40150,40150,6,9,8,7,7,8,25303,If you've ever w...,25303daydreamingninja,daydreamingninja
40151,40151,9,10,10,9,7,9,25313,10 Mod Note: This...,25313BlobbertMcN,BlobbertMcN


In [4]:
reviews1.tail()

Unnamed: 0.1,Unnamed: 0,Animation,Character,Enjoyment,Overall,Sound,Story,anime_id,text,unkey,user
54571,54571,9,8,9,8,8,7,10156,Such harsh revie...,10156Catseyes,Catseyes
54572,54572,7,3,6,6,6,3,10156,I really should ...,10156Raffinate,Raffinate
54573,54573,9,6,5,6,7,6,10156,[THIS REVIEW CON...,10156ChioneDyrken,ChioneDyrken
54574,54574,5,5,3,3,3,3,10156,Honestly this fe...,10156Shaja,Shaja
54575,54575,9,6,6,7,8,5,10156,"Well, a typical ...",10156Jo-izo,Jo-izo


In [5]:
reviews2.shape

(40152, 11)

In [6]:
reviews1.shape

(54576, 11)

In [7]:
reviews = pd.concat([reviews1,reviews2]) # we need to merge our two reviews. 

In [8]:
reviews.anime_id.nunique()# Let's check and see how many anime have reviews. 

6476

In [9]:
reviews.shape

(94728, 11)

In [10]:
reviews.unkey.nunique()#let's make sure that we don't have any duplicate reviews. 

94508

In [11]:
# we only have 220 duplicate reviews

In [12]:
reviews.drop_duplicates(subset='unkey',inplace=True)

In [13]:
reviews.shape# our shape is now the number of unique keys. 

(94508, 11)

Now that we've cleaned our review data, what we want is a single text string for every distinct anime. 

In [14]:
anime_ids = reviews.anime_id.unique()

In [15]:
mergedreviewlist =[]


In [16]:
animeinfo= pd.read_csv('animeinfo3.csv')

In [17]:
anime_names = pd.read_csv('animeinfo3.csv').set_index('animeid')['name'].to_dict()

In [19]:
for num in tqdm(range(len(anime_ids))): # We go through the list of unique anime ids, then we make a single dictionary object with all of the reviews as text. 
    animerevdict = {}
    tempstr = ''
    tempdf = reviews[reviews['anime_id']==anime_ids[num]]
    #print(tempdf.shape[0])
    for revnum in range(tempdf.shape[0]):
        
        anime_name = anime_names[anime_ids[num]]
        str_name_anime_in =str(tempdf.iloc[revnum]['text']) 
        str_stripped= str_name_anime_in.replace(anime_name,'anime_name')
        
        tempstr = tempstr + str_stripped
    animerevdict['anime_id']= anime_ids[num]
    animerevdict['revtext']=tempstr
    mergedreviewlist.append(animerevdict)

100%|██████████| 6476/6476 [00:24<00:00, 263.44it/s]


In [21]:
reviews_to_vec = pd.DataFrame(mergedreviewlist)

In [None]:
reviews_to_vec.loc[2]['revtext']

In [23]:
reviews_to_vec.head()

Unnamed: 0,anime_id,revtext
0,1,ent9 People who k...
1,32772,"Okay, since most..."
2,6,ent10 anime_name ...
3,7,9 anime_name (WHR...
4,8,What can I say a...


In [24]:
stopwords = ['another','anime','the','that','and','to','genre','show','fan','thing','manga', 'season','second', 'anime_name' ]

In [25]:
tfidf = TfidfVectorizer(ngram_range=[1,3],stop_words=stopwords,)
tfidf2 = TfidfVectorizer(ngram_range=[1,3],stop_words=stopwords,binary=True)
tfidf3 = TfidfVectorizer(ngram_range=[1,3],stop_words=stopwords,max_features=900)
tfidf4 = TfidfVectorizer(ngram_range=[1,3],stop_words=stopwords,binary=True, max_features=600)# max features 600 and binary true will not produce a good output together

In [None]:

#anime_review_words4 = tfidf4.fit_transform(reviews_to_vec['revtext']) # this experiment generated a result of 1 for too many things, 

In [26]:
anime_review_words3 = tfidf3.fit_transform(reviews_to_vec['revtext'])# this will take a very long time 

In [35]:
anime_review_words2 = tfidf2.fit_transform(reviews_to_vec['revtext'])# this will take a very long time 

In [None]:
anime_review_words1 = tfidf.fit_transform(reviews_to_vec['revtext'])# this will take a very long time 

In [None]:
tempdf = reviews[reviews['anime_id']==anime_ids[num]]

In [None]:
rev1cos =cosine_similarity(anime_review_words1)
reviews_to_vec.to_csv('reviews_to_vectorize.csv')


In [28]:
animeinfo= pd.read_csv('animeinfo3.csv')
animenames=pd.merge(reviews_to_vec,animeinfo,left_on='anime_id',right_on='animeid')

In [None]:
rev1_sim_cos_df = pd.DataFrame(rev1cos, index=animenames.name.values , columns=animenames.name.values)

In [None]:
rev1_sim_cos_df.loc[:,'Death Note'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Sword Art Online'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Girls und Panzer'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Katanagatari'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'One Punch Man'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Another'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Girls und Panzer'].sort_values()[-20:]

In [None]:
rev1_sim_cos_df.loc[:,'Katanagatari'].sort_values()[-20:]

In [37]:
rev2cos =cosine_similarity(anime_review_words2)

In [38]:
rev2_sim_cos_df = pd.DataFrame(rev2cos, index=animenames.name.values , columns=animenames.name.values)

In [39]:
rev2_sim_cos_df.loc[:,'Girls und Panzer'].sort_values()[-20:]

Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.    0.032295
Guilty Crown                                                0.032316
Sakurasou no Pet na Kanojo                                  0.032325
Shinsekai yori                                              0.032412
Sword Art Online                                            0.032477
Charlotte                                                   0.032492
Fairy Tail                                                  0.032603
Hunter x Hunter (2011)                                      0.032639
Durarara!!                                                  0.033026
Another                                                     0.033264
Highschool of the Dead                                      0.033346
Steins;Gate                                                 0.033365
Kill la Kill                                                0.033432
Clannad                                                     0.033533
Akame ga Kill!                    

In [40]:
rev2_sim_cos_df.loc[:,'One Punch Man'].sort_values()[-20:]

Tokyo Ghoul                              0.049956
Charlotte                                0.050207
Neon Genesis Evangelion                  0.050209
School Days                              0.050259
Clannad                                  0.050469
Shigatsu wa Kimi no Uso                  0.050787
One Piece                                0.051518
Mahou Shoujo Madoka★Magica               0.051967
Fairy Tail                               0.052103
Guilty Crown                             0.052807
Akame ga Kill!                           0.053000
Boku dake ga Inai Machi                  0.053236
Hunter x Hunter (2011)                   0.054601
Re:Zero kara Hajimeru Isekai Seikatsu    0.054977
Death Note                               0.055053
Steins;Gate                              0.055337
Angel Beats!                             0.057018
Shingeki no Kyojin                       0.058860
Sword Art Online                         0.060257
One Punch Man                            1.000000


In [41]:
rev2_sim_cos_df.loc[:,'Bakemonogatari'].sort_values()[-20:]

Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.    0.047904
Higurashi no Naku Koro ni                                   0.048268
Neon Genesis Evangelion                                     0.048528
Shigatsu wa Kimi no Uso                                     0.049579
Guilty Crown                                                0.049614
Another                                                     0.049678
Shinsekai yori                                              0.049702
Elfen Lied                                                  0.049749
Boku dake ga Inai Machi                                     0.049751
Durarara!!                                                  0.049859
School Days                                                 0.050417
Re:Zero kara Hajimeru Isekai Seikatsu                       0.051126
Shingeki no Kyojin                                          0.051350
Death Note                                                  0.051389
Clannad                           

In [42]:
rev2_sim_cos_df.loc[:,'Sword Art Online'].sort_values()[-20:]

Shinsekai yori                           0.060771
Neon Genesis Evangelion                  0.061303
Another                                  0.061337
Clannad                                  0.061427
Hunter x Hunter (2011)                   0.062043
Charlotte                                0.062125
Elfen Lied                               0.063369
Shigatsu wa Kimi no Uso                  0.063916
Akame ga Kill!                           0.065055
Mahou Shoujo Madoka★Magica               0.065600
School Days                              0.067488
Boku dake ga Inai Machi                  0.068118
Guilty Crown                             0.068563
Death Note                               0.068746
Steins;Gate                              0.069966
Re:Zero kara Hajimeru Isekai Seikatsu    0.072313
Sword Art Online II                      0.072765
Shingeki no Kyojin                       0.073761
Angel Beats!                             0.075092
Sword Art Online                         1.000000


In [43]:
rev2_sim_cos_df.loc[:,'Katanagatari'].sort_values()[-20:]

Shigatsu wa Kimi no Uso                  0.042919
Boku dake ga Inai Machi                  0.042979
Baccano!                                 0.043056
Another                                  0.043066
One Punch Man                            0.043095
Clannad                                  0.043153
Bakemonogatari                           0.043602
Durarara!!                               0.043618
Guilty Crown                             0.044031
Re:Zero kara Hajimeru Isekai Seikatsu    0.044097
Shinsekai yori                           0.044293
Akame ga Kill!                           0.044319
Sword Art Online                         0.044468
Death Note                               0.044768
Mahou Shoujo Madoka★Magica               0.044921
Hunter x Hunter (2011)                   0.045599
Shingeki no Kyojin                       0.045768
Steins;Gate                              0.046008
Angel Beats!                             0.046753
Katanagatari                             1.000000


In [44]:
rev2_sim_cos_df.loc[:,'Saraiya Goyou'].sort_values()[-20:]

Clannad                     0.025077
Sakamichi no Apollon        0.025085
Ergo Proxy                  0.025089
Hai to Gensou no Grimgar    0.025097
Ookami to Koushinryou       0.025129
Steins;Gate                 0.025131
Zankyou no Terror           0.025165
Haibane Renmei              0.025211
Cowboy Bebop                0.025417
Death Parade                0.025490
Shinsekai yori              0.025579
Psycho-Pass                 0.025726
Baccano!                    0.025972
Hyouka                      0.026055
Monster                     0.026190
Katanagatari                0.026448
Durarara!!                  0.026919
Samurai Champloo            0.026992
Mushishi                    0.027217
Saraiya Goyou               1.000000
Name: Saraiya Goyou, dtype: float64

In [45]:
rev2_sim_cos_df.loc[:,'Haikyuu!! Second Season'].sort_values()[-20:]

ReLIFE                                                       0.029723
Steins;Gate                                                  0.029764
Fairy Tail                                                   0.029924
Ping Pong The Animation                                      0.029941
Clannad                                                      0.029970
Shigatsu wa Kimi no Uso                                      0.029981
Shingeki no Kyojin                                           0.030108
Boku no Hero Academia                                        0.030151
One Punch Man                                                0.030302
Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.     0.030391
Sakurasou no Pet na Kanojo                                   0.030414
Durarara!!                                                   0.030509
Angel Beats!                                                 0.030618
Hunter x Hunter (2011)                                       0.030887
Diamond no Ace      

In [29]:
rev3cos =cosine_similarity(anime_review_words3)
rev3_sim_cos_df = pd.DataFrame(rev3cos, index=animenames.name.values , columns=animenames.name.values)

In [30]:
rev3_sim_cos_df.loc[:,'Death Note'].sort_values()[-20:]

Neon Genesis Evangelion               0.946979
Kuroshitsuji                          0.947289
Pandora Hearts                        0.947341
Durarara!!                            0.947375
Tokyo Ghoul                           0.947389
Steins;Gate                           0.947703
Trigun                                0.947941
Guilty Crown                          0.948008
Tengen Toppa Gurren Lagann            0.948038
Deadman Wonderland                    0.948632
Ajin                                  0.948639
Higashi no Eden                       0.949293
Eureka Seven                          0.949478
Kenpuu Denki Berserk                  0.949621
Psycho-Pass                           0.950435
Shingeki no Kyojin                    0.950739
Code Geass: Hangyaku no Lelouch R2    0.956811
Monster                               0.957061
Code Geass: Hangyaku no Lelouch       0.959507
Death Note                            1.000000
Name: Death Note, dtype: float64

In [31]:
rev3_sim_cos_df.loc[:,'Girls und Panzer'].sort_values()[-20:]

Freezing                        0.950922
Black Bullet                    0.950982
Flip Flappers                   0.951903
Love Lab                        0.951959
Upotte!!                        0.952152
Strawberry Panic                0.952615
Vividred Operation              0.953387
Shirobako                       0.953632
Lucky☆Star                      0.954153
Hibike! Euphonium               0.955191
K-On!!                          0.955983
K-On!                           0.956498
Yuuki Yuuna wa Yuusha de Aru    0.957138
Highschool of the Dead          0.957623
Gakkougurashi!                  0.957865
Sakura Trick                    0.958905
Gunslinger Girl                 0.961120
Strike Witches                  0.964879
High School Fleet               0.972705
Girls und Panzer                1.000000
Name: Girls und Panzer, dtype: float64

In [32]:
rev3_sim_cos_df.loc[:,'Sword Art Online'].sort_values()[-20:]

Tokyo Ghoul √A                      0.963883
Fate/stay night                     0.964029
Shakugan no Shana                   0.964034
Deadman Wonderland                  0.964544
Phantom: Requiem for the Phantom    0.964862
Eureka Seven                        0.964936
Quanzhi Gaoshou                     0.964940
Gantz                               0.965026
Overlord                            0.965625
Koutetsujou no Kabaneri             0.965909
Charlotte                           0.966076
Shingeki no Kyojin                  0.966080
Tokyo Ghoul                         0.966615
Black Bullet                        0.967811
God Eater                           0.969963
Log Horizon                         0.971295
Guilty Crown                        0.971753
Accel World                         0.976045
Btooom!                             0.976880
Sword Art Online                    1.000000
Name: Sword Art Online, dtype: float64

In [33]:
rev3_sim_cos_df.loc[:,'Katanagatari'].sort_values()[-20:]

Bokurano                      0.972094
Shiki                         0.972279
Shinsekai yori                0.972524
Mousou Dairinin               0.972621
Claymore                      0.972764
Fate/Zero                     0.973001
Ergo Proxy                    0.973610
Tengen Toppa Gurren Lagann    0.973772
Bakemonogatari                0.973989
Eureka Seven                  0.974446
Cowboy Bebop                  0.975275
Mawaru Penguindrum            0.975449
Neon Genesis Evangelion       0.975746
Nanatsu no Taizai             0.975819
Kekkai Sensen                 0.976308
Psycho-Pass                   0.976663
Durarara!!                    0.977811
Samurai Champloo              0.978640
Shingeki no Kyojin            0.979881
Katanagatari                  1.000000
Name: Katanagatari, dtype: float64

In [34]:
rev3_sim_cos_df.loc[:,'Haikyuu!! Second Season'].sort_values()[-20:]

K                                                            0.960760
Magi: The Labyrinth of Magic                                 0.961370
Fullmetal Alchemist: Brotherhood                             0.961427
Rainbow: Nisha Rokubou no Shichinin                          0.961429
Prince of Stride: Alternative                                0.961707
Mawaru Penguindrum                                           0.961833
Phantom: Requiem for the Phantom                             0.962400
Bokurano                                                     0.963304
Fullmetal Alchemist                                          0.963534
Shiki                                                        0.963872
Kokoro Connect                                               0.963901
Angel Beats!                                                 0.964423
Ansatsu Kyoushitsu (TV) 2nd Season                           0.965014
Nanatsu no Taizai                                            0.965119
Haikyuu!!: Karasuno 

In [36]:
rev3_sim_cos_df.loc[:,'Bakemonogatari'].sort_values()[-20:]

Ghost Hunt                    0.971931
Air                           0.971942
FLCL                          0.972058
Sakurasou no Pet na Kanojo    0.973214
Mawaru Penguindrum            0.973865
Katanagatari                  0.973989
Ergo Proxy                    0.974161
Nisemonogatari                0.974669
Elfen Lied                    0.974880
Higurashi no Naku Koro ni     0.975073
Nanatsu no Taizai             0.975275
Psycho-Pass                   0.975927
Shingeki no Kyojin            0.975930
Neon Genesis Evangelion       0.976631
Ookami to Koushinryou         0.977277
Kanon (2006)                  0.977301
Shokugeki no Souma            0.977888
Clannad                       0.978652
Durarara!!                    0.978705
Bakemonogatari                1.000000
Name: Bakemonogatari, dtype: float64