In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
animes = pd.read_csv("data/anime.csv")
ratings = pd.read_csv("data/rating.csv")

In [4]:
animes.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [10]:
ratings[ratings['rating'] == -1] ##need to drop these rows

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813628,73515,2385,-1
7813629,73515,2386,-1
7813631,73515,2490,-1
7813635,73515,2680,-1


In [19]:
##drop all NaN rating from animes and all -1 ratings from rating dataset
animes.dropna(subset = ['rating'], inplace = True)

In [21]:
ratings = ratings[ratings.rating != -1]


In [None]:
##TEST -> using just animes df, use genres and rating as feature for similarities?

In [28]:
animesTest = animes.drop(columns = ['anime_id', 'type', 'members', 'episodes'], axis = 1)

In [44]:
animesTest['genre'] = animesTest['genre'].astype('str')

In [47]:
animesTest['genre'] = animesTest['genre'].apply(lambda x: x.split(','))

In [50]:
animesTest.rename(columns = {"genre": "genres"}, inplace = True)

In [51]:
animesTest

Unnamed: 0,name,genres,rating
0,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",9.37
1,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic,...",9.26
2,Gintama°,"[Action, Comedy, Historical, Parody, Samur...",9.25
3,Steins;Gate,"[Sci-Fi, Thriller]",9.17
4,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samur...",9.16
...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,[Hentai],4.15
12290,Under World,[Hentai],4.28
12291,Violence Gekiga David no Hoshi,[Hentai],4.88
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,[Hentai],4.98


In [54]:
##count how many genres there are
from collections import Counter
genre_freq = Counter(g for genres in animesTest['genres'] for g in genres)
print(f"There are {len(genre_freq)} genres.")
##genre_freq

There are 83 genres.


In [52]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)

In [55]:
##one hot encoding for all 83 genres, sparse output helps conserve memory
animesTest = animesTest.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(animesTest.pop('genres')),
                index=animesTest.index,
                columns=mlb.classes_))

In [56]:
animesTest.reset_

Unnamed: 0,name,rating,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,nan
0,Kimi no Na wa.,9.37,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,9.26,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,9.25,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,9.17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Gintama&#039;,9.16,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,Toushindai My Lover: Minami tai Mecha-Minami,4.15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12290,Under World,4.28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12291,Violence Gekiga David no Hoshi,4.88,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12292,Violence Gekiga Shin David no Hoshi: Inma Dens...,4.98,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
##Nans are for animes with no genre -> would have to manually fill that information in

In [59]:
##have to reset index as the numbers don't line up(due to earlier dropped cols)
##doing this will also allow us to access anime names by index from filter matrix later
animesTest.reset_index(drop = True, inplace = True)

In [61]:
animesTest

Unnamed: 0,name,rating,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,nan
0,Kimi no Na wa.,9.37,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Fullmetal Alchemist: Brotherhood,9.26,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Gintama°,9.25,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Steins;Gate,9.17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Gintama&#039;,9.16,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12059,Toushindai My Lover: Minami tai Mecha-Minami,4.15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12060,Under World,4.28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12061,Violence Gekiga David no Hoshi,4.88,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12062,Violence Gekiga Shin David no Hoshi: Inma Dens...,4.98,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
##can now build sparse matrix -> will have "rating" and then every genre for each anime
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(animesTest.drop(columns = ['name', 'rating'], axis = 1), animesTest.drop(columns = ['name', 'rating'], axis = 1))
print(f"Dimensions of our genres cosine similarity matrix: {cosine_sim.shape}")

Dimensions of our genres cosine similarity matrix: (12064, 12064)


In [90]:
title = "Naruto"

In [100]:
anime_idx = dict(zip(animesTest['name'], list(animesTest.index)))
idx = anime_idx["Naruto"]
print(f"Movie index for Naruto: {idx}")

Movie index for Naruto: 841


In [77]:
animesTest.iloc[841]

In [93]:
##get 10 most similar animes to naruto as test
n_recommendations=20
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
for index in sim_scores:
    if index[0] == idx:
        sim_scores.remove(index)
sim_scores[0:n_recommendations+1]

[(486, 0.9999999999999999),
 (615, 0.9999999999999999),
 (1103, 0.9999999999999999),
 (1343, 0.9999999999999999),
 (1472, 0.9999999999999999),
 (1573, 0.9999999999999999),
 (2458, 0.9999999999999999),
 (2997, 0.9999999999999999),
 (175, 0.8944271909999159),
 (7628, 0.8944271909999159),
 (7837, 0.8944271909999159),
 (206, 0.8451542547285164),
 (515, 0.8451542547285164),
 (588, 0.8451542547285164),
 (1209, 0.8451542547285164),
 (1409, 0.8451542547285164),
 (1930, 0.8451542547285164),
 (2615, 0.8451542547285164),
 (3038, 0.8451542547285164),
 (3203, 0.8451542547285164),
 (4275, 0.8451542547285164)]

In [94]:
similar_animes = [i[0] for i in sim_scores[1: (n_recommendations+1)]]
similar_animes

[615,
 1103,
 1343,
 1472,
 1573,
 2458,
 2997,
 175,
 7628,
 7837,
 206,
 515,
 588,
 1209,
 1409,
 1930,
 2615,
 3038,
 3203,
 4275]

In [95]:
print("Because you watched Naruto:")
animesTest['name'].iloc[similar_animes]

Because you watched Naruto:


615                                    Naruto: Shippuuden
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
1472          Naruto: Shippuuden Movie 4 - The Lost Tower
1573    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
7628                              Kyutai Panic Adventure!
7837                        Battle Spirits: Ryuuko no Ken
206                                         Dragon Ball Z
515                                Dragon Ball Kai (2014)
588                                       Dragon Ball Kai
1209                                  Medaka Box Abnormal
1409                Dragon Ball Z Movie 15: Fukkatsu no F
1930                                    Dragon Ball Super
2615                                           Medaka Box
3038          

In [None]:
##next two things to implement
##remove all similar shows from recommendations -> include this IN the function
##have function to do it all in one go

In [102]:
##create movie findign function with fuzzywuzzy
%pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [103]:
from fuzzywuzzy import process

def anime_finder(title):
    all_titles = animesTest['name'].tolist()
    closest_match = process.extractOne(title, all_titles)
    return closest_match[0]

In [135]:
title = anime_finder("nart")
title

'Naruto'

In [166]:
def content_based_recommendation(title_string, n_recommendations = 10, same_series = True):
    title = anime_finder(title_string)
    idx = anime_idx[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    count = 0
    iterative = 0
    sim_animes = []
    if same_series is False:
        while count < n_recommendations:
            if title not in animesTest.iloc[sim_scores[iterative][0]]['name']:
                sim_animes.append(sim_scores[iterative])
                count += 1         
            iterative += 1
    else:
        sim_animes = sim_scores[1:(n_recommendations+1)]
    ##sim_scores = sim_scores[1:(n_recommendations+1)]
    similar_animes = [i[0] for i in sim_animes]
    print(f"Because you watched {title}:")
    print(animesTest['name'].iloc[similar_animes])
    #print(sim_animes[0:50])

In [173]:
content_based_recommendation("Naruto Shippuden", 51, False)

Because you watched Naruto: Shippuuden:
486                              Boruto: Naruto the Movie
841                                                Naruto
1103    Boruto: Naruto the Movie - Naruto ga Hokage ni...
1343                                          Naruto x UT
2458                 Naruto Shippuuden: Sunny Side Battle
2997    Naruto Soyokazeden Movie: Naruto to Mashin to ...
175                                Katekyo Hitman Reborn!
7628                              Kyutai Panic Adventure!
7837                        Battle Spirits: Ryuuko no Ken
206                                         Dragon Ball Z
515                                Dragon Ball Kai (2014)
588                                       Dragon Ball Kai
1209                                  Medaka Box Abnormal
1409                Dragon Ball Z Movie 15: Fukkatsu no F
1930                                    Dragon Ball Super
2615                                           Medaka Box
3038                            

In [170]:
##Finished Content Filtering -> Now Filter Through Similar User Ratings