In [1]:
###import required dependencies
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
###Import the data
###we're using the Anime Dataset 2022 from kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"augustinkpadonou","key":"569bdd1c48e4763c45bbe708248d0527"}'}

In [3]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets download -d vishalmane10/anime-dataset-2022

Downloading anime-dataset-2022.zip to /content
 84% 4.00M/4.74M [00:00<00:00, 12.2MB/s]
100% 4.74M/4.74M [00:00<00:00, 11.6MB/s]


In [5]:
!unzip -qq anime-dataset-2022.zip

In [63]:
###Import the csv file as a pd dataframe
anime_data = pd.read_csv('/content/Anime.csv')
#anime_data.head()

In [129]:
##let's view all columns
anime_data.columns

Index(['Rank', 'Name', 'Japanese_name', 'Type', 'Episodes', 'Studio',
       'Release_season', 'Tags', 'Rating', 'Release_year', 'End_year',
       'Voice_actors', 'staff'],
      dtype='object')

In [130]:
## We will use Name, Studio, Tags and staff as features
features_columns = ['Name', 'Studio', 'Tags', 'staff']


for feature in features_columns:
  number_of_nan = anime_data[feature].isnull().sum()
  print(f'{feature} number of nan values is : {number_of_nan} ')

Name number of nan values is : 0 
Studio number of nan values is : 6477 
Tags number of nan values is : 400 
staff number of nan values is : 5490 


In [131]:
#we can see studio column has the highest number of null values

print(f' len before process : {len(anime_data)} ')
cleaned_anime_data = anime_data[anime_data['Tags'].notna()]
cleaned_anime_data[['Studio', 'staff']] = cleaned_anime_data[['Studio', 'staff']].fillna('')
print(f' len after process  : {len(cleaned_anime_data)} ')


 len before process : 18495 
 len after process  : 18095 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [132]:
print(len(cleaned_anime_data))
for feature in features_columns:
  number_of_nan = cleaned_anime_data[feature].isnull().sum()
  print(f'{feature} number of nan values is : {number_of_nan} ')

18095
Name number of nan values is : 0 
Studio number of nan values is : 0 
Tags number of nan values is : 0 
staff number of nan values is : 0 


In [133]:
cleaned_anime_data.columns

Index(['Rank', 'Name', 'Japanese_name', 'Type', 'Episodes', 'Studio',
       'Release_season', 'Tags', 'Rating', 'Release_year', 'End_year',
       'Voice_actors', 'staff'],
      dtype='object')

In [134]:
###ok now that we get rid of nan values and cleared our data let's start building the recommandation system

In [135]:
#let's combine our features to a single column to easily encode them 

combined_anime_data = cleaned_anime_data['Name'] + ' '  + cleaned_anime_data['Studio'] + ' ' + cleaned_anime_data['Tags'] + ' ' + cleaned_anime_data['staff']

In [136]:
print(combined_anime_data)

0        Demon Slayer: Kimetsu no Yaiba - Entertainment...
1        Fruits Basket the Final Season TMS Entertainme...
2        Mo Dao Zu Shi 3 B.C MAY PICTURES Fantasy, Anci...
3        Fullmetal Alchemist: Brotherhood Bones Action,...
4        Attack on Titan 3rd Season: Part II WIT Studio...
                               ...                        
18490    Qin Shi Mingyue: Canghai Hengliu Xiaomeng Spec...
18491    Yi Tang Juchang: Sanguo Yanyi  Chinese Animation 
18492    Fenghuang Ji Xiang Yu Qingming Shanghe Tu  Chi...
18493    Chengshi Jiyi Wo Men de Jieri  Chinese Animati...
18494    Heisei Inu Monogatari Bow: Genshi Inu Monogata...
Length: 18095, dtype: object


In [137]:
###let's vectorize our data 
Vectorizer = TfidfVectorizer()
vect_anime_data = Vectorizer.fit_transform(combined_anime_data)
print(vect_anime_data)

  (0, 14523)	0.1289379617203198
  (0, 17622)	0.12001272767840608
  (0, 471)	0.2348494728482713
  (0, 3741)	0.05447934362723206
  (0, 2492)	0.0546180374220049
  (0, 11441)	0.21302073276678574
  (0, 601)	0.1402747936758737
  (0, 3850)	0.042830037105253856
  (0, 17680)	0.2348494728482713
  (0, 6454)	0.20898070176171096
  (0, 3188)	0.06705968190223686
  (0, 14080)	0.0566604935601245
  (0, 5871)	0.2658655563028191
  (0, 10068)	0.2658655563028191
  (0, 20366)	0.10773228141257171
  (0, 4656)	0.13209109273317632
  (0, 11145)	0.06827035304200656
  (0, 13859)	0.05330630687554039
  (0, 1577)	0.05351946619012797
  (0, 18231)	0.152474806679848
  (0, 17388)	0.15984376356660332
  (0, 14095)	0.17229927203210751
  (0, 1115)	0.1413688701480795
  (0, 11253)	0.14425773253648363
  (0, 6962)	0.12033677204869465
  :	:
  (18092, 829)	0.09399071203332343
  (18093, 8138)	0.4722102230616248
  (18093, 2535)	0.4722102230616248
  (18093, 8261)	0.4294140752794479
  (18093, 3576)	0.2507975931615389
  (18093, 11625)	0

In [138]:
# find cosine_similarity confidence for each row of data
#This will output for each row in our dataset the similarity score in regard to all others rows
similarity = cosine_similarity(vect_anime_data)

In [139]:
#as we aregoing to take a user input and find the highest cosine_similarity value we have to expect that user will make typos in anime name
#We will use the difflib package to match the closest anime title to the user input

list_of_titles = cleaned_anime_data.Name.tolist()
print(list_of_titles)



In [152]:
##get user input and find corresponding anime with difflib
user_input = input('Enter your anime :')

Enter your anime :berserk


In [153]:
matchs = difflib.get_close_matches(user_input, list_of_titles)
print(matchs)

['Berserk']


In [154]:
## as expected we have a list of the corresponding titles
# remark that i voluntarly made a typo in the anime name but we still get a good result
closest_match = matchs[0]

In [155]:
#We're goign to use the index of the anime to match it later with his similarity score 
#cleaned_anime_data['index'] = cleaned_anime_data.index
idx = cleaned_anime_data[cleaned_anime_data.Name == closest_match]['index'].values[0]

In [156]:
cleaned_anime_data[cleaned_anime_data.index == idx]['Name'].values[0]

'Berserk'

In [157]:
##get a similarity value for the anime with his index in regard to every other animes in our dataset

similarity_score = list(enumerate(similarity[idx]))
print(similarity_score)

[(0, 0.029900203224663784), (1, 0.044333453480249176), (2, 0.026690076158964907), (3, 0.14080542501420049), (4, 0.035678765779567245), (5, 0.06317805964905485), (6, 0.0490793231012364), (7, 0.04204594848599795), (8, 0.03213909053276597), (9, 0.19705300224008535), (10, 0.04545181120808152), (11, 0.22823103117205099), (12, 0.06258517732916444), (13, 0.1538784005840109), (14, 0.024207224391408405), (15, 0.05666877606988803), (16, 0.15294938589145066), (17, 0.02489247982330962), (18, 0.03972923442149403), (19, 0.022610074144441893), (20, 0.09512946443681526), (21, 0.04541369207451586), (22, 0.07094302794780251), (23, 0.019998344467222085), (24, 0.13745625305881232), (25, 0.1576027621987834), (26, 0.037420102477324954), (27, 0.06616385326317092), (28, 0.04439433323176505), (29, 0.014924208549443511), (30, 0.04047358387663254), (31, 0.05280208711939936), (32, 0.059614669483443086), (33, 0.06327560135620322), (34, 0.07666433010000204), (35, 0.04627448590207557), (36, 0.10372461348212195), (37

In [158]:
#remark that we have a similarity score with regard to all other anime - mangas
#We will only recommand the most similar anime to user

sorted_similarity_score = sorted(similarity_score, key = lambda x : x[1], reverse = True)

#Now let's generate recommendations for the user input
i = 1
for item in sorted_similarity_score:
	idx = item[0]
	title_from_idx = cleaned_anime_data[cleaned_anime_data.index == idx]['Name'].values
	if (i<30):
		print(title_from_idx)
		i += 1

['Berserk']
['Jormungand: Perfect Order']
['Baccano! Specials']
['Terror in Resonance']
['Sword Art Online: Alicization - War of Underworld: Part II']
['Baccano!']
['One Piece: Episode of East Blue']
['Barefoot Gen']
['Phantom: Requiem for the Phantom']
['The Slime Diaries: That Time I Got Reincarnated as a Slime']
["Natsume's Book of Friends Season 6"]
["Natsume's Book of Friends Movie: Ephemeral Bond"]
["Natsume's Book of Friends Season 5"]
["Natsume's Book of Friends Season 4"]
["Natsume's Book of Friends Season 3"]
["Natsume's Book of Friends"]
["Natsume's Book of Friends Season 2"]
['Blast of Tempest']
['My Next Life as a Villainess: All Routes Lead to Doom! X']
['HENNEKO - The Hentai Prince and the Stony Cat']
['Hotarubi no Mori e']
['Henna ABC']
['Natsume Yuujinchou: Itsuka Yuki no Hi ni']
['Honobono Log']
['Yona of the Dawn OVA']
['Code Geass: Lelouch of the Rebellion R2 Special Edition - Zero Requiem']
['Sound of the Sky']
["Monthly Girls' Nozaki-kun"]
['Oreimo 2 Specials']


In [162]:
cleaned_anime_data[cleaned_anime_data.Name == 'Berserk']['Tags'].values

array(['Action, Fantasy, Seinen, Dark Fantasy, Demons, Medieval, Mercenaries, Overpowered Main Characters, Swordplay, Based on a Manga, Explicit Sex,, Explicit Violence,, Mature Themes,, Physical Abuse,, Sexual Abuse'],
      dtype=object)

In [163]:
cleaned_anime_data[cleaned_anime_data.Name == 'Jormungand: Perfect Order']['Tags'].values

array(['Action, Seinen, Gangs, Guns, Mercenaries, Military, Based on a Manga, Nudity,, Violence'],
      dtype=object)

In [164]:
cleaned_anime_data[cleaned_anime_data.Name == 'Baccano! Specials']['Tags'].values

array(['Action, America, Criminals, Gangs, Mafia, Based on a Light Novel, Violence'],
      dtype=object)

In [None]:
## Notice that the tags for the two first recommandations for berserk have multtple tags in common:
##Action, Seinen(for the first), Violence, ... 

In [None]:
##let's test all this 

In [166]:
##get user input and find corresponding anime with difflib
user_input = input('Enter your anime :')
matchs = difflib.get_close_matches(user_input, list_of_titles)
closest_match = matchs[0]
print('\n')
print(f'Closest Match to your input :')
print('\n')
print(f'...{closest_match} ')
print('\n')
#We're goign to use the index of the anime to match it later with his similarity score 
#cleaned_anime_data['index'] = cleaned_anime_data.index
idx = cleaned_anime_data[cleaned_anime_data.Name == closest_match]['index'].values[0]
cleaned_anime_data[cleaned_anime_data.index == idx]['Name'].values[0]
##get a similarity value for the anime with his index in regard to every other animes in our dataset

similarity_score = list(enumerate(similarity[idx]))
#remark that we have a similarity score with regard to all other anime - mangas
#We will only recommand the most similar anime to user

sorted_similarity_score = sorted(similarity_score, key = lambda x : x[1], reverse = True)

#Now let's generate recommendations for the user input
i = 1
for item in sorted_similarity_score:
	idx = item[0]
	title_from_idx = cleaned_anime_data[cleaned_anime_data.index == idx]['Name'].values
	if (i<30):
		print(title_from_idx)
		i += 1

Enter your anime :bleach


Closest Match to your input :


...Bleach 


['Bleach']
['Grisaia: Phantom Trigger - Stargazer']
['Recovery of an MMO Junkie']
["That Time I Got Reincarnated as a Slime Season 2: Veldora's Journal"]
['Nogizaka Haruka no Himitsu: Purezza']
['Mole Zhuang Yuan II: Hai Yao Bao Cang']
['Naruto Movie 1: Ninja Clash in the Land of Snow']
['Fafner: The Beyond']
['D-Frag! OVA']
['Megalo Box 2: Nomad']
['Welcome to the Ballroom']
['Beautiful Bones: Sakurako’s Investigation']
['Haiyore! Nyaruko-san W']
['Hana no Zundamaru: Junk']
['Showtime! Uta no Onee-san Datte Shitai 2nd Season']
['Utopia']
['Dozens of Norths']
['Yamishibai: Japanese Ghost Stories 9th Season - The Old Well']
['Hanaori']
['Saiyuki OVA']
['Ore no Sora Keiji-hen']
['Selector Destructed WIXOSS']
['Sing "Yesterday" for Me']
['Garo The Animation']
['Death March to the Parallel World Rhapsody']
['Taiman Blues: Shimizu Naoto-hen']
['Kimagure Orange Road: I Want to Return to That Day']
['Re:ZERO -Starting Lif