In [1]:
#!pip install gensim

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the anime metadata table and the user-rating table from the Kaggle
# input directory, in that order.
anime_df, rating_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/anime-recommendations-database/anime.csv',
        '/kaggle/input/anime-recommendations-database/rating.csv',
    )
)

In [4]:
# Preview the first five rows of the anime metadata table.
anime_df.head(n=5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# Preview the first five rows of the user-rating table.
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


#### A rating of -1 is a placeholder rather than a real score, so filter those rows out first

In [6]:
# Keep only the rows whose rating differs from the -1 sentinel, then preview.
is_rated = rating_df["rating"].ne(-1)
rating_df = rating_df.loc[is_rated]
rating_df.head(n=5)

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10


#### Merging the ratings with the anime metadata on `anime_id`

In [7]:
# Inner join on anime_id. Both inputs have a "rating" column, so pandas
# auto-suffixes them: rating_x comes from rating_df (left side) and
# rating_y comes from anime_df (right side).
df = rating_df.merge(anime_df, on="anime_id")

In [8]:
# Preview the first five rows of the merged frame.
df.head(n=5)

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


In [9]:
# Print the merged frame's index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6337239 entries, 0 to 6337238
Data columns (total 9 columns):
 #   Column    Dtype  
---  ------    -----  
 0   user_id   int64  
 1   anime_id  int64  
 2   rating_x  int64  
 3   name      object 
 4   genre     object 
 5   type      object 
 6   episodes  object 
 7   rating_y  float64
 8   members   int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 435.1+ MB


#### Checking for missing (NaN) values and dropping the rows that have no genre

In [10]:
# Count the missing values in every column of the merged frame.
df.isnull().sum()

user_id      0
anime_id     0
rating_x     0
name         0
genre       88
type         4
episodes     0
rating_y     5
members      0
dtype: int64

In [11]:
# Discard rows that have no genre; missing values in other columns are
# left untouched. Then re-count the missing values per column.
has_genre = df["genre"].notna()
df = df[has_genre]
df.isnull().sum()

user_id     0
anime_id    0
rating_x    0
name        0
genre       0
type        4
episodes    0
rating_y    5
members     0
dtype: int64

#### Building a recommender system based on genres

#### tokenizing the genre column

In [12]:
def _split_genres(genre_text):
    """Split a ", "-separated genre string into a list of genre tokens."""
    return genre_text.split(", ")


# One token list per row, stored next to the original genre string.
df["genre_tokens"] = df["genre"].map(_split_genres)

In [13]:
# Word2Vec expects an iterable of token lists ("sentences").
genre_sentences = list(df["genre_tokens"])

### training the word2vec model

In [14]:
from gensim.models import Word2Vec

# Fit genre embeddings: each row's genre list is treated as one sentence.
# NOTE(review): genre_sentences holds one entry per row of the merged frame,
# so a title's genre list is repeated once per rating it received -- confirm
# that this popularity weighting is intended.
w2v = Word2Vec(
    sentences=genre_sentences,
    vector_size=100,  # embedding dimensionality
    window=5,         # context window, in tokens
    min_count=1,      # keep every genre, however rare
    workers=4,        # training threads
)

#### gensim ships built-in helpers such as `most_similar`, which make similarity lookups easy

In [15]:
# Ask the genre model for the five tokens most similar to "Action";
# the result is a list of (genre, similarity score) pairs.
similar_genres = w2v.wv.most_similar(positive="Action", topn=5)
print("Genres similar to 'Action':", similar_genres)

Genres similar to 'Action': [('Kids', 0.26868152618408203), ('Shounen', 0.19272707402706146), ('Romance', 0.15925388038158417), ('Music', 0.13184012472629547), ('Space', 0.10436685383319855)]


#### recommending anime names based on their genres

In [16]:
def recommend_anime_by_genre(genre, topn=5, model=None, data=None):
    """Recommend anime whose genre list contains a genre similar to ``genre``.

    The ``topn`` genres closest to ``genre`` in the embedding model are looked
    up, and every distinct (name, genre) pair that carries at least one of
    them is returned. The query genre itself is not part of the match set
    unless the model returns it.

    Parameters
    ----------
    genre : str
        Genre token to look up in the model's vocabulary.
    topn : int, default 5
        How many similar genres to request from the model.
    model : optional
        Object exposing ``.wv.most_similar(key, topn=...)``. Defaults to the
        notebook-level ``w2v`` model.
    data : pandas.DataFrame, optional
        Frame with ``name`` and ``genre`` columns, where ``genre`` is a
        ", "-separated string. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``genre``, one row per distinct pair, keeping the
        index label of each pair's first occurrence in ``data``.

    Raises
    ------
    Whatever ``most_similar`` raises for a genre that is not in the model's
    vocabulary (gensim raises ``KeyError``) is propagated unchanged.
    """
    # Fall back to the notebook globals only when nothing was passed in, so
    # existing calls of the form recommend_anime_by_genre("Ecchi") still work.
    model = w2v if model is None else model
    data = df if data is None else data

    similar_genres = {name for name, _score in model.wv.most_similar(genre, topn=topn)}

    # De-duplicate BEFORE matching: the merged frame repeats each title once
    # per rating, so matching first would scan every rating row.
    catalog = data[["name", "genre"]].drop_duplicates()

    def _has_similar_genre(genre_text):
        # Compare whole genre tokens. A regex/substring test would let
        # "Shounen" match "Shounen Ai" and would misread regex metacharacters.
        if not isinstance(genre_text, str):
            return False
        return not similar_genres.isdisjoint(genre_text.split(", "))

    # astype(bool) keeps the mask a boolean indexer even when catalog is empty.
    mask = catalog["genre"].map(_has_similar_genre).astype(bool)
    return catalog[mask]

In [17]:
# Fetch the titles carrying a genre close to "Ecchi" in the embedding space
# and display the resulting frame.
recommendations = recommend_anime_by_genre(genre="Ecchi")
recommendations

Unnamed: 0,name,genre
19488,High School DxD,"Comedy, Demons, Ecchi, Harem, Romance, School"
31628,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance"
57938,High School DxD New,"Action, Comedy, Demons, Ecchi, Harem, Romance,..."
65665,Kuroko no Basket,"Comedy, School, Shounen, Sports"
103692,Slam Dunk,"Comedy, Drama, School, Shounen, Sports"
...,...,...
6337225,Asari-chan: Ai no Marchen Shoujo,"Adventure, Shoujo, Slice of Life"
6337226,Shiratori Reiko de Gozaimasu!,"Comedy, Romance"
6337227,Cooking Papa,"Comedy, Slice of Life"
6337234,Haha wo Tazunete Sanzenri Specials,"Adventure, Drama, Slice of Life"


#### Doing the same for anime names

#### see so many animes :)

In [18]:
# Distinct titles in order of first appearance; show the first 69 of them.
anime_names = pd.unique(df["name"])
anime_names[:69]

array(['Highschool of the Dead', 'High School DxD', 'Sword Art Online',
       'High School DxD New', 'Kuroko no Basket', 'Naruto', 'Shaman King',
       'Slam Dunk', 'Sen to Chihiro no Kamikakushi', 'Dragon Ball GT',
       'Spiral: Suiri no Kizuna',
       'Fullmetal Alchemist: The Conqueror of Shamballa', 'Pokemon',
       'Digimon Adventure', 'Dragon Ball Z',
       'Pokemon: Celebi Toki wo Koeta Deai',
       'Pokemon Advanced Generation: Nanayo no Negaiboshi Jirachi',
       'Pokemon Advanced Generation: Rekkuu no Houmonsha Deoxys',
       'Digimon Frontier', 'Afro Samurai', 'Digimon Adventure 02',
       'Pokemon Advanced Generation: Mew to Hadou no Yuusha Lucario',
       'Death Note', 'Pokemon Advanced Generation',
       'Byousoku 5 Centimeter', 'Slam Dunk (Movie)', 'Paprika',
       'Pokemon Advanced Generation: Pokemon Ranger to Umi no Ouji Manaphy',
       'Zombie-Loan',
       'Pokemon Diamond &amp; Pearl: Dialga vs. Palkia vs. Darkrai',
       'Soul Eater',
       'Pokem

In [19]:
import html

# Titles in this dataset contain HTML-escaped characters (e.g. "&amp;",
# "&#039;"). Decode them before splitting on whitespace so the vocabulary
# does not pick up junk tokens such as "&amp;" or "Gintama&#039;".
df["name_tokens"] = df["name"].map(lambda title: html.unescape(title).split())

# Word2Vec expects an iterable of token lists ("sentences").
name_sentences = df["name_tokens"].tolist()

#### Now training a Word2Vec model on the name tokens

In [20]:
# Fit a second embedding model, this time over the whitespace-split title
# tokens, with the same hyperparameters as the genre model.
name_w2v = Word2Vec(
    sentences=name_sentences,
    vector_size=100,  # embedding dimensionality
    window=5,         # context window, in tokens
    min_count=1,      # keep every token, however rare
    workers=4,        # training threads
)

In [23]:
# The lookup key and the printed label come from one variable so they cannot
# drift apart (the label previously said 'Naruto' while the query was
# "Pokemon"). The model was trained on title tokens, so the results are
# individual words from titles, not whole anime names.
query_token = "Pokemon"
similar_anime = name_w2v.wv.most_similar(query_token, topn=5)
print(f"Anime similar to '{query_token}':", similar_anime)

Anime similar to 'Naruto': [('Giratina', 0.5691414475440979), ('Zekrom', 0.5569822192192078), ('Wishes!:', 0.5281127691268921), ('Victini', 0.5249943137168884), ('Vegeta', 0.5228556394577026)]
