In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the two source tables from the current working directory.
#   anime  - title catalogue (anime_id, name, genre, type, episodes, rating, members)
#   rating - user feedback records (user_id, anime_id, rating)
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [3]:
# Report (rows, columns) for each table: catalogue first, then ratings.
for frame in (anime, rating):
    print(frame.shape)

(12294, 7)
(7813737, 3)


In [4]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
rating['user_id'].nunique()

73515

In [7]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [8]:
rating.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [9]:
# Make 'rating' numeric (values that cannot be parsed become NaN), discard the
# titles left without a rating, and renumber the remaining rows from 0.
anime = (
    anime
    .assign(rating=pd.to_numeric(anime['rating'], errors='coerce'))
    .dropna(subset=['rating'])
    .reset_index(drop=True)
)

In [10]:
anime['rating'].shape

(12064,)

In [11]:
# Normalise 'genre': missing -> "", lower-case everything, then turn the
# comma-separated string into a list of whitespace-stripped genre names.
# An empty string maps to an empty list.
anime['genre'] = (
    anime['genre']
    .fillna("")
    .str.lower()
    .apply(lambda text: [] if not text else [piece.strip() for piece in text.split(',')])
)

In [12]:
anime['genre'].head()

0               [drama, romance, school, supernatural]
1    [action, adventure, drama, fantasy, magic, mil...
2    [action, comedy, historical, parody, samurai, ...
3                                   [sci-fi, thriller]
4    [action, comedy, historical, parody, samurai, ...
Name: genre, dtype: object

In [13]:
anime['type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

In [14]:
# Label missing media types as "Unknown", then lower-case and trim every value.
anime['type'] = (
    anime['type']
    .fillna("Unknown")
    .str.lower()
    .str.strip()
)

In [15]:
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

# Popularity-based Recommender System

 - We simply sort the anime by average rating and display the top 100 titles that have at least 250 rating records for our popularity-based system.
 - We have cleaned the anime and rating datasets; after merging them, we can apply our algorithm.

In [16]:
# Inner-join the catalogue with the user records on anime_id. Both tables carry a
# 'rating' column, so pandas suffixes them as rating_x (catalogue) and rating_y (user).
merge_df = pd.merge(anime, rating, on='anime_id')

In [17]:
merge_df.shape

(7813721, 9)

In [18]:
merge_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating_x,members,user_id,rating_y
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,99,5
1,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,152,10
2,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,244,10
3,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,271,10
4,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,278,-1


In [19]:
# Give the two suffixed rating columns meaningful names:
# rating_x came from the catalogue, rating_y from the per-user records.
merge_df = merge_df.rename(
    {'rating_x': 'avg_rating', 'rating_y': 'user_rating'},
    axis='columns',
)

In [20]:
# Number of rating records per title.
# Select the column *before* counting so pandas aggregates one column instead of
# every column of the multi-million-row merged frame; the result is the same.
# NOTE(review): the count includes rows whose user_rating is -1 (presumably
# "no score given" - the collaborative section filters those out). Confirm that
# counting them towards popularity is intended.
temp_df = (
    merge_df
    .groupby('name')['members']
    .count()
    .reset_index(name='num_rating')
)
temp_df

Unnamed: 0,name,num_rating
0,&quot;0&quot;,26
1,&quot;Aesop&quot; no Ohanashi yori: Ushi to Ka...,2
2,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hat...,782
3,&quot;Bungaku Shoujo&quot; Memoire,809
4,&quot;Bungaku Shoujo&quot; Movie,1535
...,...,...
11188,xxxHOLiC Kei,3413
11189,xxxHOLiC Movie: Manatsu no Yoru no Yume,2365
11190,xxxHOLiC Rou,1513
11191,xxxHOLiC Shunmuki,1974


In [21]:
# Collapse the merged frame back to one row per title: remove the per-user
# columns, then keep the first row encountered for each name.
avg_df = (
    merge_df
    .drop(columns=['members', 'user_id', 'user_rating'])
    .drop_duplicates(subset=['name'])
)
avg_df

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37
2199,5114,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,64,9.26
26773,28977,Gintama°,"[action, comedy, historical, parody, samurai, ...",tv,51,9.25
28159,9253,Steins;Gate,"[sci-fi, thriller]",tv,24,9.17
47442,9969,Gintama&#039;,"[action, comedy, historical, parody, samurai, ...",tv,51,9.16
...,...,...,...,...,...,...
7813704,9316,Toushindai My Lover: Minami tai Mecha-Minami,[hentai],ova,1,4.15
7813708,5543,Under World,[hentai],ova,1,4.28
7813712,5621,Violence Gekiga David no Hoshi,[hentai],ova,4,4.88
7813715,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[hentai],ova,1,4.98


In [22]:
# Attach the per-title record count (num_rating) to the per-title frame.
avg_df = pd.merge(avg_df, temp_df, on='name')
avg_df

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,num_rating
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,2199
1,5114,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,64,9.26,24574
2,28977,Gintama°,"[action, comedy, historical, parody, samurai, ...",tv,51,9.25,1386
3,9253,Steins;Gate,"[sci-fi, thriller]",tv,24,9.17,19283
4,9969,Gintama&#039;,"[action, comedy, historical, parody, samurai, ...",tv,51,9.16,3673
...,...,...,...,...,...,...,...
11188,9316,Toushindai My Lover: Minami tai Mecha-Minami,[hentai],ova,1,4.15,4
11189,5543,Under World,[hentai],ova,1,4.28,4
11190,5621,Violence Gekiga David no Hoshi,[hentai],ova,4,4.88,3
11191,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,[hentai],ova,1,4.98,4


In [23]:
# Popularity list: titles with at least 250 rating records, ordered by
# average rating (highest first), truncated to the top 100.
popular_df = (
    avg_df
    .loc[avg_df['num_rating'].ge(250)]
    .sort_values('avg_rating', ascending=False)
    .head(100)
)

In [24]:
popular_df

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,num_rating
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,2199
1,5114,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,64,9.26,24574
2,28977,Gintama°,"[action, comedy, historical, parody, samurai, ...",tv,51,9.25,1386
3,9253,Steins;Gate,"[sci-fi, thriller]",tv,24,9.17,19283
4,9969,Gintama&#039;,"[action, comedy, historical, parody, samurai, ...",tv,51,9.16,3673
...,...,...,...,...,...,...,...
97,18115,Magi: The Kingdom of Magic,"[action, adventure, fantasy, magic, shounen]",tv,25,8.50,7279
96,30230,Diamond no Ace: Second Season,"[comedy, school, shounen, sports]",tv,51,8.50,868
94,13601,Psycho-Pass,"[action, police, psychological, sci-fi]",tv,22,8.50,14008
102,6594,Katanagatari,"[action, adventure, historical, martial arts, ...",tv,12,8.49,5317


## Collaborative Filtering Recommender System

In [32]:
# Keep only records with a real score: rows whose rating is -1 are excluded.
# The surviving rows are renumbered from 0 and the old index is discarded.
rating_df = rating.loc[rating['rating'] != -1].reset_index(drop=True)
rating_df

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10
...,...,...,...
6337236,73515,16512,7
6337237,73515,17187,9
6337238,73515,22145,10
6337239,73516,790,9


#### We keep only active users, i.e. users who have given at least a threshold number of ratings.

In [39]:
# threshold = 250 number of ratings
# `x` is a boolean Series indexed by user_id: True where the user has >= 250 records.
# NOTE(review): the count is taken on `rating`, which still contains the -1 rows,
# not on the filtered `rating_df` - confirm that this is intended.
x = rating.groupby('user_id')['rating'].count().ge(250)
active_user_ids = x.index[x.to_numpy()]

In [44]:
# Restrict the scored records to those submitted by the active users.
rating_info = rating_df.loc[lambda records: records['user_id'].isin(active_user_ids)]

In [72]:
# Renumber the filtered rows from 0, discarding the old index.
rating_info = rating_info.reset_index(drop=True)
rating_info

Unnamed: 0,user_id,anime_id,rating
0,5,6,8
1,5,15,6
2,5,17,6
3,5,18,6
4,5,20,6
...,...,...,...
2762254,73507,8231,5
2762255,73507,8348,5
2762256,73507,8440,7
2762257,73507,8769,8


#### Also, we consider only those anime that have received at least a threshold number of ratings.

In [71]:
# threshold = 50 ratings on an anime
# NOTE(review): like the user threshold, this counts rows of `rating` (including
# the -1 rows), not of the filtered `rating_df` - confirm that this is intended.
anime_rating_counts = rating.groupby('anime_id').size()
popular_anime_ids = anime_rating_counts.index[anime_rating_counts.to_numpy() >= 50]
# Catalogue rows for the titles that reach the threshold.
popular_animes = anime.loc[anime['anime_id'].isin(popular_anime_ids)]
popular_animes

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"[action, adventure, drama, fantasy, magic, mil...",tv,64,9.26,793665
2,28977,Gintama°,"[action, comedy, historical, parody, samurai, ...",tv,51,9.25,114262
3,9253,Steins;Gate,"[sci-fi, thriller]",tv,24,9.17,673572
4,9969,Gintama&#039;,"[action, comedy, historical, parody, samurai, ...",tv,51,9.16,151266
...,...,...,...,...,...,...,...
11987,6546,Pico: My Little Summer Story,[yaoi],ova,1,5.21,5551
11989,5391,Pico to Chico,"[hentai, yaoi]",ova,1,5.19,29463
11992,3635,Advancer Tina,"[hentai, sci-fi, space]",ova,1,5.16,1487
11993,4866,Pico x CoCo x Chico,"[hentai, yaoi]",ova,1,5.16,27411


 - 7591 users have given at least the threshold number of ratings (250).
 - 5651 anime have received at least the threshold number of ratings (50).

In [75]:
# Join the thresholded catalogue with the active users' scored records, then
# rename the suffixed rating columns (rating_x: catalogue, rating_y: user).
anime_df = (
    popular_animes
    .merge(rating_info, on='anime_id')
    .rename(columns={"rating_x": "avg_rating", "rating_y": "user_rating"})
)
anime_df

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,members,user_id,user_rating
0,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,244,10
1,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,271,10
2,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,462,8
3,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,784,9
4,32281,Kimi no Na wa.,"[drama, romance, school, supernatural]",movie,1,9.37,200630,786,10
...,...,...,...,...,...,...,...,...,...
2728349,1639,Boku no Pico,"[hentai, yaoi]",ova,1,5.13,57355,73135,2
2728350,1639,Boku no Pico,"[hentai, yaoi]",ova,1,5.13,57355,73162,10
2728351,1639,Boku no Pico,"[hentai, yaoi]",ova,1,5.13,57355,73328,2
2728352,1639,Boku no Pico,"[hentai, yaoi]",ova,1,5.13,57355,73329,6


In [77]:
# Title x user matrix of scores. pivot_table averages the values when a
# (name, user_id) pair occurs more than once; missing pairs become NaN.
table_df = pd.pivot_table(
    anime_df,
    values="user_rating",
    index="name",
    columns="user_id",
)

In [82]:
# Treat "user did not score this title" as 0 so every row is a dense vector.
table_df = table_df.fillna(0)

In [83]:
table_df

user_id,5,7,17,38,43,46,123,139,160,198,...,73406,73408,73417,73422,73457,73476,73491,73499,73502,73507
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0
.hack//G.U. Returner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Trilogy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,2.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,8.0,0.0,...,10.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,10.0,10.0
xxxHOLiC Kei,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,8.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,9.0
xxxHOLiC Movie: Manatsu no Yoru no Yume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,9.0
xxxHOLiC Rou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


 - `0`: not rated
 - `[1, 10]`: rating given by a user to an anime

In [85]:
from sklearn.metrics.pairwise import cosine_similarity

In [89]:
# Pairwise cosine similarity between title rows of the pivot table.
cos_sim_matrix = cosine_similarity(table_df.to_numpy())
# Label both axes with the title names so similarities can be looked up by name.
cos_sim_df = pd.DataFrame(
    data=cos_sim_matrix,
    index=table_df.index,
    columns=table_df.index,
)

In [92]:
cos_sim_matrix.shape

(5651, 5651)

In [90]:
cos_sim_df

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,1.000000,0.714473,0.600464,0.112421,0.124191,0.096794,0.111659,0.117260,0.121106,0.152381,...,0.287564,0.259599,0.189008,0.127990,0.078058,0.213716,0.194587,0.208980,0.186988,0.207572
&quot;Bungaku Shoujo&quot; Memoire,0.714473,1.000000,0.693933,0.116556,0.129184,0.097077,0.107346,0.119599,0.122896,0.140153,...,0.292886,0.235784,0.180317,0.125370,0.090451,0.209256,0.197884,0.192762,0.199059,0.204893
&quot;Bungaku Shoujo&quot; Movie,0.600464,0.693933,1.000000,0.119488,0.138284,0.105241,0.100445,0.120592,0.121001,0.145927,...,0.356924,0.230178,0.156416,0.123036,0.110009,0.228879,0.211343,0.196500,0.201723,0.205417
.hack//G.U. Returner,0.112421,0.116556,0.119488,1.000000,0.702022,0.607083,0.616287,0.581958,0.555288,0.462214,...,0.183452,0.141754,0.115159,0.069591,0.111401,0.174164,0.173973,0.176642,0.144603,0.157140
.hack//G.U. Trilogy,0.124191,0.129184,0.138284,0.702022,1.000000,0.559520,0.508921,0.486578,0.530734,0.477772,...,0.191803,0.131747,0.097020,0.105485,0.164752,0.203232,0.187575,0.183660,0.144203,0.159959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0.213716,0.209256,0.228879,0.174164,0.203232,0.123275,0.175262,0.201100,0.224763,0.201171,...,0.362995,0.171177,0.111558,0.149044,0.243393,1.000000,0.858157,0.679840,0.631906,0.686436
xxxHOLiC Kei,0.194587,0.197884,0.211343,0.173973,0.187575,0.121789,0.156764,0.185137,0.196142,0.169994,...,0.333315,0.161631,0.108802,0.131600,0.198827,0.858157,1.000000,0.687903,0.720312,0.784527
xxxHOLiC Movie: Manatsu no Yoru no Yume,0.208980,0.192762,0.196500,0.176642,0.183660,0.128790,0.183311,0.207260,0.195678,0.172197,...,0.278948,0.178285,0.125436,0.122538,0.177028,0.679840,0.687903,1.000000,0.615022,0.684437
xxxHOLiC Rou,0.186988,0.199059,0.201723,0.144603,0.144203,0.109326,0.112494,0.141131,0.119387,0.151276,...,0.242704,0.158698,0.107981,0.079149,0.120910,0.631906,0.720312,0.615022,1.000000,0.825829


In [98]:
from difflib import get_close_matches

def recommend(anime_name, top_n=10, sim_df=None):
    """Return the titles most similar to ``anime_name``.

    Parameters
    ----------
    anime_name : str
        Title to look up. An exact match is tried first, then a
        case-insensitive exact match, then the closest fuzzy match
        (``difflib.get_close_matches`` on case-folded titles).
    top_n : int, default 10
        Maximum number of recommendations to return.
    sim_df : pandas.DataFrame, optional
        Square similarity matrix whose index and columns are titles.
        Defaults to the notebook-level ``cos_sim_df``.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns 'Recommended Anime' and 'Similarity Score',
        ordered by decreasing similarity; or a "not found" message string
        when no title matches.
    """
    matrix = cos_sim_df if sim_df is None else sim_df

    if anime_name not in matrix.index:
        # Map case-folded title -> original title (first occurrence wins) so that
        # "death note" resolves to "Death Note" without relying on fuzzy matching.
        by_folded = {}
        for title in matrix.index:
            by_folded.setdefault(str(title).casefold(), title)

        wanted = str(anime_name).strip().casefold()
        if wanted in by_folded:
            anime_name = by_folded[wanted]
        else:
            close_match = get_close_matches(wanted, list(by_folded), n=1)
            if not close_match:
                return f"'{anime_name}' not found in anime list."
            anime_name = by_folded[close_match[0]]

    # Remove the query title by label rather than assuming it sorts first:
    # another title tied at similarity 1.0 could otherwise push the query
    # itself into the recommendations.
    similar_animes = (
        matrix[anime_name]
        .drop(labels=anime_name)
        .sort_values(ascending=False)
        .head(max(int(top_n), 0))
    )
    return pd.DataFrame({
        'Recommended Anime': similar_animes.index,
        'Similarity Score': similar_animes.values
    })


In [103]:
recommend("death note", 20)

Unnamed: 0,Recommended Anime,Similarity Score
0,Code Geass: Hangyaku no Lelouch,0.808323
1,Code Geass: Hangyaku no Lelouch R2,0.792399
2,Elfen Lied,0.762357
3,Fullmetal Alchemist: Brotherhood,0.758946
4,Angel Beats!,0.754108
5,Shingeki no Kyojin,0.751726
6,Toradora!,0.743203
7,Sword Art Online,0.73373
8,Fullmetal Alchemist,0.732552
9,Steins;Gate,0.728813


In [104]:
import pickle

In [106]:
# Persist the title x user matrix for reuse outside the notebook.
# NOTE(review): its index still holds the raw, HTML-escaped titles
# (e.g. "&quot;...&quot;"), unlike the unescaped names written to the CSV later.
with open('pivot_table.pkl', mode='wb') as pivot_file:
    pickle.dump(table_df, pivot_file)

In [107]:
# Persist the title x title similarity matrix for reuse outside the notebook.
# NOTE(review): its labels are the raw, HTML-escaped titles, unlike the
# unescaped names written to the CSV later - lookups must account for that.
with open('cos_sim_df.pkl', mode='wb') as similarity_file:
    pickle.dump(cos_sim_df, similarity_file)

In [108]:
import html

# Decode HTML entities (e.g. "&#039;" -> "'") in the popularity list before export.
# - 'genre' holds lists of strings, so each element is unescaped individually;
#   calling html.unescape on the list itself returns the list untouched.
# - assign() builds a new frame instead of writing columns into a frame derived
#   from filtering/slicing avg_df, which may raise SettingWithCopyWarning.
popular_df = popular_df.assign(
    name=popular_df['name'].map(html.unescape),
    genre=popular_df['genre'].map(lambda genres: [html.unescape(g) for g in genres]),
)

# Save again after cleaning
popular_df.to_csv("top_100_df.csv", index=False)