In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

# Data Preprocessing:


In [2]:
dt=pd.read_csv('anime.csv')
dt

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [3]:
len(dt.anime_id.unique())

12294

In [4]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
dt['rating'] = pd.to_numeric(dt['rating'], errors='coerce')
dt['episodes'] = pd.to_numeric(dt['episodes'], errors='coerce')

In [6]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  11954 non-null  float64
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 672.5+ KB


In [7]:
dt.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
members       0
dtype: int64

In [8]:
dt=dt.dropna(subset=['rating','members','episodes'])


In [9]:
dt.isnull().sum()

anime_id     0
name         0
genre       46
type         0
episodes     0
rating       0
members      0
dtype: int64

In [10]:
dt.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,11876.0,11876.0,11876.0,11876.0
mean,13470.844055,12.447289,6.480333,18441.46
std,11140.893582,47.011062,1.021995,55275.48
min,1.0,1.0,1.67,12.0
25%,3351.5,1.0,5.89,229.0
50%,9867.0,2.0,6.57,1572.5
75%,23407.5,12.0,7.1825,9766.25
max,34519.0,1818.0,10.0,1013917.0


In [11]:
print(dt['genre'].unique())
print(dt['type'].unique())

['Drama, Romance, School, Supernatural'
 'Action, Adventure, Drama, Fantasy, Magic, Military, Shounen'
 'Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen' ...
 'Action, Comedy, Hentai, Romance, Supernatural' 'Hentai, Sports'
 'Hentai, Slice of Life']
['Movie' 'TV' 'OVA' 'Special' 'Music' 'ONA']


# Feature Extraction:


In [12]:
df1 = pd.get_dummies(dt[["genre","type"]])

In [13]:
df2 = dt.drop(["genre","type",'name'], axis =1)

In [14]:
# Combine the numerical features with the one-hot encoded genres
features = pd.concat([df2, df1], axis=1)
features


Unnamed: 0,anime_id,episodes,rating,members,genre_Action,"genre_Action, Adventure","genre_Action, Adventure, Cars, Comedy, Sci-Fi, Shounen","genre_Action, Adventure, Cars, Mecha, Sci-Fi, Shounen, Sports","genre_Action, Adventure, Cars, Sci-Fi","genre_Action, Adventure, Comedy",...,genre_Supernatural,genre_Thriller,genre_Vampire,genre_Yaoi,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,32281,1.0,9.37,200630,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,5114,64.0,9.26,793665,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,28977,51.0,9.25,114262,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,9253,24.0,9.17,673572,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,9969,51.0,9.16,151266,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12289,9316,1.0,4.15,211,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12290,5543,1.0,4.28,183,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12291,5621,4.0,4.88,219,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
12292,6133,1.0,4.98,175,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
features[['rating', 'members']] = features[['rating', 'members']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# Recommendation System:


In [16]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(features)

In [17]:
def recommend_anime(anime_id, dt, cosine_sim, threshold=0.5, top_n=10):
   idx=dt.index[dt['anime_id']==anime_id].tolist()[0]

   sim_scores=list(enumerate(cosine_sim[idx]))
   sim_scores =[scores for scores in sim_scores if scores[1]>=threshold]
    
  # Sort the anime based on similarity scores in descending order
   sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 
   # Get the indices of the top_n similar anime
   sim_scores = sim_scores[1:top_n+1]  # Excluding the target anime itself
   anime_indices = [i[0] for i in sim_scores]
   return dt.iloc[anime_indices] 

In [18]:
recommended_anime = recommend_anime(anime_id=5114,dt=dt, cosine_sim=cosine_sim, threshold=0.5, top_n=5)
print(recommended_anime)

      anime_id                             name  \
8227      5478                  Bikkuriman 2000   
8640      4470                       Gene Diver   
5079      4154  Time Bokan Series: Yattodetaman   
3503      4086               Muka Muka Paradise   
8721      8195      Guru Guru Town Hanamaru-kun   

                                          genre type  episodes  rating  \
8227                   Adventure, Fantasy, Kids   TV      68.0    6.10   
8640                 Action, Adventure, Shounen   TV      56.0    6.30   
5079  Action, Adventure, Comedy, Fantasy, Mecha   TV      52.0    6.56   
3503                          Adventure, Comedy   TV      51.0    6.97   
8721                                       Kids   TV     101.0    6.53   

      members  
8227      153  
8640      182  
5079     1012  
3503     1165  
8721      157  


In [19]:
threshold_values = [0.1, 0.6, 0.7, 0.8, 0.9]

for threshold in threshold_values:
    print(f"Recommendations for threshold {threshold}:")
    recommended_anime = recommend_anime(anime_id=5478, dt=dt, cosine_sim=cosine_sim, threshold=threshold, top_n=5)
    print(recommended_anime[['anime_id', 'name']])
    print()

Recommendations for threshold 0.1:
       anime_id                                 name
10885     12759         Zoku Zoku Mura no Obaketachi
510       11813  Shijou Saikyou no Deshi Kenichi OVA
9735      22325                       Ningen Kakumei
9978      31078                     PikkaPika Summer
1653      25731  Cross Ange: Tenshi to Ryuu no Rondo

Recommendations for threshold 0.6:
       anime_id                                 name
10885     12759         Zoku Zoku Mura no Obaketachi
510       11813  Shijou Saikyou no Deshi Kenichi OVA
9735      22325                       Ningen Kakumei
9978      31078                     PikkaPika Summer
1653      25731  Cross Ange: Tenshi to Ryuu no Rondo

Recommendations for threshold 0.7:
       anime_id                                 name
10885     12759         Zoku Zoku Mura no Obaketachi
510       11813  Shijou Saikyou no Deshi Kenichi OVA
9735      22325                       Ningen Kakumei
9978      31078                     PikkaPika

In [22]:
train_data, test_data = train_test_split(dt, test_size=0.2, random_state=42)


# Evaluation:

In [29]:
def evaluate_recommendations(test_data, train_data, cosine_sim, top_n=10):
    y_true = []
    y_pred = []
    
    for idx, row in test_data.iterrows():
        anime_name = row['name']
        if anime_name in train_data['name'].values:
            recommended_anime = recommend_anime(anime_name, train_data, cosine_sim, top_n)
            true_anime = test_data[test_data['name'] == anime_name]['genre'].values[0].split(', ')
            for genre in true_anime:
                y_true.append(genre)
                y_pred.append(genre in recommended_anime.values)
    
    precision = precision_score(y_true, y_pred, average='weighted',zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted',zero_division=1)
    f1 = f1_score(y_true, y_pred, average='weighted',zero_division=1)
    return precision, recall, f1

precision, recall, f1 = evaluate_recommendations(test_data, train_data, cosine_sim)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Precision: 1.0
Recall: 1.0
F1-Score: 1.0


# Interview Questions: