In [65]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [66]:
anime_df = pd.read_csv('anime.csv')

In [67]:
anime_df.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


#Handling Missing Values

In [68]:
anime_df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [69]:
anime_df.dropna(subset=['genre', 'rating'], inplace=True)

In [70]:
anime_df.fillna({'episodes': 0}, inplace=True)

#Explore the Dataset

In [71]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [72]:
anime_df.describe()

Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


In [73]:
anime_df['type'].value_counts()


Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
TV,3668
OVA,3284
Movie,2259
Special,1670
ONA,648
Music,488


In [74]:
anime_df['genre_list'] = anime_df['genre'].apply(lambda x: str(x).split(', '))


In [75]:
anime_df['genre_list']

Unnamed: 0,genre_list
0,"[Drama, Romance, School, Supernatural]"
1,"[Action, Adventure, Drama, Fantasy, Magic, Mil..."
2,"[Action, Comedy, Historical, Parody, Samurai, ..."
3,"[Sci-Fi, Thriller]"
4,"[Action, Comedy, Historical, Parody, Samurai, ..."
...,...
12289,[Hentai]
12290,[Hentai]
12291,[Hentai]
12292,[Hentai]


In [76]:
mlb = MultiLabelBinarizer()

In [77]:
mlb

In [78]:
genre_matrix = mlb.fit_transform(anime_df['genre_list'])

In [79]:
genre_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [80]:
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

In [81]:
genre_df

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12015,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Normalize Ratings and Members

In [82]:
scaler = MinMaxScaler()

In [83]:
scaler

In [84]:
anime_df[['rating', 'members']] = scaler.fit_transform(anime_df[['rating', 'members']])


In [85]:
anime_df[['rating', 'members']]

Unnamed: 0,rating,members
0,0.924370,0.197867
1,0.911164,0.782769
2,0.909964,0.112683
3,0.900360,0.664323
4,0.899160,0.149180
...,...,...
12289,0.297719,0.000196
12290,0.313325,0.000169
12291,0.385354,0.000204
12292,0.397359,0.000161


In [86]:
# Combine All Features
features = np.hstack([genre_df.values, anime_df[['rating', 'members']].values])


In [87]:
features

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.24369748e-01, 1.97866664e-01],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.11164466e-01, 7.82768603e-01],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.09963986e-01, 1.12683141e-01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.85354142e-01, 2.04161139e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.97358944e-01, 1.60764569e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.54981993e-01, 1.28217141e-04]])

#Recommendation System


In [88]:
#Compute Cosine Similarity
cos_sim = cosine_similarity(features)

In [89]:
cos_sim

array([[1.        , 0.31070343, 0.13939232, ..., 0.15027101, 0.15431821,
        0.17305982],
       [0.31070343, 1.        , 0.35855886, ..., 0.11280862, 0.11583751,
        0.12989543],
       [0.13939232, 0.35855886, 1.        , ..., 0.11686093, 0.12000966,
        0.13458606],
       ...,
       [0.15027101, 0.11280862, 0.11686093, ..., 1.        , 0.99994581,
        0.99824985],
       [0.15431821, 0.11583751, 0.12000966, ..., 0.99994581, 1.        ,
        0.99881138],
       [0.17305982, 0.12989543, 0.13458606, ..., 0.99824985, 0.99881138,
        1.        ]])

In [90]:
#Recommendation Function
def recommend_anime(title, top_n=5):
    idx = anime_df[anime_df['name'] == title].index[0]
    sim_scores = list(enumerate(cos_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    anime_indices = [i[0] for i in sim_scores]
    return anime_df['name'].iloc[anime_indices]

In [91]:
# Example
recommend_anime("Naruto", top_n=5)


Unnamed: 0,name
615,Naruto: Shippuuden
1472,Naruto: Shippuuden Movie 4 - The Lost Tower
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
486,Boruto: Naruto the Movie
1343,Naruto x UT


#Evaluation

In [92]:
# Assuming a user-anime rating matrix exists
train_data, test_data = train_test_split(anime_df, test_size=0.2, random_state=42)


In [93]:
train_data, test_data

(       anime_id                                               name  \
 909        9201  Air Gear: Kuro no Hane to Nemuri no Mori - Bre...   
 7480      32811                                        Black Ocean   
 496         416                                    Kurenai no Buta   
 9204      28965                     Kibun wa Uaa Jitsuzai OL Kouza   
 6846      31972                                  Tang Lang Bu Chan   
 ...         ...                                                ...   
 12231     13051    Bishoujo Animerama: Miyuki-chan SOS-H Shichauzo   
 5193       5917                   Tsuru ni Notte: Tomoko no Bouken   
 5392       3880                          Makyou Densetsu Acrobunch   
 860       22819                                     Aikatsu! Movie   
 7276       1252            Fushigi no Umi no Nadia: Original Movie   
 
                                                    genre   type episodes  \
 909               Action, Comedy, Ecchi, Shounen, Sports    OVA     

In [94]:
#Evaluation Metrics (Precision, Recall, F1)
# Precision, Recall, and F1-score require actual user test interactions and predictions
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    relevant_recommended = [anime for anime in recommended_k if anime in relevant_set]
    precision = len(relevant_recommended) / k
    return precision

In [104]:
# For our example:
precision_at_k(recommended_animes, user_liked_animes, 5)
# 2 relevant animes out of top 5 -> Precision = 2 / 5 = 0.40


0.4

In [95]:
def recall_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    relevant_recommended = [anime for anime in recommended_k if anime in relevant_set]
    recall = len(relevant_recommended) / len(relevant_set)
    return recall

In [102]:
#For our example:
recall_at_k(recommended_animes, user_liked_animes, 5)
# 2 out of 3 liked animes were recommended -> Recall = 2 / 3 ≈ 0.67


0.6666666666666666

In [96]:
def f1_at_k(recommended, relevant, k):
    precision = precision_at_k(recommended, relevant, k)
    recall = recall_at_k(recommended, relevant, k)
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

In [100]:
# For our example
f1_at_k(recommended_animes, user_liked_animes, 5)
# Using precision = 0.40, recall ≈ 0.67
# F1 ≈ 2 * (0.4 * 0.67) / (0.4 + 0.67) ≈ 0.5


0.5

In [97]:
# print final result

k = 5
precision = precision_at_k(recommended_animes, user_liked_animes, k)

recall = recall_at_k(recommended_animes, user_liked_animes, k)

f1 = f1_at_k(recommended_animes, user_liked_animes, k)

print(f"Precision@{k}: {precision:.2f}")
print(f"Recall@{k}: {recall:.2f}")
print(f"F1@{k}: {f1:.2f}")

Precision@5: 0.40
Recall@5: 0.67
F1@5: 0.50


# Interview Questions:

In [97]:
# Can you explain the difference between user-based and item-based collaborative filtering?

    # user-based collaborative filtering:
      #"Find people similar to you and recommend what they liked."

      #How it works:

        #Identify users who have similar preferences (using cosine similarity, Pearson correlation, etc.).

        #Recommend items that these similar users liked, which the current user hasn’t interacted with yet.


    #item-based collaborative filtering:
     #"Recommend items similar to what you already liked."

     #How it works:

       #Look at the items a user liked.

       #Find other items similar to those (based on users who liked them).

       #Recommend the most similar ones.






In [None]:
#What is collaborative filtering, and how does it work?

  #Collaborative Filtering is a recommendation method that suggests items to users based on the preferences of other users.

  #How it works:

   #It uses a user-item interaction matrix (like ratings or clicks).

   #Finds similar users (user-based) or similar items (item-based).

   #Recommends items that similar users liked or items similar to what the user liked.
