In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Read the anime catalogue from the CSV file in the working directory
df = pd.read_csv('anime.csv')

# Preview the top five records
df.head(5)


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
# Summarise column dtypes and non-null counts
df.info()

# Count the missing entries in each column
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [4]:
# Missing genres become an empty string so later string handling does not hit NaN
df['genre'] = df['genre'].fillna('')

# Missing ratings take the mean of the observed ratings
df['rating'] = df['rating'].fillna(df['rating'].mean())

# 'Unknown' episode counts are turned into NaN, the column is cast to float,
# and the remaining gaps are filled with the column median
df['episodes'] = (
    df['episodes']
    .replace('Unknown', np.nan)
    .astype(float)
    .pipe(lambda counts: counts.fillna(counts.median()))
)


In [5]:
# Turn the comma-separated genre string into a list of genre labels.
# A plain ''.split(', ') returns [''], which would make MultiLabelBinarizer
# learn an empty-string "genre" shared by every title without genre data, so
# empty tokens are dropped. Tokens are also stripped so that inconsistent
# spacing around commas cannot create duplicate labels, and non-string values
# (e.g. NaN) map to an empty list.
df['genre_list'] = df['genre'].apply(
    lambda x: [g.strip() for g in x.split(',') if g.strip()]
    if isinstance(x, str) else []
)

# One binary indicator column per distinct genre label
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df['genre_list'])

genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)


In [6]:
# Numeric columns that accompany the genre indicators
num_features = df.loc[:, ['rating', 'episodes', 'members']]

# Fit the scaler on these columns and transform them in one step
scaler = StandardScaler()
num_scaled = scaler.fit_transform(num_features)

# Wrap the scaled array back into a frame with the original column names
num_df = pd.DataFrame(data=num_scaled, columns=num_features.columns)


In [7]:
# Place the genre indicator columns and the scaled numeric columns side by side
final_features = pd.concat((genre_df, num_df), axis='columns')


In [8]:
# Pairwise cosine similarity between all rows of the feature matrix
cosine_sim = cosine_similarity(X=final_features)


In [9]:
def recommend_anime(anime_title, top_n=5, similarity_threshold=0.3):
    """Recommend the titles most similar to ``anime_title``.

    Reads the module-level ``df`` (for the ``name`` column) and ``cosine_sim``
    (row ``i`` holds the similarities of the ``i``-th row of ``df``).

    Parameters
    ----------
    anime_title : str
        Exact value to look up in ``df['name']``. If the name occurs more
        than once, the first occurrence is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return. Values <= 0 yield an
        empty result.
    similarity_threshold : float, default 0.3
        Minimum similarity a title must reach to be recommended.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns ``'Anime Name'`` and ``'Similarity Score'``,
        ordered from most to least similar, or the string
        ``"Anime not found in dataset."`` when the title is absent.
    """
    names = df['name'].to_numpy()

    # Work with row positions, because cosine_sim is indexed positionally
    # and index labels need not coincide with positions.
    matches = np.flatnonzero(names == anime_title)
    if matches.size == 0:
        return "Anime not found in dataset."
    query_pos = matches[0]

    scores = np.asarray(cosine_sim[query_pos])

    # Stable descending order: ties keep their original row order.
    ranking = np.argsort(-scores, kind='stable')

    recommendations = []
    for pos in ranking:
        # Checked before appending so that top_n <= 0 returns nothing.
        if len(recommendations) >= top_n:
            break
        # Skip the query explicitly: it is not guaranteed to rank first when
        # another row ties with it at the same similarity.
        if pos == query_pos:
            continue
        score = scores[pos]
        # Scores are descending, so nothing after this point can qualify.
        if not (score >= similarity_threshold):
            break
        recommendations.append((names[pos], score))

    return pd.DataFrame(recommendations, columns=['Anime Name', 'Similarity Score'])


In [10]:
# Example query: up to five titles with similarity of at least 0.35 to "Naruto"
recommend_anime(anime_title="Naruto", top_n=5, similarity_threshold=0.35)


Unnamed: 0,Anime Name,Similarity Score
0,Fairy Tail,0.980505
1,Bleach,0.964356
2,Hunter x Hunter (2011),0.962942
3,D.Gray-man,0.960056
4,Soul Eater,0.944417


Interview Questions

## 1. Difference between user-based and item-based collaborative filtering

User-based and item-based collaborative filtering differ in how they generate recommendations. User-based collaborative filtering focuses on finding users with similar preferences or behavior patterns and then recommends items that those similar users have liked or rated highly. For example, if two users have rated many of the same anime similarly, the system may recommend to one user the anime that the other user enjoyed. In contrast, item-based collaborative filtering focuses on finding similarities between items themselves rather than users. It recommends items that are similar to the ones a user has already liked, based on overall user interaction patterns. Item-based filtering is generally more scalable and stable than user-based filtering because item similarities change less frequently than user preferences.


## 2. What is collaborative filtering, and how does it work?

Collaborative filtering is a recommendation technique that predicts a user’s interests by analyzing the preferences and behavior of multiple users. It works on the assumption that users who had similar tastes in the past will continue to have similar tastes in the future. The system collects user–item interaction data such as ratings, likes, or views, and then identifies patterns within this data. Based on these patterns, the algorithm finds either similar users or similar items and uses this information to make recommendations. Collaborative filtering does not require detailed item descriptions, which makes it effective across many domains, but it may face challenges such as cold-start problems and data sparsity.