In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Load dataset
df = pd.read_csv('anime.csv')

In [3]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [5]:
df.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [6]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [7]:
# Handle Missing Values
df['type'] = df['type'].fillna('Unknown')
df.dropna(subset=['anime_id', 'type', 'rating'], inplace=True)
df['genre'] = df['genre'].fillna('Unknown')

In [8]:
# Reset index to align with cosine_sim
df.reset_index(drop=True, inplace=True)

In [9]:
# Process Genres (One-Hot Encoding)
df['genre'] = df['genre'].str.split(', ')
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df['genre'])

In [10]:
# One-Hot Encode Anime Type
type_encoded = pd.get_dummies(df['type'], prefix='type')

In [11]:
# Scale Numerical Features
scaler = MinMaxScaler()
df['rating_scaled'] = scaler.fit_transform(df[['rating']])
df['community_members'] = scaler.fit_transform(df[['members']])

In [12]:
# Combine Features for Similarity Calculation
features = np.hstack((
    genres_encoded,
    type_encoded.values,
    df[['rating_scaled', 'community_members']].values
))

In [13]:
# Compute Cosine Similarity
cosine_sim = cosine_similarity(features)

In [14]:
# Function to Recommend Anime Based on Cosine Similarity
def recommend_anime(target_anime_id, cosine_sim, top_n=5):
    if target_anime_id not in df['anime_id'].values:
        return []
    
    target_idx = df.index[df['anime_id'] == target_anime_id].tolist()
    
    if not target_idx:
        return []
    
    target_idx = target_idx[0]
    
    sim_scores = list(enumerate(cosine_sim[target_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    top_indices = [i[0] for i in sim_scores[1:top_n+1]]
    
    return df.iloc[top_indices][['anime_id', 'name', 'type', 'rating', 'genre']]

In [15]:
# Split Dataset for Evaluation
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [16]:
# Generate Predictions
y_true = test['anime_id']
y_pred = [recommend_anime(anime_id, cosine_sim) for anime_id in y_true]

In [17]:
# Convert y_true and y_pred into a comparable format
y_true_set = [{anime_id} for anime_id in y_true]
y_pred_set = [set(pred['anime_id']) if isinstance(pred, pd.DataFrame) else set() for pred in y_pred]

In [18]:
# Convert to binary labels for evaluation (1 = relevant recommendation, 0 = not relevant)
y_true_binary = []
y_pred_binary = []

for true, pred in zip(y_true_set, y_pred_set):
    y_true_binary.append(1 if true.intersection(pred) else 0)
    y_pred_binary.append(1 if len(pred) > 0 else 0)

In [19]:
# Calculate Precision, Recall, and F1-score
precision = precision_score(y_true_binary, y_pred_binary, zero_division=1)
recall = recall_score(y_true_binary, y_pred_binary, zero_division=1)
f1 = f1_score(y_true_binary, y_pred_binary, zero_division=1)

In [20]:
# Display Evaluation Results
print("\nEvaluation Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Evaluation Metrics:
Precision: 0.0012
Recall: 1.0000
F1 Score: 0.0025


## Interview Questions:

### 1. Difference Between User-Based and Item-Based Collaborative Filtering**  
- User-Based Collaborative Filtering: Recommends items based on the preferences of users who have similar tastes. It finds users with similar behavior and suggests items that those similar users liked.  
- Item-Based Collaborative Filtering: Recommends items based on similarity between items rather than users. It finds relationships between items based on user interactions and suggests items that are frequently liked together.  


### 2. What is Collaborative Filtering and How Does It Work?**  
Collaborative filtering is a recommendation technique that predicts a user’s interests based on past behaviors and preferences of similar users.  
- Working:  
  - It collects user-item interactions (e.g., ratings, purchases).  
  - Identifies similarities either between **users** (user-based) or **items** (item-based).  
  - Generates personalized recommendations by suggesting items liked by similar users or similar items to those previously interacted with.  
