# Task 1: Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# Load the raw anime catalogue; the path is relative to the notebook's working directory
ANIME_CSV_PATH = 'anime.csv'
anime_df = pd.read_csv(ANIME_CSV_PATH)

In [5]:
# (rows, columns) of the freshly loaded table
anime_df.shape

(12294, 7)

In [6]:
# Preview the first rows to see which columns are available and what they hold
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
# Step 2: Handle missing values

In [8]:
# Missing-value policy:
#   - 'name'  : drop the row, because a title is required to look an anime up
#   - 'genre' and 'type': replace missing entries with the placeholder "Unknown"
#   - 'rating': replace missing entries with the column mean
# The index is reset after dropping so that row labels keep matching row
# positions; the similarity matrix built later is addressed by position.
# NOTE(review): mean-imputed ratings are indistinguishable from real ones
# afterwards — confirm this is acceptable wherever 'rating' is displayed.
anime_df = anime_df.dropna(subset=['name']).reset_index(drop=True)
anime_df['genre'] = anime_df['genre'].fillna("Unknown")
anime_df['type'] = anime_df['type'].fillna("Unknown")
anime_df['rating'] = anime_df['rating'].fillna(anime_df['rating'].mean())

In [9]:
# Step 3: Explore dataset structure

In [10]:
# Report (rows, columns) after the missing-value handling above
row_count, column_count = anime_df.shape
print("Dataset shape:", (row_count, column_count))

Dataset shape: (12294, 7)


In [11]:
# List the column names as a plain Python list
column_names = list(anime_df.columns)
print("\nColumns:", column_names)


Columns: ['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members']


In [12]:
# Count the remaining missing entries in each column
missing_per_column = anime_df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)


Missing values per column:
 anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [13]:
# Show the dtype pandas assigned to each column
column_dtypes = anime_df.dtypes
print("\nData types:\n", column_dtypes)


Data types:
 anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object


In [14]:
# Plain-text preview of the first five rows after cleaning
first_rows = anime_df.head(5)
print("\nFirst 5 rows:\n", first_rows)


First 5 rows:
    anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  


# Task 2: Feature Extraction

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler

In [16]:
# --- Step 1: Select features for similarity ---
# Make sure the three columns used as features contain no missing values:
# placeholder genre, mean rating, and zero members.
anime_df = anime_df.fillna({
    'genre': "Unknown",
    'rating': anime_df['rating'].mean(),
    'members': 0,
})

In [17]:
# --- Step 2: Convert categorical 'genre' into a numerical representation ---
# Each genre entry is a ", "-separated list of labels, so CountVectorizer is
# given a tokenizer that splits on that separator; every label becomes a column.
def split_genres(genre_text):
    """Split a ', '-separated genre string into its individual labels."""
    return genre_text.split(', ')


# A named function (rather than a lambda) keeps the fitted vectorizer picklable.
# token_pattern=None states explicitly that the default regex is unused; without
# it scikit-learn warns that token_pattern is ignored when a tokenizer is given.
vectorizer = CountVectorizer(tokenizer=split_genres, token_pattern=None)
genre_matrix = vectorizer.fit_transform(anime_df['genre'])



In [18]:
# --- Step 3: Rescale the numerical features with min-max scaling ---
# Both columns are scaled together so they are comparable in magnitude
numeric_columns = ['rating', 'members']
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(anime_df[numeric_columns])

In [19]:
# Step 4: Combine all features into a single matrix
import numpy as np
from scipy.sparse import hstack

In [20]:
# Place the dense numeric columns to the right of the sparse genre counts
feature_matrix = hstack((genre_matrix, numeric_features))

# Unpack once so both report lines read from the same values
n_rows, n_features = feature_matrix.shape
print("Feature matrix shape:", (n_rows, n_features))
print("Number of features extracted:", n_features)

Feature matrix shape: (12294, 46)
Number of features extracted: 46


# Task 3: Recommendation System

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack

In [22]:
# Pairwise cosine similarity between every pair of rows of the feature matrix.
# With a single argument scikit-learn compares the matrix against itself.
# The result has one row and one column per anime.
cosine_sim = cosine_similarity(feature_matrix)

In [23]:
# Display the similarity matrix (the diagonal is each anime compared with itself)
cosine_sim

array([[1.        , 0.31070403, 0.13939258, ..., 0.15027155, 0.15431875,
        0.17306034],
       [0.31070403, 1.        , 0.35855939, ..., 0.11281034, 0.11583922,
        0.12989711],
       [0.13939258, 0.35855939, 1.        , ..., 0.11686118, 0.12000991,
        0.1345863 ],
       ...,
       [0.15027155, 0.11281034, 0.11686118, ..., 1.        , 0.99994581,
        0.99824985],
       [0.15431875, 0.11583922, 0.12000991, ..., 0.99994581, 1.        ,
        0.99881138],
       [0.17306034, 0.12989711, 0.1345863 , ..., 0.99824985, 0.99881138,
        1.        ]], shape=(12294, 12294))

In [24]:
# Function to recommend anime
def recommend_anime(title, top_n=10, threshold=0.3):
    """Return the anime most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``anime_df['name']``. If several rows share
        the name, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.
    threshold : float, default 0.3
        Minimum similarity score a candidate needs in order to be kept.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the columns 'name', 'genre', 'type', 'rating' and
        'similarity_score', ordered from most to least similar. When the title
        is not present in the dataset, a "not found" message string is
        returned instead.
    """
    # Locate the title by row *position*: cosine_sim is a plain array addressed
    # by position, so an index label would only be correct for a default index.
    matches = np.flatnonzero((anime_df['name'] == title).to_numpy())
    if matches.size == 0:
        return f"Anime '{title}' not found in dataset."
    pos = int(matches[0])

    # Similarity of the requested anime to every row
    scores = np.asarray(cosine_sim[pos]).ravel()

    # Highest score first; a stable sort keeps the original row order for ties
    order = np.argsort(-scores, kind='stable')

    # Drop the anime itself and anything below the threshold, then cap at top_n
    keep = order[(order != pos) & (scores[order] >= threshold)][:top_n]

    # Copy before adding a column so the result is an independent frame and
    # pandas does not raise a SettingWithCopy warning
    recommendations = anime_df[['name', 'genre', 'type', 'rating']].iloc[keep].copy()
    recommendations['similarity_score'] = scores[keep]

    return recommendations

In [25]:
# --- Example usage: five recommendations for a single title ---
print("Recommendations for 'Steins;Gate':\n")
steins_gate_recs = recommend_anime("Steins;Gate", top_n=5, threshold=0.3)
print(steins_gate_recs)

Recommendations for 'Steins;Gate':

                                                    name  \
59            Steins;Gate Movie: Fuka Ryouiki no Déjà vu   
126                Steins;Gate: Oukoubakko no Poriomania   
196    Steins;Gate: Kyoukaimenjou no Missing Link - D...   
10898                                      Steins;Gate 0   
5126                                       Under the Dog   

                          genre     type    rating  similarity_score  
59             Sci-Fi, Thriller    Movie  8.610000          0.965280  
126            Sci-Fi, Thriller  Special  8.460000          0.959832  
196            Sci-Fi, Thriller  Special  8.340000          0.936704  
10898          Sci-Fi, Thriller  Unknown  6.473902          0.928487  
5126   Action, Sci-Fi, Thriller      OVA  6.550000          0.772375  


# Task 4: Evaluation

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [27]:
# Step 1: Train-test split — hold out 20% of the rows, with a fixed seed so the
# same rows are selected on every run
train_df, test_df = train_test_split(
    anime_df,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Step 2: Evaluation function
def evaluate_recommendation_system(test_set, top_n=5, threshold=0.3,
                                   sample_size=100, random_state=42):
    """Score recommendations by genre overlap with the anime they were made for.

    A sample of titles is drawn from ``test_set``; for each one the
    recommendations from ``recommend_anime`` are collected. A recommendation
    counts as relevant when it shares at least one genre with the target.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Frame with a 'name' column to draw target titles from.
    top_n : int, default 5
        Number of recommendations requested per title.
    threshold : float, default 0.3
        Minimum similarity passed on to ``recommend_anime``.
    sample_size : int, default 100
        Upper bound on the number of titles evaluated (kept small for speed).
        Clamped to the size of ``test_set``.
    random_state : int, default 42
        Seed for the title sample.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. All three are 0.0 when nothing could be
        evaluated.

    Notes
    -----
    Every recommended item is a positive prediction and its genre overlap is
    the ground truth, so precision is the share of recommendations that are
    relevant. Only recommended items are scored, so there are no negative
    predictions and recall is 1.0 whenever at least one recommendation is
    relevant. Relevance is judged on genres, which are also part of the feature
    matrix that drives the recommendations, so the scores are optimistic.
    """
    y_true_all = []
    y_pred_all = []

    # Series.sample raises when asked for more rows than exist, so clamp first
    n_titles = min(sample_size, len(test_set))
    if n_titles <= 0:
        return 0.0, 0.0, 0.0

    sampled_titles = test_set['name'].sample(n_titles, random_state=random_state)

    for title in sampled_titles:
        recs = recommend_anime(title, top_n=top_n, threshold=threshold)
        if isinstance(recs, str):  # a message string means the title was not found
            continue

        # Genres of the anime the recommendations were requested for
        target_row_genre = anime_df.loc[anime_df['name'] == title, 'genre'].iloc[0]
        target_genres = set(target_row_genre.split(', '))

        for rec_genre in recs['genre']:
            shared = target_genres & set(rec_genre.split(', '))

            # Ground truth (1 = relevant, 0 = not relevant)
            y_true_all.append(int(bool(shared)))
            # Prediction: the system recommended this item, so it predicted "relevant"
            y_pred_all.append(1)

    # Nothing was recommended for any sampled title
    if not y_true_all:
        return 0.0, 0.0, 0.0

    # Step 3: Compute metrics
    precision = precision_score(y_true_all, y_pred_all, zero_division=0)
    recall = recall_score(y_true_all, y_pred_all, zero_division=0)
    f1 = f1_score(y_true_all, y_pred_all, zero_division=0)

    return precision, recall, f1


In [36]:
# Example evaluation: score the recommender on the held-out split, then unpack
# the three metrics for reporting
evaluation_scores = evaluate_recommendation_system(test_df, top_n=5, threshold=0.3)
precision, recall, f1 = evaluation_scores

In [37]:
# Report each metric rounded to three decimals
print("Evaluation Results:")
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, round(metric_value, 3))

Evaluation Results:
Precision: 1.0
Recall: 1.0
F1-score: 1.0


# Interview questions

In [39]:
# Interview question 1: prints a short written answer (no computation involved)
print("""
1. Difference between User-Based and Item-Based Collaborative Filtering

User-Based CF: Finds similar users and recommends items they liked.

Item-Based CF: Finds similar items and recommends them to users who liked a related item.

Key Difference: User-based focuses on user similarity, item-based on item similarity.
""")


1. Difference between User-Based and Item-Based Collaborative Filtering

User-Based CF: Finds similar users and recommends items they liked.

Item-Based CF: Finds similar items and recommends them to users who liked a related item.

Key Difference: User-based focuses on user similarity, item-based on item similarity.



In [40]:
# Interview question 2: prints a short written answer (no computation involved)
print("""
2. What is Collaborative Filtering, and How Does It Work?

Collaborative Filtering: A method that recommends items based on patterns in user behavior.

How it works: Uses a user–item matrix and similarity measures to find either similar users or similar items, then generates recommendations accordingly.
""")


2. What is Collaborative Filtering, and How Does It Work?

Collaborative Filtering: A method that recommends items based on patterns in user behavior.

How it works: Uses a user–item matrix and similarity measures to find either similar users or similar items, then generates recommendations accordingly.

