In [7]:
import pandas as pd
# Load the anime dataset
df = pd.read_csv("anime.csv")
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [9]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [13]:
# Drop rows with missing 'name' or 'rating' since they are important
df.dropna(subset=['name', 'rating'], inplace=True)

# If 'episodes' has missing values, fill with median or a placeholder like 0
df['episodes'].fillna(0, inplace=True)

# Reset index after cleanup
df.reset_index(drop=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['episodes'].fillna(0, inplace=True)


In [15]:
df.describe(), print(df['type'].value_counts())

type
TV         3671
OVA        3285
Movie      2297
Special    1671
ONA         652
Music       488
Name: count, dtype: int64


(           anime_id        rating       members
 count  12064.000000  12064.000000  1.206400e+04
 mean   13704.476044      6.473902  1.827952e+04
 std    11260.369521      1.026746  5.527578e+04
 min        1.000000      1.670000  1.200000e+01
 25%     3409.250000      5.880000  2.210000e+02
 50%    10004.000000      6.570000  1.539000e+03
 75%    23863.500000      7.180000  9.485500e+03
 max    34519.000000     10.000000  1.013917e+06,
 None)

In [17]:
# Sample genres (if available)
if 'genre' in df.columns:
    print(df['genre'].head())

0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object


In [19]:
#feature extraction
from sklearn.preprocessing import MinMaxScaler
# Make a copy to work on
df_features = df.copy()

In [20]:
# ------- 1. Handle Genres (split + one-hot encode) -------
# Split genre strings into lists
df_features['genre'] = df_features['genre'].fillna('')  # fill NaNs with empty string
df_features['genre_list'] = df_features['genre'].apply(lambda x: [g.strip() for g in x.split(',')] if x else [])

In [23]:
# Create one-hot genre features
all_genres = sorted(set(g for sublist in df_features['genre_list'] for g in sublist))
for genre in all_genres:
    df_features[genre] = df_features['genre_list'].apply(lambda x: int(genre in x))

In [25]:
 #------- 2. One-hot encode 'type' (TV, OVA, Movie, etc.) -------
df_features['type'] = df_features['type'].fillna('Unknown')
type_dummies = pd.get_dummies(df_features['type'], prefix='type')
df_features = pd.concat([df_features, type_dummies], axis=1)

In [27]:
# ------- 3. Normalize numeric features -------
scaler = MinMaxScaler()

In [29]:
# Clean episodes column: replace 'Unknown' or invalid with 0
df_features['episodes'] = pd.to_numeric(df_features['episodes'], errors='coerce').fillna(0)


In [31]:
# Select numeric columns to normalize
numeric_cols = ['rating', 'members', 'episodes']
df_features[numeric_cols] = scaler.fit_transform(df_features[numeric_cols])

In [33]:
# ------- 4. Final feature list -------
# Combine all selected features into one matrix
feature_columns = all_genres + list(type_dummies.columns) + numeric_cols
feature_matrix = df_features[feature_columns].values
print("Feature matrix shape:", feature_matrix.shape)
print("Sample row:", feature_matrix[0])

Feature matrix shape: (12064, 52)
Sample row: [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 True False False False False False 0.9243697478991597
 0.1978666640365715 0.00055005500550055]


In [35]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(feature_matrix)
def recommend_anime(anime_name, top_n=5, threshold=0.0):
    # Check if anime_name exists in the dataset
    if anime_name not in df_features['name'].values:
        raise ValueError(f"Anime '{anime_name}' not found in dataset.")

    # Get the index of the anime
    idx = df_features[df_features['name'] == anime_name].index[0]

    # Get similarity scores for the target anime
    similarity_scores = similarity_matrix[idx]

    # Get indices of anime with similarity above the threshold, excluding itself
    similar_indices = [
        i for i, score in enumerate(similarity_scores)
        if score >= threshold and i != idx
    ]

    # Sort indices by similarity score (descending)
    sorted_indices = sorted(similar_indices, key=lambda i: similarity_scores[i], reverse=True)

    # Get top N recommendations
    recommendations = [
        (df_features.iloc[i]['name'], round(similarity_scores[i], 3))
        for i in sorted_indices[:top_n]
    ]

    return recommendations

In [36]:
recommendations = recommend_anime("Naruto", top_n=5, threshold=0.3)
for title, score in recommendations:
    print(f"{title}: similarity {score}")
thresholds = [0.0, 0.2, 0.4, 0.6]
for t in thresholds:
    recs = recommend_anime("Naruto", top_n=5, threshold=t)
    print(f"\nThreshold: {t}")
    for title, score in recs:
        print(f"  - {title} ({score})")

Naruto: Shippuuden: similarity 0.997
Katekyo Hitman Reborn!: similarity 0.912
Dragon Ball Z: similarity 0.873
Bleach: similarity 0.856
Dragon Ball Kai: similarity 0.856

Threshold: 0.0
  - Naruto: Shippuuden (0.997)
  - Katekyo Hitman Reborn! (0.912)
  - Dragon Ball Z (0.873)
  - Bleach (0.856)
  - Dragon Ball Kai (0.856)

Threshold: 0.2
  - Naruto: Shippuuden (0.997)
  - Katekyo Hitman Reborn! (0.912)
  - Dragon Ball Z (0.873)
  - Bleach (0.856)
  - Dragon Ball Kai (0.856)

Threshold: 0.4
  - Naruto: Shippuuden (0.997)
  - Katekyo Hitman Reborn! (0.912)
  - Dragon Ball Z (0.873)
  - Bleach (0.856)
  - Dragon Ball Kai (0.856)

Threshold: 0.6
  - Naruto: Shippuuden (0.997)
  - Katekyo Hitman Reborn! (0.912)
  - Dragon Ball Z (0.873)
  - Bleach (0.856)
  - Dragon Ball Kai (0.856)


In [39]:
#split the data
from sklearn.model_selection import train_test_split

# Use 80% for training similarity, 20% for evaluation
train_df, test_df = train_test_split(df_features, test_size=0.2, random_state=42)

# Recompute the feature matrix and similarity on training data
train_feature_matrix = train_df[feature_columns].values
similarity_matrix_train = cosine_similarity(train_feature_matrix)


In [41]:
genre_columns = [
    'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
    'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
    'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
    'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
    'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen',
    'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power',
    'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri'
]

In [43]:
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def evaluate_recommender(test_df, train_df, similarity_matrix, top_n=5, threshold=0.3):
    genre_columns = ['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama',
    'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror',
    'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music',
    'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai',
    'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen',
    'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power',
    'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri'
]  
    precisions, recalls, f1s = [], [], []

    for _, row in test_df.iterrows():
        name = row['name']
        if name not in train_df['name'].values or row[genre_columns].sum() == 0:
            continue
        try:
            target = row[genre_columns]
            train_df['relevant'] = train_df[genre_columns].apply(lambda x: (target & x).sum() >= 2, axis=1)
            true = train_df['relevant'].astype(int).values

            idx = train_df[train_df['name'] == name].index[0]
            sim_scores = similarity_matrix[train_df.index.get_loc(idx)]
            top_idxs = sorted([i for i, s in enumerate(sim_scores) if i != idx and s >= threshold], key=lambda i: sim_scores[i], reverse=True)[:top_n]
            pred = train_df['name'].isin(train_df.iloc[top_idxs]['name']).astype(int).values

            precisions.append(precision_score(true, pred, zero_division=0))
            recalls.append(recall_score(true, pred, zero_division=0))
            f1s.append(f1_score(true, pred, zero_division=0))
        except:
            continue

    return np.mean(precisions), np.mean(recalls), np.mean(f1s) if precisions else (0.0, 0.0, 0.0)

In [45]:
precision, recall, f1 = evaluate_recommender(
    test_df=test_df,
    train_df=train_df,
    similarity_matrix=similarity_matrix_train,
    top_n=5,
    threshold=0.7
)

print(f"Precision: {precision:.2f}")
print(f"Recall:    {recall:.2f}")
print(f"F1 Score:  {f1:.2f}")

Precision: 0.20
Recall:    0.00
F1 Score:  0.01


In [47]:
for t in [0.1, 0.2, 0.3]:
    precision, recall, f1 = evaluate_recommender(
        test_df=test_df,
        train_df=train_df,
        similarity_matrix=similarity_matrix_train,
        top_n=10,
        threshold=t
    )
    print(f"Threshold: {t} → Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

Threshold: 0.1 → Precision: 0.15, Recall: 0.01, F1: 0.01
Threshold: 0.2 → Precision: 0.15, Recall: 0.01, F1: 0.01
Threshold: 0.3 → Precision: 0.15, Recall: 0.01, F1: 0.01


In [48]:
for top_n in [5, 10, 15]:
    for threshold in [0.1, 0.2, 0.3]:
        precision, recall, f1 = evaluate_recommender(
            test_df=test_df,
            train_df=train_df,
            similarity_matrix=similarity_matrix_train,
            top_n=top_n,
            threshold=threshold
        )
        print(f"Top N: {top_n}, Threshold: {threshold:.1f} → Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}")

Top N: 5, Threshold: 0.1 → Precision: 0.20, Recall: 0.00, F1: 0.01
Top N: 5, Threshold: 0.2 → Precision: 0.20, Recall: 0.00, F1: 0.01
Top N: 5, Threshold: 0.3 → Precision: 0.20, Recall: 0.00, F1: 0.01
Top N: 10, Threshold: 0.1 → Precision: 0.15, Recall: 0.01, F1: 0.01
Top N: 10, Threshold: 0.2 → Precision: 0.15, Recall: 0.01, F1: 0.01
Top N: 10, Threshold: 0.3 → Precision: 0.15, Recall: 0.01, F1: 0.01
Top N: 15, Threshold: 0.1 → Precision: 0.10, Recall: 0.01, F1: 0.01
Top N: 15, Threshold: 0.2 → Precision: 0.10, Recall: 0.01, F1: 0.01
Top N: 15, Threshold: 0.3 → Precision: 0.10, Recall: 0.01, F1: 0.01


In [49]:
#Interview Questions:
# 1. Can you explain the difference between user-based and item-based collaborative filtering?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")

def answer_interview_questions():
    explain_collaborative_filtering()
    

answer_interview_questions()

Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.
It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.

There are two main types:

1. User-based Collaborative Filtering:
   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).
   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.
   - Pros: Relatively simple to implement.
   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items 

In [53]:
#  2. What is collaborative filtering, and how does it work?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")

In [55]:
# p2. What is collaborative filtering, and how does it work?

def explain_collaborative_filtering():
    """Explains collaborative filtering and its types."""
    print("Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.")
    print("It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.")
    print("\nThere are two main types:")
    print("\n1. User-based Collaborative Filtering:")
    print("   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).")
    print("   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.")
    print("   - Pros: Relatively simple to implement.")
    print("   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items because many similar users have interacted with them).  Also suffers from the 'cold start' problem for new users with limited interaction history.")

    print("\n2. Item-based Collaborative Filtering:")
    print("   - Finds items similar to items a target user has liked. It identifies items that are frequently rated similarly by users.")
    print("   - Predicts ratings for new items based on how similar items have been rated.  If a user liked item A, and item B is similar to A (based on how other users rated them), the system predicts the user will also like item B.")
    print("   - Pros: More scalable compared to user-based filtering (item similarity is relatively static). Less susceptible to changes in user behavior (user profiles can change, but item similarity is more stable).")
    print("   - Cons: Might not capture diverse user preferences as well as user-based (doesn't consider individual user nuances as much). Also suffers from the 'cold start' problem for new items with limited interaction history.")

def answer_interview_questions():
    explain_collaborative_filtering()
    

answer_interview_questions()

Collaborative filtering is a recommendation technique that predicts user preferences based on the preferences of similar users or items.
It leverages the 'wisdom of the crowd' – if users A and B have similar tastes in the past, then A is likely to enjoy what B liked recently.

There are two main types:

1. User-based Collaborative Filtering:
   - Finds users with similar tastes to a target user.  The system identifies users with similar rating patterns for items they've both interacted with (e.g., movies, products).
   - Predicts ratings for items based on the ratings of similar users.  It aggregates the ratings of those similar users for items the target user hasn't yet rated, providing a prediction of how the target user might rate them.
   - Pros: Relatively simple to implement.
   - Cons: Can suffer from scalability issues as the user base grows (comparing a user to every other user becomes computationally expensive). Susceptible to popularity bias (over-recommending popular items 