In [3]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# load dataset
df= pd.read_csv("/content/anime.csv")
df

# Inspect

print("Rows, Cloumns:", df.shape)
print("\nColumns and types:")
print(df.dtypes)
print("\n Missing value counts:",df.isnull().sum())
print("\n Sample Rows:",df.head(5).T)

Rows, Cloumns: (12294, 7)

Columns and types:
anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

 Missing value counts: anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

 Sample Rows:                                              0  \
anime_id                                 32281   
name                            Kimi no Na wa.   
genre     Drama, Romance, School, Supernatural   
type                                     Movie   
episodes                                     1   
rating                                    9.37   
members                                 200630   

                                                          1  \
anime_id                                               5114   
name                       Fullmetal Alchemist: Brotherhood   
genre     Action, Adventure, Drama, Fantasy, M

In [7]:
# Data cleaning
df=df.copy()

# Genre(list of strings)
df['genre']=df['genre'].fillna('')
df['genre_list'] = df['genre'].apply(lambda x: [g.strip() for g in x.split(',')])

# Fill missing with 'unknown'
df['type'] = df['type'].fillna('Unknown')

# Convert 'unknown' to NaN and numeric strings to numbers
df['episodes'] = df['episodes'].replace('Unknown', np.nan)
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')

# rating fill missing with median
df['rating'] = df['rating'].fillna(df['rating'].median())

print("After cleaning-missing counts:")
print(df[['genre', 'type', 'episodes','rating']].isnull().sum())

After cleaning-missing counts:
genre         0
type          0
episodes    340
rating        0
dtype: int64


In [11]:
# EDA
print("Rating: min,max,mean,median:", df['rating'].min(), df['rating'].max(), df['rating'].mean(), df['rating'].median())
print("Members: min,max,mean :", df['members'].min(), df['members'].max(), df['members'].mean())

# Top genres
genre_list_all = [g for sub in df['genre_list'] for g in sub if g]
top_genres = Counter(genre_list_all).most_common(20)
print("\nTop genres (by count):")
for g,c in top_genres[:20]:
  print(f"{g}:{c}")

Rating: min,max,mean,median: 1.67 10.0 6.47569952822515 6.57
Members: min,max,mean : 5 1013917 18071.33886448674

Top genres (by count):
Comedy:4645
Action:2845
Adventure:2348
Fantasy:2309
Sci-Fi:2070
Drama:2016
Shounen:1712
Kids:1609
Romance:1464
School:1220
Slice of Life:1220
Hentai:1141
Supernatural:1037
Mecha:944
Music:860
Historical:806
Magic:778
Ecchi:637
Shoujo:603
Seinen:547


In [12]:
# Train-test split
train_df, test_df = train_test_split(df, test_size=0.20, random_state=42,shuffle=True)

# For imputation of numeric episodes we compute median on train only
episodes_median = train_df['episodes'].median()
train_df['episodes'] = train_df['episodes'].fillna(episodes_median)
test_df['episodes'] = test_df['episodes'].fillna(episodes_median)

# Fill any rating nans in test with train median
rating_median = train_df['rating'].median()
test_df['rating'] = test_df['rating'].fillna(rating_median)

print("Train rows:", len(train_df),"Test rows:", len(test_df))

Train rows: 9835 Test rows: 2459


In [13]:
# Genres -> MultiLabelBinarizer (fit on train)
mlb = MultiLabelBinarizer(sparse_output=False)
train_genres = mlb.fit_transform(train_df['genre_list'])
test_genres  = mlb.transform(test_df['genre_list'])
print("Number of unique genres (train):", len(mlb.classes_))

# Type -> one-hot using train categories
type_categories = sorted(train_df['type'].unique())
train_types = pd.get_dummies(train_df['type']).reindex(columns=type_categories, fill_value=0)
test_types  = pd.get_dummies(test_df['type']).reindex(columns=type_categories, fill_value=0)

# Numeric features + scaling
train_num = pd.DataFrame({
    'rating': train_df['rating'],
    'members_log1p': np.log1p(train_df['members']),
    'episodes': train_df['episodes']
}, index=train_df.index)

test_num = pd.DataFrame({
    'rating': test_df['rating'],
    'members_log1p': np.log1p(test_df['members']),
    'episodes': test_df['episodes']
}, index=test_df.index)

scaler = StandardScaler()
train_num_scaled = scaler.fit_transform(train_num)   # fit on train
test_num_scaled = scaler.transform(test_num)         # transform test

# Combine final feature matrices (columns: genres | types | numeric)
X_train = np.hstack([train_genres, train_types.values, train_num_scaled])
X_test  = np.hstack([test_genres, test_types.values, test_num_scaled])

print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

Number of unique genres (train): 44
X_train shape: (9835, 54) X_test shape: (2459, 54)


In [16]:
# Recommendation helpers

# Keep train rows reset for index -> metadata lookup
train_rows = train_df.reset_index(drop=True)
test_rows  = test_df.reset_index(drop=True)

'''Accepts a pandas Series with fields: ['genre_list', 'type', 'rating', 'members', 'episodes']
    Returns a single-row array matching X_train/X_test column order.'''

def build_query_vector(row):
    # genres
    gvec = mlb.transform([row['genre_list']])  # shape (1, n_genres)
    # type
    tvec = np.zeros((1, len(type_categories)), dtype=int)
    if row['type'] in type_categories:
        tvec[0, type_categories.index(row['type'])] = 1
    # numeric: rating, members_log1p, episodes
    members_log = np.log1p(row['members'])
    eps = row['episodes'] if not np.isnan(row['episodes']) else episodes_median
    num_arr = np.array([[row['rating'], members_log, eps]])
    num_scaled = scaler.transform(num_arr)
    return np.hstack([gvec, tvec, num_scaled])   # shape (1, total_features)

'''Find anime by exact match (case insensitive) or substring, build query vector,
    compute cosine similarity vs X_train, and return top_n recommendations (excluding same title).
    Returns pandas DataFrame with recommended titles and scores.'''

def recommend_by_title(title, top_n=10, allow_substring=True):
    t = title.strip().lower()
    # try exact match
    matches = df[df['name'].str.lower() == t]
    if matches.empty and allow_substring:
        matches = df[df['name'].str.lower().str.contains(t, na=False)]
    if matches.empty:
        raise ValueError(f"No anime found for title: {title}")
    row = matches.iloc[0]
    q = build_query_vector(row)                     # feature vector using train encoders
    sims = cosine_similarity(q, X_train).flatten()
    top_idx = sims.argsort()[::-1]
    recs = []
    for idx in top_idx:
        cand_name = train_rows.loc[idx, 'name']
        if cand_name.lower() == row['name'].lower():
            continue  # skip same anime
        recs.append({
            'name': cand_name,
            'score': float(sims[idx]),
            'type': train_rows.loc[idx, 'type'],
            'episodes': int(train_rows.loc[idx, 'episodes']),
            'rating': float(train_rows.loc[idx, 'rating']),
            'members': int(train_rows.loc[idx, 'members'])
        })
        if len(recs) >= top_n:
            break
    return pd.DataFrame(recs)


In [17]:
# Example usage:
print("\nExample recommendations for 'Fullmetal Alchemist: Brotherhood':")
try:
    print(recommend_by_title('Fullmetal Alchemist: Brotherhood', top_n=5).to_string(index=False))
except Exception as e:
    print("Error:", e)



Example recommendations for 'Fullmetal Alchemist: Brotherhood':
                                                 name    score type  episodes  rating  members
                         Magi: The Labyrinth of Magic 0.929606   TV        25    8.24   317513
                                          Log Horizon 0.894451   TV        25    8.14   387100
                                    Fairy Tail (2014) 0.892723   TV       102    8.25   255076
                                   Shingeki no Kyojin 0.885423   TV        25    8.54   896229
Gate: Jieitai Kanochi nite, Kaku Tatakaeri 2nd Season 0.882055   TV        12    7.97   153501


In [18]:
# Example: boosting genre weight before similarity
'''Multiply the genre columns by genre_weight (temporary) before computing similarity.
    min_similarity: if provided, only return recs with score >= min_similarity'''

def recommend_with_genre_weight(title, top_n=10, genre_weight=3.0, min_similarity=None):
    # build query vector
    t = title.strip().lower()
    matches = df[df['name'].str.lower() == t]
    if matches.empty:
        matches = df[df['name'].str.lower().str.contains(t, na=False)]
    if matches.empty:
        raise ValueError(f"No anime found for title: {title}")
    row = matches.iloc[0]
    q = build_query_vector(row)  # shape (1, total_features)

    # genre columns are first len(mlb.classes_) columns
    gcount = len(mlb.classes_)
    X_train_mod = X_train.copy()
    X_train_mod[:, :gcount] *= genre_weight
    q_mod = q.copy()
    q_mod[:, :gcount] *= genre_weight

    sims = cosine_similarity(q_mod, X_train_mod).flatten()
    top_idx = sims.argsort()[::-1]
    results = []
    for idx in top_idx:
        if min_similarity is not None and sims[idx] < min_similarity:
            continue
        cand_name = train_rows.loc[idx, 'name']
        if cand_name.lower() == row['name'].lower():
            continue
        results.append((cand_name, float(sims[idx])))
        if len(results) >= top_n:
            break
    return pd.DataFrame(results, columns=['name','score'])


In [21]:
import time

In [23]:
# Vectorized evaluation using genre-overlap as proxy
''' Vectorized evaluation:
    - Sample eval_sample_size items from X_test (or use full set if eval_sample_size is None)
    - For each sampled test item, compute top-k recommendations and compute precision/recall/F1
      against relevant training items defined by genre-overlap (≥1 common genre).
    Returns dict with average precision, recall, f1 and evaluated item count.'''

def evaluate_vectorized(k=10, eval_sample_size=1000, random_state=42):
    n_test = X_test.shape[0]
    if eval_sample_size is None or eval_sample_size >= n_test:
        idx_sample = np.arange(n_test)
    else:
        rng = np.random.RandomState(random_state)
        idx_sample = rng.choice(n_test, size=eval_sample_size, replace=False)

    X_test_sample = X_test[idx_sample]
    test_genres_sample = test_genres[idx_sample]  # matrix (n_sample, n_genres)

    # Cosine similarity matrix: (n_sample, n_train)
    start = time.time()
    sims = cosine_similarity(X_test_sample, X_train)  # expensive if sample large
    dur = time.time() - start
    print(f"Computed sims matrix for {len(idx_sample)} test items in {dur:.2f}s")

    # Get top-k (unsorted is fine for counts; use argpartition for speed)
    k = min(k, X_train.shape[0])
    topk_idxs = np.argpartition(-sims, k-1, axis=1)[:, :k]  # shape (n_sample, k)

    # Build relevant mask: (n_sample, n_train)
    # overlap_counts = test_genres_sample @ train_genres.T
    overlap_counts = test_genres_sample.dot(train_genres.T)
    relevant_mask = overlap_counts > 0  # True where share >=1 genre

    precisions = []
    recalls = []
    f1s = []
    skipped = 0
    for i, idxs in enumerate(topk_idxs):
        rel_mask_row = relevant_mask[i]
        n_rel = rel_mask_row.sum()
        if n_rel == 0:
            skipped += 1
            continue
        tp = rel_mask_row[idxs].sum()
        prec = tp / k
        rec = tp / n_rel
        f1 = 2*prec*rec/(prec+rec) if (prec+rec) > 0 else 0.0
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)

    results = {
        'k': k,
        'evaluated_items': len(precisions),
        'skipped_items_no_relevance': skipped,
        'precision': np.mean(precisions) if precisions else np.nan,
        'recall': np.mean(recalls) if recalls else np.nan,
        'f1': np.mean(f1s) if f1s else np.nan
    }
    return results

# Run a sampled evaluation (1000 items). Reduce sample size if running locally with limited time.
metrics = evaluate_vectorized(k=10, eval_sample_size=1000)
print(metrics)


Computed sims matrix for 1000 test items in 0.07s
{'k': 10, 'evaluated_items': 1000, 'skipped_items_no_relevance': 0, 'precision': np.float64(0.992), 'recall': np.float64(0.00543003771313662), 'f1': np.float64(0.010574563741893604)}


**1.Can you explain the difference between user-based and item-based collaborative filtering?**
- **User-based CF:** It finds users similar to the target user(based on retings/behavior) and recommends items that similar users likes. it's helpful when users have overlapping tastes.
suffers when user bases is large or when users have sparse history.

- **Item-based CF:** Finds items similar to the items.item-based is often more stable and scales better because the item set is usually smaller and item-to-items similarity can be precomputed.

**2.What is collaborative filtering, and how does it work?**
- **Collaborative filtering(CF):**recommends items by leveraging user-item interactions. it assumes people who agreed in the past will agree in the future.
main types are: 1. User-based
                2. Item-based
                3. Model-based