In [1]:
# 1) Imports & Config
# ============================================
import os
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional, Dict

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix, hstack

from IPython.display import display, Markdown

# Let pandas show up to 120 characters per cell before truncating displayed text.
pd.set_option("display.max_colwidth", 120)
# Fixed seed applied to NumPy's global random state.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# 2) Load Dataset
# ============================================
# Location of the CSV. Set the ANIME_CSV environment variable to point at your
# own copy instead of editing this cell; the path below is only the fallback.
DATA_PATH = os.environ.get("ANIME_CSV", r"C:\Users\user\anime.csv")

# Fail early with a readable message rather than a pandas traceback.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Cannot find dataset at: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)

# Quick schema normalization: trimmed, lower-case column names.
# str() guards against non-string labels (e.g. an integer header).
df.columns = [str(c).strip().lower() for c in df.columns]
# Alias table: keys are names that may appear in the file, values are the
# canonical names used by the rest of the notebook. Currently an identity map.
rename_map = {
    "anime_id": "anime_id",
    "name": "name",
    "genre": "genre",
    "type": "type",
    "episodes": "episodes",
    "rating": "rating",
    "members": "members",
}
# (If your file has slightly different names, add aliases here.)
df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

# Minimal sanity check: these columns are accessed unconditionally later on.
required = {"name", "genre", "rating"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing required column(s): {missing}. Found: {list(df.columns)}")

display(Markdown("### Preview"))
display(df.head(3))
display(Markdown(f"**Shape:** {df.shape}"))

### Preview

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Military, Shounen",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,51,9.25,114262


**Shape:** (12294, 7)

In [5]:
# 3) Preprocess
#    - Normalise the text columns
#    - Coerce the numeric columns and fill their gaps
# ============================================
# Titles: force to string and trim surrounding whitespace.
df["name"] = df["name"].astype(str).str.strip()

# Missing genre / type labels are replaced by the literal tag "unknown".
df["genre"] = df["genre"].fillna("unknown").astype(str)
if "type" in df.columns:
    df["type"] = df["type"].fillna("unknown").astype(str)

# Numeric columns and the statistic used to fill their missing values.
# Non-numeric entries are coerced to NaN first and then imputed like any gap.
impute_with = {"rating": "mean", "members": "median", "episodes": "median"}
for col, stat in impute_with.items():
    if col not in df.columns:
        continue
    numeric = pd.to_numeric(df[col], errors="coerce")
    df[col] = numeric.fillna(getattr(numeric, stat)())

# Report missing after cleaning
display(Markdown("### Nulls after cleaning"))
display(df.isna().sum())

### Nulls after cleaning

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [7]:
# ===============================
# 4) Feature Extraction (robust & version-safe)
#    - TF-IDF on 'genre'
#    - Scale numeric: ['rating', 'members'] if present
#    - Optional OHE on 'type'
# ===============================

# Required imports (safe to re-run)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import csr_matrix, hstack
from IPython.display import Markdown, display

# --- Guard against missing columns / NaNs ---
if "genre" not in df.columns:
    raise ValueError("Expected column 'genre' not found in df.")
df["genre"] = df["genre"].fillna("unknown").astype(str)

# 1) TF-IDF on genre (bag-of-tags)
# The token pattern keeps hyphenated words together as a single token.
# NOTE(review): separators are replaced by spaces below, so any multi-word tag
# is split into one token per word, whereas parse_genre_tokens (evaluation
# cell) splits on commas only and keeps such tags whole - confirm intended.
tfidf = TfidfVectorizer(
    analyzer="word",
    token_pattern=r"(?u)\b[\w\-]+\b",
    lowercase=True
)
genre_corpus = (
    df["genre"]
      .str.replace(",", " ", regex=False)
      .str.replace("/", " ", regex=False)
)
X_genre = tfidf.fit_transform(genre_corpus)  # csr sparse

# 2) Numeric features (scaled) — optional if present
num_feats = [c for c in ["rating", "members"] if c in df.columns]
scaler = StandardScaler()
if num_feats:
    # coerce to numeric and impute with column means for safety
    df[num_feats] = df[num_feats].apply(pd.to_numeric, errors="coerce")
    df[num_feats] = df[num_feats].fillna(df[num_feats].mean())
    X_num = csr_matrix(scaler.fit_transform(df[num_feats].to_numpy()))
else:
    # Zero-width block keeps the hstack below uniform.
    X_num = csr_matrix((len(df), 0))

# 3) Categorical 'type' (optional, version-safe OHE)
if "type" in df.columns:
    # Fill BEFORE casting: astype(str) turns NaN into the string "nan", after
    # which fillna() has nothing left to fill and "nan" becomes its own category.
    df["type"] = df["type"].fillna("unknown").astype(str)
    try:
        # sklearn >= 1.2
        ohe = OneHotEncoder(handle_unknown="ignore", drop=None, sparse_output=False)
    except TypeError:
        # sklearn < 1.2
        ohe = OneHotEncoder(handle_unknown="ignore", drop=None, sparse=False)
    X_type_dense = ohe.fit_transform(df[["type"]])
    X_type = csr_matrix(X_type_dense)
else:
    X_type = csr_matrix((len(df), 0))

# 4) Final design matrix: [genre TF-IDF | scaled numerics | one-hot type]
X = hstack([X_genre, X_num, X_type], format="csr")

# 5) Useful metadata / quick output
tfidf_vocab = tfidf.vocabulary_
feature_blocks = {
    "genre_dim": X_genre.shape[1],
    "num_dim":   X_num.shape[1],
    "type_dim":  X_type.shape[1],
    "total_dim": X.shape[1],
}
display(Markdown("### Feature Blocks"))
display(pd.DataFrame([feature_blocks]))

### Feature Blocks

Unnamed: 0,genre_dim,num_dim,type_dim,total_dim
0,47,2,7,56


In [9]:
# 5) Similarity Matrix (Cosine)
# ============================================
# Pairwise cosine similarity between every pair of item feature rows.
# The result is a dense (n x n) array, so memory grows quadratically with the
# number of items; acceptable for moderate datasets.
sim = cosine_similarity(X)      # with no second argument, X is compared against itself
# An item should not be listed as its own neighbour: blank the diagonal.
np.fill_diagonal(sim, 0.0)

In [11]:
# 6) Helper Indexers
# ============================================
# Title lookups in both directions, keyed by row position.
# Keys of name_to_idx are lower-cased; if two rows share a lower-cased title,
# the later row wins.
_titles = df["name"].astype(str).tolist()
name_to_idx = {title.lower(): pos for pos, title in enumerate(_titles)}
idx_to_name = dict(enumerate(_titles))

def _find_index_by_title(title: str) -> Optional[int]:
    """Case-insensitive lookup; returns index or None."""
    if not isinstance(title, str):
        return None
    return name_to_idx.get(title.strip().lower())

In [13]:
# 7) Recommendation Functions
# ============================================
def recommend_by_title(title: str, top_k: int = 10, include_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Content-based Top-K recommendation by cosine similarity.

    Args:
        title: Item title to query (matched case-insensitively).
        top_k: Number of neighbors to return; values below 0 are treated as 0.
        include_cols: Extra df columns to include in the result table.
            Names that are not columns of df are silently skipped.

    Returns:
        DataFrame with columns rank, name, similarity (plus any requested
        extras), sorted by descending similarity. The query item itself is
        never part of the result.

    Raises:
        ValueError: if the title cannot be found.
    """
    idx = _find_index_by_title(title)
    if idx is None:
        raise ValueError(f"Title not found: {title}")

    sims = sim[idx]
    order = np.argsort(-sims)  # largest first
    # Drop the query explicitly: its similarity is only zeroed on the diagonal,
    # which still outranks any neighbour with a negative cosine score.
    order = order[order != idx]
    top_idx = order[:max(int(top_k), 0)]

    # sim rows and the title index are positional, so index df by position
    # (iloc) rather than by label (loc); the two differ unless df has a
    # default RangeIndex.
    out = pd.DataFrame({
        "rank": np.arange(1, len(top_idx) + 1),
        "name": df["name"].iloc[top_idx].values,
        "similarity": sims[top_idx],
    })
    for c in include_cols or []:
        if c in df.columns:
            out[c] = df[c].iloc[top_idx].values
    return out

def recommend_by_title_threshold(title: str, min_sim: float = 0.35, max_k: int = 100, include_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Content-based threshold recommendation: return all items >= min_sim (up to max_k).

    Args:
        title: Item title to query (matched case-insensitively).
        min_sim: Minimum similarity to include.
        max_k: Maximum number of results; values below 0 are treated as 0.
        include_cols: Extra df columns to include.
            Names that are not columns of df are silently skipped.

    Returns:
        DataFrame with columns rank, name, similarity (plus any requested
        extras), sorted by descending similarity. The query item itself is
        never part of the result, whatever the threshold.

    Raises:
        ValueError: if the title cannot be found.
    """
    idx = _find_index_by_title(title)
    if idx is None:
        raise ValueError(f"Title not found: {title}")

    sims = sim[idx]
    cand = np.where(sims >= min_sim)[0]
    # The diagonal of sim is 0.0, so a threshold <= 0 would otherwise return
    # the query as a neighbour of itself (evaluate_threshold drops it as well).
    cand = cand[cand != idx]
    cand = cand[np.argsort(-sims[cand])]
    cand = cand[:max(int(max_k), 0)]

    # Positional indexing (iloc): cand holds row positions, not index labels.
    out = pd.DataFrame({
        "rank": np.arange(1, len(cand) + 1),
        "name": df["name"].iloc[cand].values,
        "similarity": sims[cand],
    })
    for c in include_cols or []:
        if c in df.columns:
            out[c] = df[c].iloc[cand].values
    return out

In [15]:
# 8) Usage Examples (safe defaults)
#    - Point `sample_title` at any actual title from your dataset.
# ============================================
sample_title = df.loc[0, "name"]
extra_cols = ["type", "rating", "members"]  # context columns shown next to each hit

display(Markdown(f"### Example: Top-K Recommendations for **{sample_title}**"))
topk_demo = recommend_by_title(sample_title, top_k=10, include_cols=extra_cols)
display(topk_demo)

display(Markdown(f"### Example: Threshold Recommendations for **{sample_title}** (min_sim=0.35)"))
threshold_demo = recommend_by_title_threshold(sample_title, min_sim=0.35, max_k=20, include_cols=extra_cols)
display(threshold_demo)

### Example: Top-K Recommendations for **Kimi no Na wa.**

Unnamed: 0,rank,name,similarity,type,rating,members
0,1,Hotarubi no Mori e,0.974864,Movie,8.61,197439
1,2,Suzumiya Haruhi no Shoushitsu,0.967613,Movie,8.81,240297
2,3,Hotaru no Haka,0.952794,Movie,8.58,174878
3,4,Majo no Takkyuubin,0.949556,Movie,8.27,152331
4,5,Kotonoha no Niwa,0.946789,Movie,8.35,189741
5,6,Tenkuu no Shiro Laputa,0.94493,Movie,8.38,151061
6,7,Ookami Kodomo no Ame to Yuki,0.94225,Movie,8.84,226193
7,8,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,0.938818,Movie,8.61,192424
8,9,Evangelion: 2.0 You Can (Not) Advance,0.936458,Movie,8.53,182224
9,10,Neon Genesis Evangelion: The End of Evangelion,0.936374,Movie,8.45,215630


### Example: Threshold Recommendations for **Kimi no Na wa.** (min_sim=0.35)

Unnamed: 0,rank,name,similarity,type,rating,members
0,1,Hotarubi no Mori e,0.974864,Movie,8.61,197439
1,2,Suzumiya Haruhi no Shoushitsu,0.967613,Movie,8.81,240297
2,3,Hotaru no Haka,0.952794,Movie,8.58,174878
3,4,Majo no Takkyuubin,0.949556,Movie,8.27,152331
4,5,Kotonoha no Niwa,0.946789,Movie,8.35,189741
5,6,Tenkuu no Shiro Laputa,0.94493,Movie,8.38,151061
6,7,Ookami Kodomo no Ame to Yuki,0.94225,Movie,8.84,226193
7,8,Steins;Gate Movie: Fuka Ryouiki no Déjà vu,0.938818,Movie,8.61,192424
8,9,Evangelion: 2.0 You Can (Not) Advance,0.936458,Movie,8.53,182224
9,10,Neon Genesis Evangelion: The End of Evangelion,0.936374,Movie,8.45,215630


In [17]:
# 9) Evaluation Protocol (Proxy relevance)
#    We define 'relevant' items as those sharing >= 1 genre token.
#    This is a proxy ground truth suitable for content-only datasets.
# ============================================
def parse_genre_tokens(g: str) -> set:
    """Split a genre string into a set of lower-cased, trimmed tags.

    Both "," and "/" act as separators; empty pieces are discarded.
    Example: "Action, Comedy" -> {"action", "comedy"}.
    """
    pieces = str(g).replace("/", ",").split(",")
    cleaned = (piece.strip().lower() for piece in pieces)
    return {tag for tag in cleaned if tag}

# One tag set per row of df, in row order, plus the number of rows.
genre_sets: List[set] = list(map(parse_genre_tokens, df["genre"]))
n_items = df.shape[0]

def relevant_set(i: int) -> set:
    """Positions of all other items whose genre tags overlap those of item i.

    An item without any tags has no relevant items (empty set). Each call
    scans all n_items tag sets.
    """
    own_tags = genre_sets[i]
    if len(own_tags) == 0:
        return set()
    return {
        j
        for j in range(n_items)
        if j != i and not genre_sets[j].isdisjoint(own_tags)
    }

def precision_recall_f1(pred: List[int], truth: set) -> Tuple[float, float, float]:
    """Precision, recall and F1 of one recommendation list against a truth set.

    An empty prediction scores (1, 1, 1) when the truth is empty too, and
    (0, 0, 0) otherwise. Duplicate predictions are counted once.
    """
    if len(pred) == 0:
        if len(truth) > 0:
            return (0.0, 0.0, 0.0)
        return (1.0, 1.0, 1.0)

    predicted = set(pred)
    hits = len(predicted & truth)
    n_predicted = len(predicted)  # true positives + false positives
    n_relevant = len(truth)       # true positives + false negatives

    precision = hits / n_predicted if n_predicted != 0 else 0.0
    recall = hits / n_relevant if n_relevant != 0 else 0.0
    pr_sum = precision + recall
    if pr_sum == 0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / pr_sum
    return precision, recall, f1

def evaluate_topk(k: int = 10) -> Dict[str, float]:
    """Macro & micro P/R/F1 for Top-K across all items.

    Each item is used as a query; its k most similar other items are scored
    against the proxy ground truth from relevant_set. Items with an empty
    truth set are skipped.

    Args:
        k: Length of each recommendation list; values below 0 are treated as 0.

    Returns:
        Dict with Precision/Recall/F1 under both "_macro" (mean of per-query
        scores) and "_micro" (pooled tp/fp/fn counts) suffixes.
    """
    k = max(int(k), 0)
    precisions, recalls, f1s = [], [], []
    micro_tp = micro_fp = micro_fn = 0

    for i in range(n_items):
        truth = relevant_set(i)
        # Skip items with no proxy ground-truth
        if len(truth) == 0:
            continue

        # Take one spare candidate, then drop the query itself. Its diagonal
        # score of 0.0 can outrank neighbours with negative cosine similarity,
        # in which case a plain [:k] slice would recommend the item to itself.
        head = np.argsort(-sim[i])[:k + 1]
        recs = head[head != i][:k].tolist()

        p, r, f1 = precision_recall_f1(recs, truth)
        precisions.append(p); recalls.append(r); f1s.append(f1)

        # micro accumulators (truth is a set, so sizes give fp/fn directly)
        pred_set = set(recs)
        tp = len(pred_set & truth)
        micro_tp += tp
        micro_fp += len(pred_set) - tp
        micro_fn += len(truth) - tp

    macro = {
        "Precision_macro": np.mean(precisions) if precisions else 0.0,
        "Recall_macro": np.mean(recalls) if recalls else 0.0,
        "F1_macro": np.mean(f1s) if f1s else 0.0,
    }
    micro_precision = 0.0 if (micro_tp + micro_fp) == 0 else micro_tp / (micro_tp + micro_fp)
    micro_recall = 0.0 if (micro_tp + micro_fn) == 0 else micro_tp / (micro_tp + micro_fn)
    micro_f1 = 0.0 if (micro_precision + micro_recall) == 0 else 2 * micro_precision * micro_recall / (micro_precision + micro_recall)
    micro = {
        "Precision_micro": micro_precision,
        "Recall_micro": micro_recall,
        "F1_micro": micro_f1,
    }
    return {**macro, **micro}

def evaluate_threshold(thresh: float = 0.35, max_k: Optional[int] = None) -> Dict[str, float]:
    """Score the threshold recommender over every item with a non-empty truth set.

    For each query item, all other items with similarity >= thresh are
    recommended (optionally only the max_k most similar of them) and compared
    with relevant_set(item). Returns macro-averaged and micro-pooled
    precision, recall and F1.
    """
    per_query = []  # one (precision, recall, f1) triple per evaluated item
    tp_total = fp_total = fn_total = 0

    for item in range(n_items):
        relevant = relevant_set(item)
        if len(relevant) == 0:
            continue

        row = sim[item]
        picked = np.where(row >= thresh)[0]
        picked = picked[picked != item]  # never recommend the query itself
        if max_k is not None and len(picked) > max_k:
            strongest = np.argsort(-row[picked])[:max_k]
            picked = picked[strongest]
        recs = picked.tolist()

        per_query.append(precision_recall_f1(recs, relevant))

        # Pooled counts for the micro average.
        chosen = set(recs)
        hits = len(chosen & relevant)
        tp_total += hits
        fp_total += len(chosen) - hits
        fn_total += len(relevant) - hits

    if per_query:
        p_scores, r_scores, f_scores = zip(*per_query)
        macro_p = np.mean(p_scores)
        macro_r = np.mean(r_scores)
        macro_f = np.mean(f_scores)
    else:
        macro_p = macro_r = macro_f = 0.0

    n_predicted = tp_total + fp_total
    n_relevant = tp_total + fn_total
    micro_p = tp_total / n_predicted if n_predicted != 0 else 0.0
    micro_r = tp_total / n_relevant if n_relevant != 0 else 0.0
    pr_sum = micro_p + micro_r
    micro_f = 2 * micro_p * micro_r / pr_sum if pr_sum != 0 else 0.0

    return {
        "Precision_macro": macro_p,
        "Recall_macro": macro_r,
        "F1_macro": macro_f,
        "Precision_micro": micro_p,
        "Recall_micro": micro_r,
        "F1_micro": micro_f,
    }

In [19]:
# 10) Run Evaluations & Display Results
# ============================================
# Sweep the Top-K recommender over several list lengths; one result row per K.
k_list = [5, 10, 20]
topk_rows = []
for k in k_list:
    topk_rows.append({**evaluate_topk(k=k), "K": k})
topk_df = pd.DataFrame(topk_rows).set_index("K")

# Same sweep for the threshold recommender; one result row per cut-off.
thresh_list = [0.25, 0.30, 0.35, 0.40]
th_rows = []
for th in thresh_list:
    th_rows.append({**evaluate_threshold(thresh=th), "threshold": th})
thresh_df = pd.DataFrame(th_rows).set_index("threshold")

# Render both tables with four decimals.
display(Markdown("## Evaluation: Top-K Recommendations"))
display(topk_df.style.format("{:.4f}"))

display(Markdown("## Evaluation: Threshold-Based Recommendations"))
display(thresh_df.style.format("{:.4f}"))

## Evaluation: Top-K Recommendations

Unnamed: 0_level_0,Precision_macro,Recall_macro,F1_macro,Precision_micro,Recall_micro,F1_micro
K,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,0.9977,0.0025,0.0048,0.9977,0.0012,0.0023
10,0.9948,0.0048,0.0094,0.9948,0.0023,0.0046
20,0.9891,0.0093,0.0176,0.9891,0.0046,0.0092


## Evaluation: Threshold-Based Recommendations

Unnamed: 0_level_0,Precision_macro,Recall_macro,F1_macro,Precision_micro,Recall_micro,F1_micro
threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.25,0.4919,0.4844,0.4345,0.4844,0.4255,0.453
0.3,0.4937,0.4282,0.4034,0.4841,0.3673,0.4177
0.35,0.4964,0.379,0.3739,0.4852,0.3173,0.3837
0.4,0.5037,0.3361,0.3472,0.4912,0.2743,0.352


In [21]:
# 11) Interpretations
# ============================================
# Static markdown commentary on the evaluation tables; rendered as-is below.
# The text is fixed and is not derived from the computed metrics.
interp_md = """
### Interpretation of Evaluation Results

**Top-K recommender**
- Increasing **K** usually **improves recall** (you retrieve more relevant items) but can **reduce precision** (you also retrieve more non-relevant items).
- Choose **K** based on product goals: smaller **K** for high precision (e.g., homepage slots), larger **K** for exploration.

**Threshold recommender**
- Higher **threshold** → fewer items recommended → higher precision, lower recall.
- Lower **threshold** → more items → higher recall, lower precision.

**Macro vs Micro**
- **Macro** averages scores per item; it treats each query equally.
- **Micro** aggregates all decisions; it gives more weight to popular items with many relevant neighbors.

> Note: Because this dataset lacks user interactions, we use a **proxy relevance**: items sharing at least one genre. This is suitable for content validation but not a substitute for user-level A/B testing on real engagement.
"""
display(Markdown(interp_md))


### Interpretation of Evaluation Results

**Top-K recommender**
- Increasing **K** usually **improves recall** (you retrieve more relevant items) but can **reduce precision** (you also retrieve more non-relevant items).
- Choose **K** based on product goals: smaller **K** for high precision (e.g., homepage slots), larger **K** for exploration.

**Threshold recommender**
- Higher **threshold** → fewer items recommended → higher precision, lower recall.
- Lower **threshold** → more items → higher recall, lower precision.

**Macro vs Micro**
- **Macro** averages scores per item; it treats each query equally.
- **Micro** aggregates all decisions; it gives more weight to popular items with many relevant neighbors.

> Note: Because this dataset lacks user interactions, we use a **proxy relevance**: items sharing at least one genre. This is suitable for content validation but not a substitute for user-level A/B testing on real engagement.


## Interview Questions — Collaborative Filtering

### 1) What is Collaborative Filtering, and how does it work?
**Collaborative Filtering (CF)** recommends items using **user–item interaction patterns** (ratings, clicks, purchases), not item content.
- **Core idea:** “Users who behaved similarly in the past will behave similarly in the future.”
- **How it works:**
  1) Build a **user–item matrix** (rows = users, columns = items, values = ratings/interactions).
  2) Compute **similarity** (between users or items) — e.g., cosine similarity or Pearson correlation.
  3) **Predict** unknown ratings by aggregating known ratings from similar users/items.
  4) Recommend **top-N** items with the highest predicted scores.

**Types of CF**
- **User-based CF:** Find users similar to the target user, then recommend what those similar users liked.
- **Item-based CF:** Find items similar to what the user liked, then recommend those items.

---

### 2) Difference between User-Based and Item-Based CF

| Aspect | User-Based CF | Item-Based CF |
|---|---|---|
| Idea | Find **similar users** to me | Find **similar items** to the ones I like |
| Similarity over | **Users** (rows) | **Items** (columns) |
| Prediction | Aggregate ratings from **similar users** for the target item | Aggregate ratings from **similar items** the user has consumed |
| Example | Alice & Bob rate movies similarly → recommend Bob’s favorites to Alice | Alice liked *Naruto* → recommend *Bleach* (co-liked by many users) |
| Scalability | Harder (user base is large & dynamic) | Easier (catalog more stable than users) |
| Typical use | Smaller, dense communities | Large e-commerce / media catalogs |

**Summary:** CF uses **behavioral signals**. User-based looks for **similar people**; item-based for **similar products**. In production, **item-based CF** is often preferred for **stability & scalability**.
