**Level 2**
**Task 5: Movie Recommendation System Description**

Description:


*   Dataset (Recommended): MovieLens 100K Dataset (Kaggle).
*   Build a system that recommends movies based on user similarity.
*   Use a user-item matrix to compute similarity scores.
*   Recommend top-rated unseen movies for a given user.
*   Evaluate performance using precision at K.






Tools & Libraries:


*   Python
*   Pandas
*   NumPy
*   scikit-learn


Covered Topics:


*   Recommendation systems
*   Similarity-based modeling






Bonus:


*   Implement item-based collaborative filtering.
*   Try matrix factorization (SVD).



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD

In [None]:
# =====  user–item matrix and similarity  =====

# Step 1: read the raw MovieLens 100K ratings (tab-separated, no header row)
# and discard the timestamp, which nothing below uses.
ratings = pd.read_csv(
    "u.data",
    sep="\t",
    header=None,
    names=["user_id", "item_id", "rating", "timestamp"],
)
ratings = ratings.drop(columns=["timestamp"])

# Step 2: pivot into a users x items table; cells without a rating are NaN.
UI = ratings.pivot_table(index="user_id", columns="item_id", values="rating")

# Step 3: subtract each user's own average rating so that generous and harsh
# raters become comparable.
per_user_mean = UI.mean(axis=1)
UI_centered = UI.sub(per_user_mean, axis=0)

# Step 4: cosine similarity cannot work with NaN, so missing cells become 0.
UI_filled = UI.fillna(0.0).astype(np.float32)
UIc_filled = UI_centered.fillna(0.0).astype(np.float32)

# Step 5: user-user similarity, computed on the mean-centred rows.
user_user = cosine_similarity(UIc_filled.to_numpy())
user_sim = pd.DataFrame(user_user, index=UI.index, columns=UI.index)

# Step 6: item-item similarity, computed on the raw zero-filled columns.
item_item = cosine_similarity(UI_filled.T.to_numpy())
item_sim = pd.DataFrame(item_item, index=UI.columns, columns=UI.columns)

# ---------- Helpers to inspect/top-N ----------
def top_similar_users(user_id, k=10, sim_matrix=None):
    """Return the ``k`` users most similar to ``user_id``.

    Args:
        user_id: Label of the target user; must be present in the index of
            ``sim_matrix``.
        k: Maximum number of neighbours to return.
        sim_matrix: Square user-user similarity DataFrame. When omitted, the
            module-level ``user_sim`` is looked up at call time, so the helper
            keeps working after that matrix has been rebuilt.

    Returns:
        A Series of similarity scores sorted in descending order, with the
        target user itself removed.

    Raises:
        KeyError: If ``user_id`` is not a label of ``sim_matrix``.
    """
    if sim_matrix is None:
        # Resolve the global lazily; a default bound in the signature would
        # freeze whatever object existed when the function was defined.
        sim_matrix = user_sim
    sims = sim_matrix.loc[user_id].drop(user_id)  # exclude self
    return sims.sort_values(ascending=False).head(k)

def top_similar_items(item_id, k=10, sim_matrix=None):
    """Return the ``k`` items most similar to ``item_id``.

    Args:
        item_id: Label of the target item; must be present in the index of
            ``sim_matrix``.
        k: Maximum number of neighbours to return.
        sim_matrix: Square item-item similarity DataFrame. When omitted, the
            module-level ``item_sim`` is looked up at call time, so the helper
            keeps working after that matrix has been rebuilt.

    Returns:
        A Series of similarity scores sorted in descending order, with the
        target item itself removed.

    Raises:
        KeyError: If ``item_id`` is not a label of ``sim_matrix``.
    """
    if sim_matrix is None:
        # Resolve the global lazily; a default bound in the signature would
        # freeze whatever object existed when the function was defined.
        sim_matrix = item_sim
    sims = sim_matrix.loc[item_id].drop(item_id)  # exclude self
    return sims.sort_values(ascending=False).head(k)

# Demo: show the five closest neighbours of one user and of one item.
print("\nTop similar users to user 10:")
neighbours_of_user_10 = top_similar_users(10, k=5)
print(neighbours_of_user_10)

print("\nTop similar items to item 50:")
neighbours_of_item_50 = top_similar_items(50, k=5)
print(neighbours_of_item_50)



Top similar users to user 10:
user_id
321    0.209248
313    0.206029
710    0.201272
293    0.190827
322    0.178730
Name: 10, dtype: float32

Top similar items to item 50:
item_id
181    0.884476
174    0.764885
172    0.749819
1      0.734572
127    0.697332
Name: 50, dtype: float32


In [None]:
# ===== User-based CF Recommender =====

# Step 1: ratings come from u.data (tab-separated: user, movie, rating,
# timestamp); only the first three fields are kept.
rating_columns = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("u.data", sep="\t", header=None, names=rating_columns)
ratings = ratings[["user_id", "movie_id", "rating"]]

# Titles come from u.item (pipe-separated, Latin-1 encoded); only the id and
# title columns are read.
movies = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# Step 2: users x movies rating table (NaN where a user has not rated a movie).
UI = ratings.pivot_table(index="user_id", columns="movie_id", values="rating")

# Step 3: remove each user's personal rating level.
user_means = UI.mean(axis=1)
UI_centered = UI.sub(user_means, axis=0)

# Step 4: user-user cosine similarity on the centred table, with missing
# entries treated as 0. The result is labelled with the raw user ids.
UIc_filled = UI_centered.fillna(0.0).astype(np.float32)
user_ids = UIc_filled.index.to_numpy()
user_sim = pd.DataFrame(
    cosine_similarity(UIc_filled.to_numpy()),  # shape: n_users x n_users
    index=user_ids,
    columns=user_ids,
)

# --- Helper: find k nearest neighbors with at least min_overlap co-rated items ---
def _nearest_neighbors(target_uid, k=30, min_overlap=5):
    """Pick up to ``k`` neighbours of ``target_uid`` from ``user_sim``.

    A candidate qualifies when it shares at least ``min_overlap`` rated movies
    with the target and its similarity to the target is strictly positive.
    Returns a Series of similarities (index = neighbour id), highest first.
    """
    # Number of movies each user has in common with the target.
    target_has_rating = UI.loc[target_uid].notna().astype(int)
    co_rated = UI.notna().dot(target_has_rating)

    # Similarities to everyone except the target itself.
    candidates = user_sim.loc[target_uid].drop(index=target_uid)

    enough_overlap = co_rated.drop(index=target_uid) >= min_overlap
    keep = enough_overlap & (candidates > 0)
    return candidates[keep].sort_values(ascending=False).head(k)

# --- Core prediction: weighted sum of neighbor deviations + user mean ---
def predict_user_scores(target_uid, k=30, min_overlap=5, shrink=10.0):
    """
    Predict a rating for every movie in ``UI.columns`` for ``target_uid``.

    The prediction is the target's mean rating plus a similarity-weighted
    average of the neighbours' mean-centred ratings, where ``shrink`` is added
    to the denominator. When no neighbour qualifies, every movie receives the
    global mean rating instead. Values are not clipped here.

    Raises ValueError when ``target_uid`` is not a row of ``UI``.
    """
    if target_uid not in UI.index:
        raise ValueError(f"user_id {target_uid} not found.")

    nbr_sims = _nearest_neighbors(target_uid, k=k, min_overlap=min_overlap)
    if nbr_sims.empty:
        # No usable neighbour: fall back to one constant score for all movies.
        return pd.Series(ratings["rating"].mean(), index=UI.columns)

    # Each neighbour's ratings, expressed relative to that neighbour's mean.
    nbr_ids = nbr_sims.index
    nbr_dev = UI.loc[nbr_ids].sub(user_means.loc[nbr_ids], axis=0)  # NaN = unrated

    # Column vector of weights so it broadcasts across the movie axis.
    sim_col = nbr_sims.to_numpy(dtype=np.float32)[:, None]          # n_neighbors x 1

    # Numerator: sum_i sim_i * dev_ij, with unrated cells contributing 0.
    dev_zeroed = nbr_dev.fillna(0.0).to_numpy(dtype=np.float32)     # n_neighbors x n_items
    weighted_dev = np.sum(sim_col * dev_zeroed, axis=0)             # n_items

    # Denominator: sum of |sim_i| over the neighbours that rated the movie,
    # plus the shrinkage constant.
    has_rating = nbr_dev.notna().to_numpy(dtype=np.float32)
    weight_total = np.sum(np.abs(sim_col) * has_rating, axis=0) + shrink

    # Shift the predicted deviation back onto the target's own rating level.
    target_mean = float(user_means.loc[target_uid])
    return pd.Series(target_mean + weighted_dev / weight_total, index=UI.columns)

# --- Recommend top-N unseen movies for a user ---
def recommend_for_user(target_uid, top_n=10, k=30, min_overlap=5, shrink=10.0):
    """Return the ``top_n`` best-scoring movies that ``target_uid`` has not rated.

    Scores come from ``predict_user_scores`` and are clipped to the 1..5
    rating scale. The result has columns ``movie_id`` and ``pred_rating``,
    plus ``title`` when ``movies`` provides one.
    """
    scores = predict_user_scores(target_uid, k=k, min_overlap=min_overlap, shrink=shrink)

    # Drop everything the user already rated, then clamp to the rating scale.
    seen = UI.loc[target_uid].dropna().index
    unseen_scores = scores.drop(index=seen, errors="ignore").clip(lower=1.0, upper=5.0)

    ranked = unseen_scores.sort_values(ascending=False).head(top_n).reset_index()
    ranked.columns = ["movie_id", "pred_rating"]

    if "title" not in movies.columns:
        return ranked
    with_titles = ranked.merge(movies, on="movie_id", how="left")
    return with_titles[["movie_id", "title", "pred_rating"]]

# ===== Example =====
# Uses the first user id in UI.index; replace with any existing id (e.g. 100).
target_user = int(UI.index[0])
example_settings = dict(top_n=10, k=40, min_overlap=5, shrink=15.0)
recs = recommend_for_user(target_user, **example_settings)

print(f"\nTop recommendations for user {target_user}:")
print(recs)



Top recommendations for user 1:
   movie_id                                              title  pred_rating
0       318                            Schindler's List (1993)     3.837789
1       483                                  Casablanca (1942)     3.827525
2       651                                       Glory (1989)     3.821201
3       474  Dr. Strangelove or: How I Learned to Stop Worr...     3.791263
4       408                              Close Shave, A (1995)     3.790018
5       357             One Flew Over the Cuckoo's Nest (1975)     3.782414
6       603                                 Rear Window (1954)     3.774031
7       302                           L.A. Confidential (1997)     3.762339
8       484                         Maltese Falcon, The (1941)     3.762025
9       435          Butch Cassidy and the Sundance Kid (1969)     3.759227


In [None]:
# --- 1) Read ratings (u.data) and movie titles (u.item) ---
raw_ratings = pd.read_csv(
    "u.data",
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
)
ratings = raw_ratings[["user_id", "movie_id", "rating"]]

movies = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# --- 2) Users x movies table and its mean-centred version ---
UI = ratings.pivot_table(index="user_id", columns="movie_id", values="rating")
user_means = UI.mean(axis=1)
UI_centered = UI.sub(user_means, axis=0)

# User-user cosine similarity on the centred table (missing entries -> 0),
# computed once so the helpers below can simply look values up.
UIc_filled = UI_centered.fillna(0.0).astype(np.float32)
centered_similarity = cosine_similarity(UIc_filled.to_numpy())
user_sim = pd.DataFrame(centered_similarity, index=UI.index, columns=UI.index)

# --- 3) Helpers: neighbors, predictions, recommendations ---
def _nearest_neighbors(target_uid, k=30, min_overlap=5):
    """Select up to k neighbours: positive similarity and >= min_overlap shared ratings.

    Returns a Series of similarities indexed by neighbour id, highest first.
    Raises ValueError when target_uid is not a row of UI.
    """
    if target_uid not in UI.index:
        raise ValueError(f"user_id {target_uid} not found.")

    # For every user, count the movies rated by both that user and the target.
    rated_by_target = UI.loc[target_uid].notna().astype(int)
    shared_counts = UI.notna().dot(rated_by_target)

    # The target must not be its own neighbour.
    others = user_sim.loc[target_uid].drop(index=target_uid)
    qualifies = (shared_counts.drop(index=target_uid) >= min_overlap) & (others > 0)

    ranked = others[qualifies].sort_values(ascending=False)
    return ranked.head(k)

def _predict_all_items_for_user(target_uid, k=30, min_overlap=5, shrink=10.0):
    """Score every movie in UI.columns for target_uid, clipped to 1..5.

    Score = target's mean rating + similarity-weighted average of the
    neighbours' mean-centred ratings (``shrink`` added to the denominator).
    Without any qualifying neighbour, every movie gets the global mean rating.
    """
    neighbours = _nearest_neighbors(target_uid, k=k, min_overlap=min_overlap)
    if neighbours.empty:
        return pd.Series(ratings["rating"].mean(), index=UI.columns)

    ids = neighbours.index
    weights = neighbours.to_numpy(dtype=np.float32).reshape(-1, 1)   # n_neighbors x 1

    # Neighbour ratings relative to each neighbour's own mean (NaN = unrated).
    deviations = UI.loc[ids].sub(user_means.loc[ids], axis=0)

    # Unrated cells add nothing to the numerator ...
    numerator = np.sum(
        weights * deviations.fillna(0.0).to_numpy(dtype=np.float32), axis=0
    )
    # ... and only neighbours that rated a movie add weight to its denominator.
    rated = deviations.notna().to_numpy(dtype=np.float32)
    denominator = np.sum(np.abs(weights) * rated, axis=0) + shrink

    target_mean = float(user_means.loc[target_uid])
    scores = pd.Series(target_mean + numerator / denominator, index=UI.columns)
    return scores.clip(lower=1.0, upper=5.0)

def recommend_top_unseen(target_uid, top_n=10, k=30, min_overlap=5, shrink=10.0):
    """Rank the movies target_uid has not rated and return the top_n with titles.

    The result has columns movie_id, title and pred_rating.
    """
    scores = _predict_all_items_for_user(
        target_uid, k=k, min_overlap=min_overlap, shrink=shrink
    )

    # Movies whose cell in the user's row is NaN have not been rated yet.
    user_row = UI.loc[target_uid]
    unseen = user_row.index[user_row.isna().to_numpy()]

    best = scores.loc[unseen].sort_values(ascending=False).head(top_n).reset_index()
    best.columns = ["movie_id", "pred_rating"]

    # Left join keeps every recommendation even if a title is missing.
    titled = best.merge(movies, on="movie_id", how="left")
    return titled[["movie_id", "title", "pred_rating"]]

# --- Example ---
# Takes the first user id in UI.index; any other existing id (e.g. 100) works.
user_id = int(UI.index[0])
demo_settings = dict(top_n=10, k=40, min_overlap=5, shrink=15.0)
recs = recommend_top_unseen(user_id, **demo_settings)
print(f"Top recommendations for user {user_id}:")
print(recs)


Top recommendations for user 1:
   movie_id                                              title  pred_rating
0       318                            Schindler's List (1993)     3.837789
1       483                                  Casablanca (1942)     3.827525
2       651                                       Glory (1989)     3.821201
3       474  Dr. Strangelove or: How I Learned to Stop Worr...     3.791263
4       408                              Close Shave, A (1995)     3.790018
5       357             One Flew Over the Cuckoo's Nest (1975)     3.782414
6       603                                 Rear Window (1954)     3.774031
7       302                           L.A. Confidential (1997)     3.762339
8       484                         Maltese Falcon, The (1941)     3.762025
9       435          Butch Cassidy and the Sundance Kid (1969)     3.759227


In [None]:
# ===== Precision@K for User-based CF =====

# ----------------------------
# 1) Load ratings (+ titles)
# ----------------------------
rating_fields = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("u.data", sep="\t", header=None, names=rating_fields)
ratings = ratings[["user_id", "movie_id", "rating"]]

movies = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# -----------------------------------------
# 2) Build a per-user train/test holdout
# -----------------------------------------
# Evaluation settings. Note that the held-out rows are later drawn from all of
# a user's ratings, whatever their value, so every held-out movie counts as a
# "hit" target regardless of how the user rated it.
min_interactions = 10     # ratings a user must keep for training
n_holdout = 5             # ratings removed per user for testing
rng = np.random.default_rng(42)

# A user is evaluated only if enough ratings remain after the holdout.
counts = ratings.groupby("user_id")["movie_id"].count()
required_ratings = min_interactions + n_holdout
eligible_users = counts.loc[counts >= required_ratings].index

# sample holdout positives per user
def sample_holdout(g):
    """Flag ``n_holdout`` randomly chosen rows of one user's ratings as test rows.

    ``g`` holds the ratings of a single user. A copy is returned with an extra
    boolean ``is_test`` column; the rows are drawn without replacement using
    the module-level ``rng``.
    """
    held_out = rng.choice(g.index.to_numpy(), size=n_holdout, replace=False)
    # assign() works on a copy, so the caller's frame is left untouched.
    return g.assign(is_test=g.index.isin(held_out))

# Run the per-user sampler group by group and stitch the pieces back together.
# Iterating the groups explicitly (rather than ``groupby(...).apply``) always
# hands ``sample_holdout`` the complete rows, including ``user_id``.
# ``DataFrameGroupBy.apply`` is deprecated for frames that still contain the
# grouping column and drops that column in newer pandas, which would break the
# ``groupby("user_id")`` further down.
eligible_ratings = ratings[ratings["user_id"].isin(eligible_users)]
split = pd.concat(
    [sample_holdout(user_rows) for _, user_rows in eligible_ratings.groupby("user_id")]
)
# Groups are visited in sorted user order; put the rows back in file order.
split = split.reindex(eligible_ratings.index)

train = split[~split["is_test"]].drop(columns=["is_test"])
test = split[split["is_test"]].drop(columns=["is_test"])

# Held-out movies per user, as a Series of sets indexed by user_id.
test_pos = test.groupby("user_id")["movie_id"].apply(set)

# ---------------------------------------------------------
# 3) Build user–item matrix & similarity from TRAIN only
# ---------------------------------------------------------
# Everything below is derived from the training rows only, so the held-out
# ratings cannot influence the similarities or the predictions.
UI = pd.pivot_table(train, index="user_id", columns="movie_id", values="rating")
user_means = UI.mean(axis=1)
UI_centered = UI.subtract(user_means, axis=0)

# Zero-fill the centred table and compare users by cosine similarity.
UIc_filled = UI_centered.fillna(0.0).astype(np.float32)
train_similarity = cosine_similarity(UIc_filled.to_numpy())
user_sim = pd.DataFrame(train_similarity, index=UI.index, columns=UI.index)

# ---------------------------------------------------------
# 4) Recommender (same logic, but using TRAIN matrices)
# ---------------------------------------------------------
def _nearest_neighbors(target_uid, k=30, min_overlap=5):
    """Return up to k training-set neighbours of target_uid, most similar first.

    Neighbours need a strictly positive similarity and at least min_overlap
    movies rated in common with the target. An empty Series is returned when
    the target has no row in the training matrix.
    """
    if target_uid not in UI.index:
        # Not expected with the eligibility filter, but handled gracefully.
        return pd.Series(dtype=float)

    # Movies in common with the target, counted for every user at once.
    target_flags = UI.loc[target_uid].notna().astype(int)
    common_counts = UI.notna().dot(target_flags)

    similarities = user_sim.loc[target_uid].drop(index=target_uid)
    has_overlap = common_counts.drop(index=target_uid) >= min_overlap
    selected = similarities[has_overlap & (similarities > 0)]
    return selected.sort_values(ascending=False).head(k)

def _predict_all_items_for_user(target_uid, k=30, min_overlap=5, shrink=10.0):
    """Score every training-set movie for target_uid, clipped to 1..5.

    Score = target's mean + similarity-weighted average of the neighbours'
    mean-centred ratings, with ``shrink`` added to the denominator. When no
    neighbour qualifies, all movies receive the mean rating of ``train``.
    """
    neighbours = _nearest_neighbors(target_uid, k=k, min_overlap=min_overlap)
    if neighbours.empty:
        return pd.Series(train["rating"].mean(), index=UI.columns)

    ids = neighbours.index
    sim_column = neighbours.to_numpy(dtype=np.float32).reshape(-1, 1)  # n_neighbors x 1

    # Ratings of the neighbours relative to their own means (NaN = unrated).
    centred = UI.loc[ids].sub(user_means.loc[ids], axis=0)
    centred_zeroed = centred.fillna(0.0).to_numpy(dtype=np.float32)    # n_neighbors x n_items

    weighted_sum = np.sum(sim_column * centred_zeroed, axis=0)

    # Only neighbours that actually rated a movie contribute to its weight.
    rated_flags = centred.notna().to_numpy(dtype=np.float32)
    weight_sum = np.sum(np.abs(sim_column) * rated_flags, axis=0) + shrink

    target_mean = float(user_means.loc[target_uid])
    scores = pd.Series(target_mean + weighted_sum / weight_sum, index=UI.columns)
    return scores.clip(lower=1.0, upper=5.0)

def recommend_top_unseen_train(target_uid, top_n=10, k=30, min_overlap=5, shrink=10.0):
    """Return the top_n movies not rated by target_uid in the training data.

    The result is a DataFrame with columns movie_id and pred_rating, ordered
    from highest to lowest predicted rating.
    """
    scores = _predict_all_items_for_user(
        target_uid, k=k, min_overlap=min_overlap, shrink=shrink
    )

    # Movies already rated in TRAIN are not valid recommendations.
    if target_uid in UI.index:
        rated_in_train = UI.loc[target_uid].dropna().index
    else:
        rated_in_train = pd.Index([])

    ranked = scores.drop(index=rated_in_train, errors="ignore").sort_values(ascending=False)
    top = ranked.head(top_n).reset_index()
    top.columns = ["movie_id", "pred_rating"]
    return top

# ---------------------------------------------------------
# 5) Precision@K evaluation
# ---------------------------------------------------------
def precision_at_k_for_user(uid, K=10, k_nbrs=30, min_overlap=5, shrink=10.0):
    """Precision@K for one user: held-out movies found in the top K, divided by K.

    Returns NaN for users that have no entry in ``test_pos``.
    """
    if uid in test_pos:
        recs = recommend_top_unseen_train(
            uid, top_n=K, k=k_nbrs, min_overlap=min_overlap, shrink=shrink
        )
        held_out = test_pos[uid]
        return recs["movie_id"].isin(held_out).sum() / float(K)
    # User is not part of the evaluation set.
    return np.nan

def evaluate_precision_at_k(K_list=(5, 10, 20), k_nbrs=40, min_overlap=5, shrink=15.0):
    """Summarise Precision@K over all evaluable users for several cut-offs.

    Args:
        K_list: Cut-offs K to report.
        k_nbrs: Number of neighbours used by the recommender.
        min_overlap: Minimum number of co-rated movies for a neighbour.
        shrink: Shrinkage constant passed on to the recommender.

    Returns:
        A DataFrame indexed by K with columns ``mean_precision``,
        ``median_precision`` and ``users_evaluated``; empty when ``K_list``
        is empty.
    """
    cutoffs = list(K_list)
    if not cutoffs:
        return pd.DataFrame()

    users = sorted(test_pos.index.intersection(UI.index))

    # The recommender ranks all candidates and then truncates, so the top-K
    # list is a prefix of the list for the largest cut-off. Rank once per user
    # and record which positions are held-out movies.
    deepest = max(cutoffs)
    hit_flags = {}
    for u in users:
        ranked = recommend_top_unseen_train(
            u, top_n=deepest, k=k_nbrs, min_overlap=min_overlap, shrink=shrink
        )
        hit_flags[u] = ranked["movie_id"].isin(test_pos[u]).to_numpy()

    results = {}
    for K in cutoffs:
        vals = pd.Series(
            [hit_flags[u][:K].sum() / float(K) for u in users],
            index=users,
            dtype=float,
        )
        results[K] = {
            "mean_precision": float(vals.mean()),
            "median_precision": float(vals.median()),
            "users_evaluated": int(vals.shape[0]),
        }

    # orient="index" builds one row per K and keeps users_evaluated an integer
    # column (transposing a column-wise frame upcasts it to float).
    return pd.DataFrame.from_dict(results, orient="index")

# -------------------------
# 6) Run the evaluation
# -------------------------
report = evaluate_precision_at_k(K_list=(5, 10, 20), k_nbrs=40, min_overlap=5, shrink=15.0)
print("\nPrecision@K report (user-based CF):")
print(report)

# Optional: preview the recommendations of one evaluated user and flag which
# of them are among that user's held-out movies.
sample_user = test_pos.index[0]
preview = recommend_top_unseen_train(sample_user, top_n=10, k=40, min_overlap=5, shrink=15.0)
preview = preview.merge(movies, on="movie_id", how="left")
preview["is_holdout_positive"] = preview["movie_id"].isin(test_pos[sample_user])
print(f"\nTop-10 recommendations for user {sample_user} (✓ = in holdout):")
print(preview[["movie_id", "title", "pred_rating", "is_holdout_positive"]])


  .apply(sample_holdout)



Precision@K report (user-based CF):
    mean_precision  median_precision  users_evaluated
5         0.091198              0.00            943.0
10        0.068611              0.10            943.0
20        0.047402              0.05            943.0

Top-10 recommendations for user 1 (✓ = in holdout):
   movie_id                                              title  pred_rating  \
0       173                         Princess Bride, The (1987)     3.867249   
1       318                            Schindler's List (1993)     3.822474   
2       651                                       Glory (1989)     3.817047   
3       483                                  Casablanca (1942)     3.810419   
4       357             One Flew Over the Cuckoo's Nest (1975)     3.772603   
5       408                              Close Shave, A (1995)     3.767138   
6       474  Dr. Strangelove or: How I Learned to Stop Worr...     3.766849   
7       603                                 Rear Window (1954)

In [None]:
# ===== Item-based CF=====

# 1) Load ratings (+ titles)
ratings_raw = pd.read_csv(
    "u.data",
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
)
ratings = ratings_raw[["user_id", "movie_id", "rating"]]

movies = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# 2) Users x movies rating table (NaN = not rated)
UI = ratings.pivot_table(index="user_id", columns="movie_id", values="rating")

# 3) Adjusted cosine: subtract each *user's* mean before comparing items, so a
#    user's general generosity does not inflate item similarities.
user_means = UI.mean(axis=1)
UI_centered = UI.sub(user_means, axis=0)

# 4) Item-item cosine similarity on the centred table. Missing ratings are
#    zero-filled, and the table is transposed so that items become the rows
#    being compared.
IIc = UI_centered.fillna(0.0).astype(np.float32)
item_vectors = IIc.T.to_numpy()
item_sim = pd.DataFrame(
    cosine_similarity(item_vectors),
    index=UI.columns,
    columns=UI.columns,
)

# ---------- Core prediction (item-based) ----------
def predict_user_scores_item_based(target_uid, k=50, shrink=10.0, min_sim=0.0):
    """
    Item-based CF scores for every movie, for one user.

    For a candidate movie j the score is
        mean_user + sum_i( sim(j,i) * dev_user(i) ) / ( sum_i |sim(j,i)| + shrink )
    where i runs over the user's rated movies most similar to j and
    dev_user(i) = rating_user(i) - mean_user.

    - k: number of neighbour items (largest |similarity|) used per candidate
    - shrink: constant added to the denominator
    - min_sim: neighbours whose |similarity| is below this value are ignored

    Returns a pandas Series (index = movie_id) clipped to 1..5. A user without
    any rating gets the global mean rating for every movie (unclipped).
    Raises ValueError when target_uid is not a row of UI.
    """
    if target_uid not in UI.index:
        raise ValueError(f"user_id {target_uid} not found.")

    observed = UI.loc[target_uid].dropna()
    if observed.empty:
        # Cold start: nothing to build on, so use one global constant.
        return pd.Series(ratings["rating"].mean(), index=UI.columns)

    user_mean = float(user_means.loc[target_uid])
    user_dev = observed - user_mean  # deviation per rated movie

    # Similarities from every movie (rows) to the movies this user rated (columns).
    sims_to_rated = item_sim.loc[:, observed.index]

    def _score(candidate):
        """Score one candidate movie from its neighbours among the rated movies."""
        nbr = sims_to_rated.loc[candidate]
        nbr = nbr[nbr.index != candidate]      # a movie is not its own neighbour
        nbr = nbr[nbr.abs() >= min_sim]
        if nbr.empty:
            return user_mean                   # nothing to go on: user's mean

        # Keep the k neighbours with the largest absolute similarity.
        strongest = nbr.abs().sort_values(ascending=False).head(k).index
        nbr = nbr.reindex(strongest)
        aligned_dev = user_dev.reindex(nbr.index)

        weighted = (nbr.values * aligned_dev.values).sum()
        total_weight = np.abs(nbr.values).sum() + shrink
        return user_mean + weighted / total_weight

    # Rated movies are scored too; callers drop them afterwards.
    raw_scores = {candidate: _score(candidate) for candidate in item_sim.index}

    # MovieLens 100K ratings live on a 1..5 scale.
    return pd.Series(raw_scores, index=item_sim.index).clip(lower=1.0, upper=5.0)

def recommend_item_based(target_uid, top_n=10, k=50, shrink=10.0, min_sim=0.0):
    """
    Return the top_n item-based recommendations among movies the user has not rated.

    The result has columns movie_id, title and pred_rating.
    """
    scores = predict_user_scores_item_based(
        target_uid, k=k, shrink=shrink, min_sim=min_sim
    )

    # Remove everything the user has already rated before ranking.
    already_rated = UI.loc[target_uid].dropna().index
    unseen_scores = scores.drop(index=already_rated, errors="ignore")

    best = unseen_scores.sort_values(ascending=False).head(top_n).reset_index()
    best.columns = ["movie_id", "pred_rating"]

    titled = best.merge(movies, on="movie_id", how="left")
    return titled[["movie_id", "title", "pred_rating"]]

# ===== Example =====
# First user id present in the rating table.
example_user = int(UI.index[0])
item_cf_settings = dict(top_n=10, k=75, shrink=15.0, min_sim=0.05)
recs = recommend_item_based(example_user, **item_cf_settings)
print(f"\nTop item-based recommendations for user {example_user}:")
print(recs)



Top item-based recommendations for user 1:
   movie_id                                              title  pred_rating
0       483                                  Casablanca (1942)     4.043228
1       603                                 Rear Window (1954)     4.035880
2       479                                     Vertigo (1958)     4.026748
3       357             One Flew Over the Cuckoo's Nest (1975)     4.022097
4       408                              Close Shave, A (1995)     4.014354
5       651                                       Glory (1989)     4.001076
6       511                          Lawrence of Arabia (1962)     3.998182
7       435          Butch Cassidy and the Sundance Kid (1969)     3.997117
8       302                           L.A. Confidential (1997)     3.997114
9       474  Dr. Strangelove or: How I Learned to Stop Worr...     3.995288


In [None]:
# ===== Matrix Factorization (FunkSVD with biases, SGD) =====

# 1) Load ratings (+ titles)
all_fields = ["user_id", "movie_id", "rating", "timestamp"]
ratings = pd.read_csv("u.data", sep="\t", header=None, names=all_fields)
ratings = ratings[["user_id", "movie_id", "rating"]]

movies = pd.read_csv(
    "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    usecols=[0, 1],
    names=["movie_id", "title"],
)

# Raw ids are translated to dense positions 0..n-1 (in sorted id order) so
# they can index the bias vectors and factor matrices directly.
user_ids = ratings["user_id"].unique()
item_ids = ratings["movie_id"].unique()
sorted_user_ids = sorted(user_ids)
sorted_item_ids = sorted(item_ids)
uid_map = dict(zip(sorted_user_ids, range(len(sorted_user_ids))))   # raw user id -> position
iid_map = dict(zip(sorted_item_ids, range(len(sorted_item_ids))))   # raw movie id -> position
rid_map = dict(enumerate(sorted_user_ids))                          # position -> raw user id
mid_map = dict(enumerate(sorted_item_ids))                          # position -> raw movie id

ratings["u_idx"] = ratings["user_id"].map(uid_map)
ratings["i_idx"] = ratings["movie_id"].map(iid_map)

n_users = len(uid_map)
n_items = len(iid_map)

# 2) Hold out 10% of the observed ratings, stratified by user so every user's
#    ratings are split in the same proportion.
train_df, test_df = train_test_split(
    ratings,
    test_size=0.1,
    random_state=42,
    stratify=ratings["user_id"],
)

print(f"Users={n_users}, Items={n_items}, Train={len(train_df)}, Test={len(test_df)}")

# 3) FunkSVD with biases (SGD)
class FunkSVD:
    """Biased matrix factorisation trained with stochastic gradient descent.

    A rating is modelled as ``mu + bu[u] + bi[i] + P[u] . Q[i]`` where ``mu``
    is the mean training rating, ``bu``/``bi`` are user/item biases and
    ``P``/``Q`` are the user/item factor matrices.
    """

    def __init__(self, n_factors=50, n_epochs=20, lr=0.01, reg=0.02, seed=42):
        """Store the hyper-parameters and create the random generator.

        Args:
            n_factors: Number of latent factors per user and per item.
            n_epochs: Number of passes over the training ratings.
            lr: SGD learning rate.
            reg: L2 regularisation strength applied to biases and factors.
            seed: Seed for factor initialisation and per-epoch shuffling.
        """
        self.k = n_factors
        self.n_epochs = n_epochs
        self.lr = lr
        self.reg = reg
        self.rng = np.random.default_rng(seed)

    def fit(self, df, n_users, n_items):
        """Train on ``df`` and return ``self``.

        Args:
            df: DataFrame with columns ``u_idx``, ``i_idx`` (integer positions
                into the parameter arrays) and ``rating``.
            n_users: Number of rows to allocate for user parameters.
            n_items: Number of rows to allocate for item parameters.

        Prints the training RMSE after the first epoch and every fifth epoch.
        """
        self.n_users = n_users
        self.n_items = n_items

        self.mu = df["rating"].mean()
        self.bu = np.zeros(n_users, dtype=np.float32)
        self.bi = np.zeros(n_items, dtype=np.float32)
        self.P = 0.1 * self.rng.standard_normal((n_users, self.k), dtype=np.float32)
        self.Q = 0.1 * self.rng.standard_normal((n_items, self.k), dtype=np.float32)

        u = df["u_idx"].to_numpy(np.int32)
        i = df["i_idx"].to_numpy(np.int32)
        r = df["rating"].to_numpy(np.float32)

        for epoch in range(self.n_epochs):
            # Visit the ratings in a fresh random order every epoch.
            idx = self.rng.permutation(len(df))
            u_sh, i_sh, r_sh = u[idx], i[idx], r[idx]

            for uu, ii, rr in zip(u_sh, i_sh, r_sh):
                pred = self.mu + self.bu[uu] + self.bi[ii] + np.dot(self.P[uu], self.Q[ii])
                err = rr - pred

                # Snapshot both factor rows. Plain row indexing returns a
                # *view*, so without the copies the in-place update of P[uu]
                # below would leak into the Q[ii] update, which must use the
                # pre-update user factors.
                Pu = self.P[uu].copy()
                Qi = self.Q[ii].copy()

                # Gradient steps with L2 regularisation.
                self.bu[uu] += self.lr * (err - self.reg * self.bu[uu])
                self.bi[ii] += self.lr * (err - self.reg * self.bi[ii])
                self.P[uu] += self.lr * (err * Qi - self.reg * Pu)
                self.Q[ii] += self.lr * (err * Pu - self.reg * Qi)

            # Progress report: training RMSE after epoch 1 and every 5 epochs.
            if (epoch + 1) % 5 == 0 or epoch == 0:
                rmse_tr = self.rmse(df)
                print(f"Epoch {epoch+1:02d}/{self.n_epochs} | Train RMSE: {rmse_tr:.4f}")

        return self

    def predict_pair(self, u_idx, i_idx):
        """Return the predicted rating for one (user, item) position pair, clipped to 1..5."""
        x = (self.mu + self.bu[u_idx] + self.bi[i_idx] +
             np.dot(self.P[u_idx], self.Q[i_idx]))
        return float(np.clip(x, 1.0, 5.0))

    def rmse(self, df):
        """Return the RMSE of the clipped predictions against ``df["rating"]``.

        ``df`` needs the same ``u_idx`` / ``i_idx`` / ``rating`` columns as in ``fit``.
        """
        u = df["u_idx"].to_numpy(np.int32)
        i = df["i_idx"].to_numpy(np.int32)
        r = df["rating"].to_numpy(np.float32)
        preds = self.mu + self.bu[u] + self.bi[i] + np.sum(self.P[u] * self.Q[i], axis=1)
        preds = np.clip(preds, 1.0, 5.0)
        return float(np.sqrt(np.mean((r - preds) ** 2)))

    def full_user_scores(self, u_idx):
        """Return clipped predicted ratings of one user for all items (array of length n_items)."""
        scores = self.mu + self.bu[u_idx] + self.bi + self.P[u_idx] @ self.Q.T
        return np.clip(scores, 1.0, 5.0)

# 4) Train the model
# Configure the model first, then train it on the training split only.
svd = FunkSVD(n_factors=50, n_epochs=20, lr=0.01, reg=0.05, seed=42)
svd.fit(train_df, n_users=n_users, n_items=n_items)

# 5) Generalisation check: RMSE on the held-out ratings.
rmse_te = svd.rmse(test_df)
print(f"\nTest RMSE: {rmse_te:.4f}")

# 6) Recommend top-N unseen movies for a given user
def recommend_for_user_svd(raw_user_id, top_n=10):
    """Recommend the best-scoring movies the user has not rated, using ``svd``.

    Args:
        raw_user_id: User id as it appears in ``ratings["user_id"]``.
        top_n: Maximum number of recommendations. Fewer rows are returned when
            the user has fewer unrated movies; values <= 0 give an empty frame.

    Returns:
        DataFrame with columns ``movie_id``, ``title`` and ``pred_rating``,
        ordered from highest to lowest predicted rating.

    Raises:
        ValueError: If ``raw_user_id`` is not a key of ``uid_map``.
    """
    if raw_user_id not in uid_map:
        raise ValueError("Unknown user_id")

    u_idx = uid_map[raw_user_id]

    # Items the user has rated anywhere in the data (train and test alike).
    seen_items = ratings.loc[ratings["user_id"] == raw_user_id, "i_idx"].unique()
    seen_mask = np.zeros(n_items, dtype=bool)
    seen_mask[seen_items] = True

    scores = svd.full_user_scores(u_idx)

    # Rank only the unseen items, so a rated movie can never be returned and a
    # top_n larger than the candidate pool is simply capped.
    candidate_iidx = np.flatnonzero(~seen_mask)
    n_recs = min(max(int(top_n), 0), candidate_iidx.size)

    # A stable sort breaks ties (frequent once scores are clipped to 5.0) by
    # item position, which keeps the output reproducible.
    order = np.argsort(-scores[candidate_iidx], kind="stable")[:n_recs]
    top_iidx = candidate_iidx[order]

    out = pd.DataFrame({
        # Explicit dtype keeps the merge key compatible even when empty.
        "movie_id": pd.Series(
            [mid_map[i] for i in top_iidx], dtype=ratings["movie_id"].dtype
        ),
        "pred_rating": scores[top_iidx],
    })
    out = out.merge(movies, on="movie_id", how="left")[["movie_id", "title", "pred_rating"]]
    return out

# ===== Example =====
# Use the user of the first row in the ratings file.
first_rating_user = ratings["user_id"].iloc[0]
example_user = int(first_rating_user)
recs = recommend_for_user_svd(example_user, top_n=10)
print(f"\nTop SVD-based recommendations for user {example_user}:")
print(recs)


Users=943, Items=1682, Train=90000, Test=10000
Epoch 01/20 | Train RMSE: 0.9570
Epoch 05/20 | Train RMSE: 0.8922
Epoch 10/20 | Train RMSE: 0.8265
Epoch 15/20 | Train RMSE: 0.7419
Epoch 20/20 | Train RMSE: 0.6666

Test RMSE: 0.9095

Top SVD-based recommendations for user 196:
   movie_id                                    title  pred_rating
0        64         Shawshank Redemption, The (1994)     4.755151
1       318                  Schindler's List (1993)     4.668643
2       603                       Rear Window (1954)     4.573271
3      1449                   Pather Panchali (1955)     4.516722
4       197                     Graduate, The (1967)     4.495232
5       313                           Titanic (1997)     4.489260
6       963  Some Folks Call It a Sling Blade (1993)     4.453530
7        69                      Forrest Gump (1994)     4.444258
8       480                North by Northwest (1959)     4.423979
9       180                    Apocalypse Now (1979)     4.41215