# 03) Matrix Factorization (SVD-style) — MovieLens 100K

เป้าหมาย: เทรนโมเดลแบบ FunkSVD (มี bias) เพื่อพยากรณ์เรตติ้งและวัดผลด้วย RMSE + Precision@K/Recall@K

In [48]:
from pathlib import Path
import os, json
import numpy as np
import pandas as pd

# The notebook may be started from inside notebooks/; if so, move up one level
# so that every relative path below is anchored at the project root.
if Path.cwd().name == "notebooks":
    os.chdir("..")

# Project layout: data/ holds the inputs, results/ receives the outputs.
PROJ = Path.cwd()
DATA, RESULTS = PROJ / "data", PROJ / "results"
RESULTS.mkdir(exist_ok=True)  # exist_ok lets the cell be re-run safely

def find_final_csv():
    """Locate ``final_data.csv`` in the project or one/two levels above it.

    Candidates are probed in this order: ``DATA / final_data.csv``, then the
    ``data`` folder beside the project's parent, then beside its grandparent.

    Returns:
        The first candidate path that exists on disk.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    filename = "final_data.csv"
    candidates = [DATA / filename]
    candidates += [root / "data" / filename for root in (PROJ.parent, PROJ.parent.parent)]

    # First existing candidate wins; None signals that nothing was found.
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError("ไม่พบ data/final_data.csv — กลับไปรัน 01_data_cleaning.ipynb เพื่อ export ก่อน")
    return found

# Resolve the dataset location once; raises FileNotFoundError if it is missing.
CSV_PATH = find_final_csv()
# Print the absolute path so the run log records which file was actually used.
print("Using:", CSV_PATH.resolve())


Using: G:\BU\PhiphatD_Web_Portfolio\Recommendation-System-MovieLens-100K-\data\final_data.csv


# 3.1) Load dataset

โหลดข้อมูลและตรวจคอลัมน์สำคัญ

In [49]:
df = pd.read_csv(CSV_PATH)

# Derive integer ids here when the export does not already carry both of them.
if "user_idx" not in df.columns or "item_idx" not in df.columns:
    from sklearn.preprocessing import LabelEncoder
    uenc, ienc = LabelEncoder(), LabelEncoder()
    df["user_idx"] = uenc.fit_transform(df["user_id"])
    df["item_idx"] = ienc.fit_transform(df["item_id"])

# Keep only what the model needs, plus the title (when present) for display.
keep_cols = ["user_idx", "item_idx", "rating"]
if "movie_title" in df.columns:
    keep_cols.append("movie_title")
df = df[keep_cols].copy()

# Compact dtypes: 32-bit ints for the indices, 32-bit floats for the ratings.
df = df.astype({"user_idx": np.int32, "item_idx": np.int32, "rating": np.float32})

df.head()


Unnamed: 0,user_idx,item_idx,rating,movie_title
0,195,241,3.0,Kolya (1996)
1,185,301,3.0,L.A. Confidential (1997)
2,21,376,1.0,Heavyweights (1994)
3,243,50,2.0,Legends of the Fall (1994)
4,165,345,1.0,Jackie Brown (1997)


# 3.2) Quick sanity check

จำนวนผู้ใช้/หนัง/ความหนาแน่น rating

In [50]:
# Validate the schema first: the statistics below index these columns directly,
# so checking afterwards would let a KeyError pre-empt this message.
assert {"user_idx","item_idx","rating"}.issubset(df.columns), \
    "df must contain the columns user_idx, item_idx and rating"

# Sizes are taken as max index + 1.
# NOTE(review): assumes the indices are 0-based and contiguous (as a
# LabelEncoder would produce) - confirm for indices read from the CSV.
n_users = int(df["user_idx"].max()) + 1
n_items = int(df["item_idx"].max()) + 1
# Percentage of the user x item grid that actually holds a rating.
density = len(df) / (n_users * n_items) * 100

print(f"Users={n_users:,}, Items={n_items:,}, Ratings={len(df):,}, Density={density:.4f}%")


Users=943, Items=1,682, Ratings=99,991, Density=6.3041%


# 3.3) Train/Test split แบบ per-user

แบ่งข้อมูลแบบสุ่ม 80/20 ภายในผู้ใช้แต่ละคน เพื่อให้ผู้ใช้ทุกคนมีเรตติ้งอยู่ทั้งในชุด train และชุด test (โมเดลจึงไม่ต้องพยากรณ์ให้ผู้ใช้ที่ไม่เคยเห็นตอนเทรน)

In [51]:
from sklearn.model_selection import train_test_split

# Split each user's ratings 80/20 with a fixed seed, then stitch the per-user
# parts back together into one train frame and one test frame.
# A user with a single rating cannot be split (train_test_split raises
# ValueError because the train side would be empty), so that rating is kept
# for training and the user contributes an empty slice to the test set.
splits = [
    train_test_split(g, test_size=0.2, random_state=42) if len(g) > 1 else [g, g.iloc[:0]]
    for _, g in df.groupby("user_idx")
]
train_df = pd.concat([p[0] for p in splits], ignore_index=True)
test_df  = pd.concat([p[1] for p in splits], ignore_index=True)

print("train:", train_df.shape, " test:", test_df.shape)


train: (79610, 4)  test: (20381, 4)


In [52]:
#เตรียมอาร์เรย์สำหรับโมเดล ให้ NumPy จัดการตรง ๆ
u_tr = train_df["user_idx"].to_numpy(np.int32)
i_tr = train_df["item_idx"].to_numpy(np.int32)
r_tr = train_df["rating"].to_numpy(np.float32)

u_te = test_df["user_idx"].to_numpy(np.int32)
i_te = test_df["item_idx"].to_numpy(np.int32)
r_te = test_df["rating"].to_numpy(np.float32)


In [53]:
def funk_svd(u_tr, i_tr, r_tr, n_users, n_items,
             n_factors=50, n_epochs=10, lr=0.005, reg=0.02, seed=42):
    """Fit a biased matrix-factorization model with per-rating SGD.

    A rating is modelled as ``mu + bu[u] + bi[i] + P[u] @ Q[i]``. Every epoch
    visits all training ratings once, in a freshly shuffled order, and applies
    one gradient step per rating.

    Args:
        u_tr: User index of each training rating.
        i_tr: Item index of each training rating.
        r_tr: Rating values; must support ``.mean()`` and ``len()``.
        n_users: Number of rows in the user factor matrix / user bias vector.
        n_items: Number of rows in the item factor matrix / item bias vector.
        n_factors: Number of latent factors per user and per item.
        n_epochs: Number of passes over the training ratings.
        lr: Learning rate of every update.
        reg: L2 regularisation strength applied to biases and factors.
        seed: Seed for factor initialisation and for the per-epoch shuffling.

    Returns:
        Dict with keys ``"P"`` (user factors), ``"Q"`` (item factors),
        ``"bu"`` and ``"bi"`` (float32 bias vectors) and ``"mu"`` (global
        mean of ``r_tr`` as a Python float).
    """
    rng = np.random.default_rng(seed)
    # Draw order is part of the reproducible behaviour: P first, then Q.
    P = 0.1 * rng.standard_normal((n_users, n_factors))
    Q = 0.1 * rng.standard_normal((n_items, n_factors))
    bu = np.zeros(n_users, dtype=np.float32)
    bi = np.zeros(n_items, dtype=np.float32)
    mu = float(r_tr.mean())

    visit_order = np.arange(len(r_tr))
    for _ in range(n_epochs):
        rng.shuffle(visit_order)
        for row in visit_order:
            user = int(u_tr[row])
            item = int(i_tr[row])
            rating = float(r_tr[row])

            # Signed error of the current estimate for this rating.
            residual = rating - (mu + bu[user] + bi[item] + P[user] @ Q[item])

            # Bias step.
            bu[user] += lr * (residual - reg*bu[user])
            bi[item] += lr * (residual - reg*bi[item])

            # Factor step: snapshot both vectors so each update is computed
            # from the other side's pre-update values.
            p_old = P[user].copy()
            q_old = Q[item].copy()
            P[user] += lr * (residual*q_old - reg*p_old)
            Q[item] += lr * (residual*p_old - reg*q_old)

    return {"P": P, "Q": Q, "bu": bu, "bi": bi, "mu": mu}

def predict(params, u, i):
    """Predict the rating of user ``u`` for item ``i``, clipped to [1, 5].

    Args:
        params: Dict with keys ``"P"``, ``"Q"``, ``"bu"``, ``"bi"`` and ``"mu"``.
        u: User index.
        i: Item index.

    Returns:
        ``mu + bu[u] + bi[i] + P[u] @ Q[i]`` limited to the range 1.0-5.0,
        as a Python float.
    """
    baseline = params["mu"] + params["bu"][u] + params["bi"][i]
    raw_score = baseline + params["P"][u] @ params["Q"][i]
    return float(np.clip(raw_score, 1.0, 5.0))

def rmse(params, u, i, r):
    """Root-mean-squared error of the clipped predictions against ``r``.

    Args:
        params: Model parameters, passed through to ``predict``.
        u: User index of each rating.
        i: Item index of each rating (paired element-wise with ``u``).
        r: True ratings.

    Returns:
        The RMSE as a Python float.
    """
    # One scalar prediction per (user, item) pair, stored as float32.
    estimates = np.fromiter(
        (predict(params, int(user), int(item)) for user, item in zip(u, i)),
        dtype=np.float32,
    )
    squared_error = (r - estimates) ** 2
    return float(np.sqrt(np.mean(squared_error)))


In [54]:
# Train the baseline model on the training triples with explicit hyper-parameters.
params = funk_svd(
    u_tr, i_tr, r_tr,
    n_users=n_users, n_items=n_items,
    n_factors=50, n_epochs=10, lr=0.005, reg=0.02, seed=42
)
# Score the trained model on the held-out test triples.
rmse_test = rmse(params, u_te, i_te, r_te)
print(f"RMSE(test) = {rmse_test:.4f}")


RMSE(test) = 0.9435


In [55]:
def topk_for_user(params, u, seen_items=None, k=10):
    """Return the indices of the ``k`` highest-scoring items for user ``u``.

    Every item is scored as ``mu + bu[u] + bi + P[u] @ Q.T``. Items listed in
    ``seen_items`` are excluded from the result.

    Args:
        params: Dict with keys ``"P"``, ``"Q"``, ``"bu"``, ``"bi"`` and ``"mu"``.
        u: User index.
        seen_items: Optional iterable of item indices to exclude (set, list,
            NumPy array, ...). ``None`` or an empty iterable excludes nothing.
        k: Maximum number of items to return.

    Returns:
        1-D integer array of item indices ordered from best to worst score.
        It holds fewer than ``k`` entries when fewer items have a finite
        score, and is empty when ``k <= 0`` or nothing is left to recommend.
    """
    # Score every item for this user; the arithmetic yields a fresh array,
    # so masking it below does not touch the model parameters.
    scores = params["mu"] + params["bu"][u] + params["bi"] + params["P"][u] @ params["Q"].T

    # Compare against None rather than using truthiness: a NumPy array is
    # ambiguous (or wrongly falsy for array([0])) in a boolean context.
    if seen_items is not None:
        seen_idx = np.fromiter(seen_items, dtype=np.intp)
        if seen_idx.size:
            scores[seen_idx] = -np.inf  # never recommend already-rated items

    # Only items with a finite score can be recommended.
    k_eff = min(k, int(np.isfinite(scores).sum()))
    if k_eff <= 0:
        return np.empty(0, dtype=np.intp)

    # Partial selection of the k_eff best, then an exact sort of just those.
    idx = np.argpartition(-scores, k_eff - 1)[:k_eff]
    return idx[np.argsort(-scores[idx])]

def precision_recall_at_k(params, train_df, test_df, k=10, threshold=4.0):
    """Mean Precision@k and Recall@k over the users present in ``test_df``.

    For each test user the top-``k`` list is built from items the user did
    not rate in ``train_df``; a test item counts as relevant when its rating
    is at least ``threshold``. A user with no relevant test item contributes
    0 to both averages; a user whose top-k list is empty is skipped.

    Args:
        params: Model parameters, passed through to ``topk_for_user``.
        train_df: Frame with ``user_idx`` and ``item_idx`` columns.
        test_df: Frame with ``user_idx``, ``item_idx`` and ``rating`` columns.
        k: Length of the recommendation list.
        threshold: Minimum rating for a test item to count as relevant.

    Returns:
        Tuple ``(mean_precision, mean_recall)`` of Python floats.
    """
    # Items each user already rated in training; these are never recommended.
    train_items = train_df.groupby("user_idx")["item_idx"].apply(set).to_dict()

    per_user_precision = []
    per_user_recall = []
    for user_key, user_rows in test_df.groupby("user_idx"):
        user = int(user_key)
        recommended = set(topk_for_user(params, user, train_items.get(user, set()), k))
        if not recommended:
            continue

        liked = user_rows["rating"] >= threshold
        relevant = set(user_rows.loc[liked, "item_idx"].astype(int))

        hits = len(recommended & relevant)
        per_user_precision.append(hits / len(recommended))
        # max(..., 1) avoids dividing by zero when the user has no relevant item.
        per_user_recall.append(hits / max(len(relevant), 1))

    return float(np.mean(per_user_precision)), float(np.mean(per_user_recall))

# Ranking quality of the baseline model: top-10 lists, ratings >= 4.0 count as relevant.
P10, R10 = precision_recall_at_k(params, train_df, test_df, k=10, threshold=4.0)
print(f"Precision@10={P10:.4f}  Recall@10={R10:.4f}")


Precision@10=0.0635  Recall@10=0.0435


In [56]:
# mapping item_idx -> movie_title (ถ้ามี)
title_map = {}
if "movie_title" in df.columns:
    title_map = df.drop_duplicates("item_idx").set_index("item_idx")["movie_title"].to_dict()

sample_user = int(train_df.sample(1, random_state=0)["user_idx"].iloc[0])
seen_items = set(train_df.loc[train_df["user_idx"]==sample_user, "item_idx"].astype(int))
top_ids = topk_for_user(params, sample_user, seen_items, k=10)

print(f"User {sample_user} → Top-10")
for rnk, iid in enumerate(top_ids, 1):
    print(f"{rnk:2d}.", title_map.get(int(iid), f"item_{int(iid)}"))


User 649 → Top-10
 1. Schindler's List (1993)
 2. Wrong Trousers, The (1993)
 3. Close Shave, A (1995)
 4. Shawshank Redemption, The (1994)
 5. Wallace & Gromit: The Best of Aardman Animation (1996)
 6. Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
 7. Usual Suspects, The (1995)
 8. Raging Bull (1980)
 9. 12 Angry Men (1957)
10. Star Wars (1977)


In [57]:
# Persist the baseline model parameters as one compressed .npz archive;
# mu is wrapped in a 1-element array so it is stored alongside the others.
np.savez_compressed(
    RESULTS / "svd_params.npz",
    P=params["P"], Q=params["Q"], bu=params["bu"], bi=params["bi"], mu=np.array([params["mu"]])
)
# Store the baseline evaluation metrics next to the parameters as JSON.
with open(RESULTS / "svd_eval.json", "w", encoding="utf-8") as f:
    json.dump({"rmse_test": rmse_test, "precision@10": P10, "recall@10": R10}, f, indent=2)
print("Saved:", (RESULTS / "svd_params.npz").name, (RESULTS / "svd_eval.json").name)


Saved: svd_params.npz svd_eval.json


In [61]:
# Hyper-parameter candidates to try; each dict is forwarded to funk_svd as keyword arguments.
grid = [
    dict(n_factors=150, n_epochs=40, lr=0.004, reg=0.03),
]
# One (hyper-parameters, RMSE, precision@10, recall@10) tuple per candidate.
res = []
for hp in grid:
    p = funk_svd(u_tr, i_tr, r_tr, n_users, n_items, seed=7, **hp)
    rm = rmse(p, u_te, i_te, r_te)
    pr, rc = precision_recall_at_k(p, train_df, test_df, k=10, threshold=4.0)
    res.append((hp, rm, pr, rc))
    print(hp, "-> RMSE", f"{rm:.4f}", "| P@10", f"{pr:.4f}", "R@10", f"{rc:.4f}")

# Pick the preferred candidate by the metric of interest (here: lowest RMSE).
# NOTE(review): candidates are scored and selected on the test split, so the
# winning RMSE is an optimistic estimate; a separate validation split would
# avoid that.
best = min(res, key=lambda x: x[1])
best


{'n_factors': 150, 'n_epochs': 40, 'lr': 0.004, 'reg': 0.03} -> RMSE 0.9333 | P@10 0.0768 R@10 0.0567


({'n_factors': 150, 'n_epochs': 40, 'lr': 0.004, 'reg': 0.03},
 0.9332975149154663,
 0.0767762460233298,
 0.0566557415775462)