In [None]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp312-cp312-linux_x86_64.whl size=2555924 sha256=686d22a538f3fe71ce63999aacb444f08bc6040d5d5c57f486ad7a7a6adab50e
  Stored in directory: /root/.cache/pip/wheels/75/fa/bc/739bc2cb1

In [5]:
# ============================================================
# RECOMMENDER v4 — Hybrid (Embeddings + Rules) — FIXED & FAST
# Single-cell, NumPy 2.x safe
# ============================================================

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# ---------------- CONFIG ----------------
# Input and output files.
INTERACTIONS_F = "model_training_data.csv"
QUESTION_FEAT_F = "question_features_synthetic.csv"
OUT_RECS_F = "recommendations_v4.csv"

# Sentence-embedding model and the batch-size setting used when encoding.
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
EMB_BATCH = 256

# Number of questions returned per student.
TOP_K = 5

# Blend weights of the hybrid score computed in recommend():
#   W_SIM  -> cosine similarity between student profile and question embedding
#   W_MAST -> 1 - |scaled difficulty - student mastery|
#   W_POP  -> 1 - scaled attempt count
#   W_REC  -> scaled recency_days
W_SIM = 0.45
W_MAST = 0.30
W_POP = 0.15
W_REC = 0.10

# ---------------- LOAD DATA ----------------
print("Loading data...")
df = pd.read_csv(INTERACTIONS_F)
qf = pd.read_csv(QUESTION_FEAT_F)

# Interactions: unparseable timestamps become NaT and unparseable question ids
# become <NA>; rows lacking a student or question id are then discarded.
df = df.assign(
    timestamp=pd.to_datetime(df["timestamp"], errors="coerce"),
    question_id=pd.to_numeric(df["question_id"], errors="coerce").astype("Int64"),
).dropna(subset=["student_id", "question_id"])

# Question features: same id coercion, rows without a usable id are discarded.
qf = qf.assign(
    question_id=pd.to_numeric(qf["question_id"], errors="coerce").astype("Int64"),
).dropna(subset=["question_id"])

# When a synthetic text column is present it takes over as the question text.
if "synthetic_question_text" in qf.columns:
    qf = qf.assign(question_text=qf["synthetic_question_text"])

# Keep the first row per question and index the feature table by question id.
qf = qf.drop_duplicates("question_id").set_index("question_id")

# ---------------- FEATURE NORMALIZATION ----------------
# Impute missing per-question features before scaling.
qf["difficulty_score"] = qf["difficulty_score"].fillna(qf["difficulty_score"].median())
qf["attempt_count"] = qf["attempt_count"].fillna(0)

# Days since each question was last attempted.
# `last_seen` only has entries for questions that occur in `df`. Assigning a
# Series to a DataFrame column aligns on the index, so any question in `qf`
# without interactions would become NaN *after* a fillna applied to the Series
# itself. Reindex onto `qf.index` first, then fill, so every question gets a
# finite value (9999 = "never seen / no usable timestamp").
# MinMaxScaler keeps NaN in its output, and NaN scores sort last in np.argsort,
# i.e. they would be picked first by the top-K selection in recommend().
last_seen = df.groupby("question_id")["timestamp"].max()
recency_days = (pd.Timestamp.now() - last_seen).dt.days
qf["recency_days"] = recency_days.reindex(qf.index).fillna(9999).astype(float)

# Scale the three features to [0, 1] column-wise.
scaler = MinMaxScaler()
qf[["difficulty_s","attempt_s","recency_s"]] = scaler.fit_transform(
    qf[["difficulty_score","attempt_count","recency_days"]]
)

# ---------------- EMBEDDINGS ----------------
print("Encoding question embeddings...")
model = SentenceTransformer(EMB_MODEL)

# Row i of q_emb corresponds to q_ids[i] (same order as qf's index).
q_ids = qf.index.to_numpy()
texts = qf["question_text"].fillna("").astype(str).tolist()

# Let encode() do the batching: passing batch_size makes EMB_BATCH the real
# model batch size (a manual outer loop only chunks the input while encode()
# keeps using its own default batch size), and the built-in progress bar
# replaces the hand-rolled tqdm loop.
q_emb = model.encode(
    texts,
    batch_size=EMB_BATCH,
    show_progress_bar=True,
    convert_to_numpy=True,
)
assert q_emb.shape[0] == len(q_ids), "one embedding row expected per question"

# question_id -> row position in q_emb / qf.
qid_to_idx = {int(q): i for i, q in enumerate(q_ids)}

# ---------------- USER MASTERY TABLE ----------------
# Mean mastery_score keyed by (student_id, skill); used as the per-skill
# mastery lookup in recommend().
user_skill_mastery = (
    df["mastery_score"]
    .groupby([df["student_id"], df["skill"]])
    .mean()
    .to_dict()
)

# Fallbacks for skills a student has no record of: the student's own mean
# mastery, then the mean over all interactions.
global_mean = df["mastery_score"].mean()
user_mean = df["mastery_score"].groupby(df["student_id"]).mean()

# Per-question feature columns as plain arrays, in qf row order.
difficulty_arr, attempt_arr, recency_arr = (
    qf[col].values for col in ("difficulty_s", "attempt_s", "recency_s")
)
skills_arr = qf["skill"].values

# ---------------- STUDENT PROFILE ----------------
def student_profile(uid, n=20):
    """Build an embedding-space profile vector for one student.

    The profile is the weighted mean of the embeddings of the student's ``n``
    most recent interactions (by ``timestamp``, newest first). Each interaction
    is weighted ``0.7 * correctness + 0.3``, so every attempted question
    contributes and correctly answered ones contribute more.

    Parameters
    ----------
    uid : student id, compared against ``df.student_id``.
    n : int, maximum number of most recent interactions to use.

    Returns
    -------
    numpy.ndarray
        1-D vector with the same width as ``q_emb``. Falls back to the mean of
        all question embeddings when the student has no history, or when none
        of the questions in that history has an embedding.
    """
    hist = df[df.student_id == uid].sort_values("timestamp", ascending=False).head(n)

    idxs, weights = [], []
    for qid, correct in zip(hist["question_id"], hist["correctness"]):
        if pd.isna(qid):
            continue
        idx = qid_to_idx.get(int(qid))
        if idx is None:
            # Question has no row in the feature/embedding table.
            continue
        idxs.append(idx)
        # A missing correctness value contributes only the base weight instead
        # of turning the whole profile into NaN.
        weights.append(0.3 if pd.isna(correct) else 0.7 * float(correct) + 0.3)

    w = np.asarray(weights, dtype=float)
    # Without this guard an empty `idxs` yields 0 / 0 = NaN, and the NaN
    # profile then propagates into the similarity computation.
    if not idxs or w.sum() <= 0:
        return q_emb.mean(axis=0)

    return (q_emb[idxs] * w[:, None]).sum(axis=0) / w.sum()

# ---------------- FAST VECTOR RECOMMENDER ----------------
def recommend(uid, allow_seen=False):
    """Return up to ``TOP_K`` recommended questions for one student.

    Every question is scored with a weighted blend of four components:

    * ``W_SIM``  * cosine similarity between the student profile and the
      question embedding,
    * ``W_MAST`` * ``1 - |scaled difficulty - student mastery for the skill|``,
    * ``W_POP``  * ``1 - scaled attempt count``,
    * ``W_REC``  * scaled ``recency_days``.

    Parameters
    ----------
    uid : student id.
    allow_seen : bool, default False
        When False, questions the student already has an interaction with are
        excluded from the result.

    Returns
    -------
    list[dict]
        Best first, each with ``question_id`` (int), ``score`` (float) and
        ``question_text``. Contains fewer than ``TOP_K`` entries when fewer
        eligible questions exist.
    """
    def _nan_to_zero(values):
        # np.argsort places NaN last, so a single NaN component would push the
        # affected questions to the *top* of the ranking. Treat a missing
        # component as contributing nothing instead.
        arr = np.asarray(values, dtype=float)
        return np.where(np.isnan(arr), 0.0, arr)

    prof = student_profile(uid)
    sim = cosine_similarity(prof.reshape(1, -1), q_emb).flatten()

    # Fallback mastery for skills the student has no record of. Looked up once
    # here instead of once per question inside the comprehension below.
    default_mast = user_mean.get(uid, global_mean)
    if np.isnan(default_mast):
        default_mast = global_mean

    user_mast = np.array(
        [user_skill_mastery.get((uid, s), default_mast) for s in skills_arr],
        dtype=float,
    )
    user_mast = np.where(np.isnan(user_mast), default_mast, user_mast)
    mast_score = 1 - np.abs(difficulty_arr - user_mast)

    score = (
        W_SIM * _nan_to_zero(sim) +
        W_MAST * _nan_to_zero(mast_score) +
        W_POP * _nan_to_zero(1 - attempt_arr) +
        W_REC * _nan_to_zero(recency_arr)
    )

    if not allow_seen:
        # Mask by position through qid_to_idx: cost scales with the student's
        # history rather than with the number of questions.
        seen_qids = df.loc[df.student_id == uid, "question_id"].dropna().unique()
        seen_idx = [qid_to_idx[int(q)] for q in seen_qids if int(q) in qid_to_idx]
        if seen_idx:
            score[seen_idx] = -np.inf

    # Highest score first; masked (-inf) entries are never returned, even when
    # fewer than TOP_K eligible questions remain.
    top_idx = [i for i in np.argsort(score)[-TOP_K:][::-1] if np.isfinite(score[i])]

    question_texts = qf["question_text"]
    return [{
        "question_id": int(q_ids[i]),
        "score": float(score[i]),
        "question_text": question_texts.iloc[i]
    } for i in top_idx]

# ---------------- EVALUATION (FIXED) ----------------
print("Evaluating...")
# Hold out each student's most recent interaction as the single relevant item.
# Timestamps were parsed with errors="coerce", so a student may have only NaT
# values; idxmax has no valid label for such a group, so rows without a
# timestamp are dropped before picking the latest one.
dated = df.dropna(subset=["timestamp"])
test_idx = dated.groupby("student_id")["timestamp"].idxmax()
test = dated.loc[test_idx]

# NOTE(review): the held-out row is still part of `df`, which student_profile()
# and the mastery table are computed from, so the target question contributes
# to the profile it is evaluated against. Treat these numbers as optimistic;
# a clean split needs the test rows removed before those are built.

hits = prec = rec = ndcg = ap = 0
n = len(test)

for _, row in tqdm(test.iterrows(), total=n):
    # allow_seen=True: the held-out question is in the student's history and
    # would otherwise be masked out of the candidates.
    recs = recommend(row.student_id, allow_seen=True)
    ids = [r["question_id"] for r in recs]
    target = int(row.question_id)

    hit = int(target in ids)
    hits += hit
    prec += hit / TOP_K
    # One relevant item per student, so recall@K equals the hit indicator.
    rec += hit

    if hit:
        rank = ids.index(target) + 1
        ndcg += float(1 / np.log2(rank + 1))
        ap += 1 / rank

# Avoid ZeroDivisionError when there is nothing to evaluate.
denom = max(n, 1)
print({
    "Hit@5": float(hits / denom),
    "Precision@5": float(prec / denom),
    "Recall@5": float(rec / denom),
    "NDCG@5": float(ndcg / denom),
    "MAP@5": float(ap / denom)
})

# ---------------- GENERATE FINAL RECS ----------------
print("Generating recommendations...")
# One record per student; recommend() is called with its default
# allow_seen=False, so already-attempted questions are excluded.
out = [
    {"student_id": student, "recommendations": recommend(student)}
    for student in tqdm(df.student_id.unique())
]

# The "recommendations" cell holds the list of dicts returned by recommend().
recs_frame = pd.DataFrame(out)
recs_frame.to_csv(OUT_RECS_F, index=False)
print("Saved:", OUT_RECS_F)


Loading data...
Encoding question embeddings...


100%|██████████| 39/39 [00:39<00:00,  1.00s/it]


Evaluating...


100%|██████████| 2143/2143 [03:49<00:00,  9.34it/s]


{'Hit@5': 0.010732617825478302, 'Precision@5': 0.002146523565095661, 'Recall@5': 0.010732617825478302, 'NDCG@5': np.float64(0.005697768840887685), 'MAP@5': 0.00402084305490745}
Generating recommendations...


100%|██████████| 2143/2143 [04:05<00:00,  8.72it/s]


Saved: recommendations_v4.csv
