In [21]:
import numpy as np
import pandas as pd

# create Riiid-like dataset
np.random.seed(42)

N = 200_000

df = pd.DataFrame({
    "user_id": np.random.randint(0, 5000, N),
    "content_id": np.random.randint(0, 1000, N),
    "answered_correctly": np.random.randint(0, 2, N),
    "timestamp": np.random.randint(0, 100000, N),
    "prior_question_elapsed_time": np.random.randint(1000, 30000, N),
    "prior_question_had_explanation": np.random.choice([True, False], N)
})

df.sort_values(["user_id", "timestamp"], inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,user_id,content_id,answered_correctly,timestamp,prior_question_elapsed_time,prior_question_had_explanation
0,0,227,1,6077,28037,False
1,0,758,0,10006,25809,False
2,0,49,0,11584,23429,True
3,0,982,1,13915,8655,False
4,0,390,0,14810,20392,False


In [22]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import roc_auc_score

# ===============================
# Feature engineering
# ===============================
WIN = 20
RANDOM_STATE = 42

q_stats = df.groupby("content_id")["answered_correctly"].agg(["mean", "count"])
q_stats.columns = ["q_correct_rate", "q_count"]
df = df.join(q_stats, on="content_id")

g = df.groupby("user_id", sort=False)
past = g["answered_correctly"].shift(1)

df["u_prev_correct"] = past.fillna(0)

df["u_accuracy"] = (
    past.groupby(df["user_id"])
        .rolling(WIN, min_periods=5)
        .mean()
        .reset_index(level=0, drop=True)
)
df["u_accuracy"] = df["u_accuracy"].fillna(df["u_accuracy"].mean())

df["u_attempts"] = g.cumcount()

df["u_delta"] = g["timestamp"].diff().fillna(0)
df["u_log_delta"] = np.log1p(df["u_delta"].clip(lower=0))

df["log_elapsed"] = np.log1p(df["prior_question_elapsed_time"].clip(lower=0))
df["prior_expl"] = df["prior_question_had_explanation"].astype(int)

FEATURES = [
    "q_correct_rate", "q_count",
    "u_prev_correct", "u_accuracy", "u_attempts",
    "u_log_delta", "log_elapsed", "prior_expl"
]

X = df[FEATURES].values.astype("float32")
y = df["answered_correctly"].values.astype("float32")

# ===============================
# Train / validation split (no leakage)
# ===============================
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_STATE)
train_idx, val_idx = next(splitter.split(X, y, groups=df["user_id"].values))

X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]

train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

train_loader = DataLoader(train_ds, batch_size=2048, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=4096)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===============================
# PyTorch logistic regression model
# ===============================
class LogisticModel(nn.Module):
    def __init__(self, d):
        super().__init__()
        self.fc = nn.Linear(d, 1)

    def forward(self, x):
        return self.fc(x).squeeze(1)

model = LogisticModel(X.shape[1]).to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# ===============================
# Training loop
# ===============================
for epoch in range(5):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()

    model.eval()
    preds, ys = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            preds.append(torch.sigmoid(model(xb)).cpu().numpy())
            ys.append(yb.numpy())

    auc = roc_auc_score(np.concatenate(ys), np.concatenate(preds))
    print(f"Epoch {epoch+1} | Validation AUC: {auc:.4f}")

torch.save(model.state_dict(), "model.pt")

Epoch 1 | Validation AUC: 0.5018
Epoch 2 | Validation AUC: 0.4948
Epoch 3 | Validation AUC: 0.4950
Epoch 4 | Validation AUC: 0.4953
Epoch 5 | Validation AUC: 0.4953


In [24]:
def recommend_questions(model, user_state, candidate_ids, q_stats, target=0.7, top_k=10):
    cand = pd.DataFrame({"content_id": candidate_ids})
    cand = cand.join(q_stats, on="content_id")

    cand["q_correct_rate"] = cand["q_correct_rate"].fillna(q_stats["q_correct_rate"].mean())
    cand["q_count"] = cand["q_count"].fillna(1)

    feat = pd.DataFrame({
        "q_correct_rate": cand["q_correct_rate"],
        "q_count": cand["q_count"],
        "u_prev_correct": user_state["u_prev_correct"],
        "u_accuracy": user_state["u_accuracy"],
        "u_attempts": user_state["u_attempts"],
        "u_log_delta": user_state["u_log_delta"],
        "log_elapsed": user_state["log_elapsed"],
        "prior_expl": user_state["prior_expl"],
    })[FEATURES].values.astype("float32")

    model.eval()
    with torch.no_grad():
        xb = torch.from_numpy(feat).to(next(model.parameters()).device)
        probs = torch.sigmoid(model(xb)).cpu().numpy()

    cand["pred_p_correct"] = probs
    cand["gap_to_target"] = (cand["pred_p_correct"] - target).abs()

    return cand.sort_values("gap_to_target").head(top_k)[
        ["content_id", "pred_p_correct", "q_correct_rate", "q_count", "gap_to_target"]
    ]

In [25]:
# Create an example user snapshot from a real user in df
u = int(df["user_id"].iloc[-1])
u_hist = df[df["user_id"] == u].tail(50)

example_user = {
    "u_prev_correct": float(u_hist["answered_correctly"].iloc[-1]),
    "u_accuracy": float(u_hist["answered_correctly"].mean()),
    "u_attempts": float(len(u_hist)),  # FIX: do NOT use index.max()
    "u_log_delta": float(np.log1p(u_hist["timestamp"].diff().fillna(0).iloc[-1])),
    "log_elapsed": float(np.log1p(u_hist["prior_question_elapsed_time"].iloc[-1])),
    "prior_expl": float(u_hist["prior_question_had_explanation"].iloc[-1]),
}

# Candidate questions
candidate_ids = df["content_id"].sample(2000, random_state=RANDOM_STATE).values

# Run recommender (PyTorch version)
recs = recommend_questions(
    model,
    example_user,
    candidate_ids,
    q_stats,
    target=0.7,
    top_k=10
)

recs

Unnamed: 0,content_id,pred_p_correct,q_correct_rate,q_count,gap_to_target
1121,755,0.690746,0.428571,252,0.009254
699,174,0.661151,0.481633,245,0.038849
1183,174,0.661151,0.481633,245,0.038849
1342,174,0.661151,0.481633,245,0.038849
1436,40,0.647897,0.508264,242,0.052103
1826,40,0.647897,0.508264,242,0.052103
660,40,0.647897,0.508264,242,0.052103
481,40,0.647897,0.508264,242,0.052103
550,739,0.6447,0.485477,241,0.0553
972,739,0.6447,0.485477,241,0.0553


In [26]:
import os

# create folder
os.makedirs("outputs", exist_ok=True)

# save dataframe
recs.to_csv("outputs/recommendations.csv", index=False)

print("✅ Saved to outputs/recommendations.csv")

✅ Saved to outputs/recommendations.csv


In [27]:
from google.colab import files
files.download("outputs/recommendations.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>