# In [1]:  (Jupyter cell prompt left over from the notebook export)
import pandas as pd

# 1. 加载数据
df = pd.read_csv("riiid-test-answer-prediction/train.csv")
df = df[df['content_type_id'] == 0]  # 只保留题目部分

# 2. 计算题目难度：P = R / N
difficulty = df.groupby("content_id")["answered_correctly"].agg(["mean", "count"])
difficulty.columns = ["difficulty", "total_responses"]

# 3. 计算用户的总体准确率（代表能力）
user_score = df.groupby("user_id")["answered_correctly"].mean().reset_index()
user_score.columns = ["user_id", "user_accuracy"]
df = df.merge(user_score, on="user_id", how="left")

# 4. 极端组法：按 user_accuracy 排序分组（前27%高分组，后27%低分组）
high_threshold = user_score["user_accuracy"].quantile(0.73)
low_threshold = user_score["user_accuracy"].quantile(0.27)

df["group"] = df["user_accuracy"].apply(lambda x: "high" if x >= high_threshold else ("low" if x <= low_threshold else "mid"))
high_group = df[df["group"] == "high"]
low_group = df[df["group"] == "low"]

# 5. 计算区分度 D = P_high - P_low
high_acc = high_group.groupby("content_id")["answered_correctly"].mean()
low_acc = low_group.groupby("content_id")["answered_correctly"].mean()
discrimination = (high_acc - low_acc).reset_index()
discrimination.columns = ["content_id", "discrimination"]

# 6. 合并难度与区分度结果
question_analysis = difficulty.reset_index().merge(discrimination, on="content_id", how="left")

# 7. 标注题目分类（可选）
question_analysis["quality_flag"] = question_analysis.apply(
    lambda row: "推荐" if 0.3 <= row["difficulty"] <= 0.7 and row["discrimination"] >= 0.3 else "需审查", axis=1
)

# 8. 导出结果
question_analysis.to_csv("question_difficulty_discrimination.csv", index=False)
