In [7]:
import pandas as pd
import json
import time
import openai
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment
# before the API key is looked up below.
load_dotenv()

# Initialise the OpenAI client. Supplying the key through the OPENAI_API_KEY
# environment variable is the recommended approach.
API_KEY = os.getenv("OPENAI_API_KEY")
client = openai.Client(api_key=API_KEY)

# Read the CSV holding each sub-question together with its background text.
# The solving loop reads the columns qid, subid, subquestion and full_question.
df = pd.read_csv("subquestions_with_background.csv")

# Prompt template that embeds the problem background.
def build_prompt(full_question, subquestion):
    """Build the solver prompt for one sub-question.

    Args:
        full_question: Background text of the whole problem; interpolated
            with its default string formatting.
        subquestion: The sub-question to solve; interpolated the same way.

    Returns:
        The prompt text, ending with a newline, that asks the model for a
        JSON array of step objects (step / desc / expr / value).
    """
    prompt_lines = [
        "You are a statistics master student.",
        "",
        "Here is the full background of the problem:",
        f"{full_question}",
        "",
        "Now, please solve the following sub-question step-by-step using structured reasoning:",
        "",
        f"{subquestion}",
        "",
        "For each step, return a JSON object with:",
        '- step: the step number (or "final" if it\'s the final answer),',
        "- desc: a short description of what you are doing,",
        "- expr: a math expression if applicable,",
        "- value: the computed result if any",
        "",
        "Format the full output as a JSON array of steps.",
        "Return ONLY a valid JSON array. No explanations. No markdown.",
        "",  # keeps the trailing newline of the prompt
    ]
    return "\n".join(prompt_lines)

# GPT call helper: strips markdown fences and degrades to an error record on failure.
def call_gpt4o(prompt, qid, subid, model="gpt-4o"):
    """Send ``prompt`` as a single user message and parse the reply as JSON.

    Args:
        prompt: Full text of the user message.
        qid: Question id; used only in the printed error messages.
        subid: Sub-question id; used only in the printed error messages.
        model: Chat model name passed to the API.

    Returns:
        The decoded JSON reply on success. On failure, a one-element list
        holding an error record: ``{"step": "error", "desc": ...}``, plus a
        ``"raw"`` key with the first 300 characters of the reply when the
        reply could not be decoded as JSON.
    """
    # Bound before the try block so the JSONDecodeError handler can always
    # read it, even if that exception is raised before a reply is extracted.
    reply = ""
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        # The API types message content as optional; treat a missing body as
        # an empty reply so it is reported as invalid JSON rather than as an
        # opaque AttributeError.
        reply = (response.choices[0].message.content or "").strip()

        # Strip a surrounding markdown code fence (``` or ```json).
        if reply.startswith("```"):
            reply = (
                reply.removeprefix("```json")
                .removeprefix("```")
                .removesuffix("```")
                .strip()
            )

        return json.loads(reply)

    except json.JSONDecodeError as jde:
        print(f"❌ JSON 解析失败（{qid}-{subid}）：", jde)
        return [{"step": "error", "desc": "Invalid JSON output", "raw": reply[:300]}]
    except Exception as e:
        # Best-effort batch run: report the failure and keep going.
        print(f"❌ API 调用失败（{qid}-{subid}）：", e)
        return [{"step": "error", "desc": str(e)}]

# Main flow: solve the sub-questions one row at a time.
solutions = []

for _, row in df.iterrows():
    # pandas represents an empty CSV cell as float NaN (e.g. a problem that has
    # no sub-question id). NaN is not valid JSON, so map missing cells to None
    # here; the saved file then contains `null` instead of a bare `NaN` token.
    # Values inside `steps` are stored exactly as returned by call_gpt4o.
    qid, subid, subq, fullq = (
        None if pd.isna(row[col]) else row[col]
        for col in ("qid", "subid", "subquestion", "full_question")
    )

    print(f"\n🧠 解题中：{qid}-{subid}")

    prompt = build_prompt(fullq, subq)
    steps = call_gpt4o(prompt, qid, subid)

    solutions.append({
        "qid": qid,
        "subid": subid,
        "subquestion": subq,
        "full_question": fullq,
        "steps": steps
    })

    # Fixed pause between consecutive API calls.
    time.sleep(2)

# Save all results.
with open("gpt4o_subquestion_solutions.json", "w") as f:
    json.dump(solutions, f, indent=2, ensure_ascii=False)

print("\n✅ 所有子问解题完毕！结果已保存为 gpt4o_subquestion_solutions.json")

import math

# Recursively replace NaN with None (serialised as JSON null).
def clean_nan(obj):
    """Replace every float NaN inside ``obj`` with ``None``.

    Lists and dicts are rebuilt recursively (dict keys are left untouched);
    any other value, including tuples, is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: clean_nan(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [clean_nan(elem) for elem in obj]

    is_nan_float = isinstance(obj, float) and math.isnan(obj)
    return None if is_nan_float else obj

# Apply the NaN cleaner to the whole solutions list.
cleaned_solutions = clean_nan(solutions)

# Overwrite the results file with the cleaned data so that it is valid JSON
# (NaN values are written as null).
with open("gpt4o_subquestion_solutions.json", "w") as f:
    json.dump(cleaned_solutions, f, indent=2, ensure_ascii=False)

print("\n✅ 清洗后保存完毕（NaN → null），文件已生成：gpt4o_subquestion_solutions.json")


🧠 解题中：Problemone-nan

🧠 解题中：Problemtwo-1

🧠 解题中：Problemtwo-2

🧠 解题中：Problemtwo-3

🧠 解题中：Problemthree-a

🧠 解题中：Problemthree-b

🧠 解题中：Problemthree-c

🧠 解题中：Problemfour-a

🧠 解题中：Problemfour-b

🧠 解题中：Problemfour-c

🧠 解题中：Problemfour-d

🧠 解题中：Problemfour-e

🧠 解题中：Problemfour-f

🧠 解题中：Problemfour-g

🧠 解题中：Problemfour-h

🧠 解题中：Problemfour-i

🧠 解题中：Problemfour-j

🧠 解题中：Problemfour-k

✅ 所有子问解题完毕！结果已保存为 gpt4o_subquestion_solutions.json

✅ 清洗后保存完毕（NaN → null），文件已生成：gpt4o_subquestion_solutions.json


In [None]:
# Load the GPT solver output; each entry is read below for its qid, subid,
# subquestion and steps fields.
with open("gpt4o_subquestion_solutions.json") as f:
    solutions = json.load(f)

# Grading prompt.
def build_grading_prompt(subquestion, steps_json):
    """Build the prompt that asks the model to grade a step-by-step solution.

    Args:
        subquestion: Text of the sub-question being graded; interpolated with
            its default string formatting.
        steps_json: JSON-serialisable solution steps; rendered with a
            two-space indent and without ASCII escaping.

    Returns:
        The prompt text, ending with a newline, containing the sub-question,
        the rendered steps, the 1-5 scoring rubric and the expected JSON
        output shape.
    """
    rendered_steps = json.dumps(steps_json, ensure_ascii=False, indent=2)
    prompt_lines = [
        "You are a statistics tutor. Please grade a student's step-by-step solution to a sub-question.",
        "",
        "Sub-question:",
        f"{subquestion}",
        "",
        "Student's steps:",
        rendered_steps,
        "",
        "Now, do the following:",
        "1. Evaluate if each step is correct or flawed. If flawed, explain why.",
        "2. Give a short comment for each step.",
        "3. Give an overall score out of 5 and a short feedback.",
        "4. Use the following scoring rubric:",
        "   1 - Completely incorrect: Major logical flaws, fundamental misunderstandings, or missing core steps. Lacks basic understanding.",
        "   2 - Weak: Some grasp of the method, but contains multiple errors, flawed reasoning, or incoherent structure.",
        "   3 - Satisfactory: Main method is correct, includes key steps, but has some calculation or explanation issues.",
        "   4 - Good: Mostly correct, logically structured, only minor issues such as small errors or slightly informal reasoning.",
        "   5 - Excellent: Fully correct, well-organized, rigorous and clear reasoning. A model solution.",
        "5. Format your output as the following JSON:",
        "",
        "{",
        '  "score": X,',
        '  "total": 5,',
        '  "feedback": "...",',
        '  "step_feedback": [{"step": ..., "comment": "..."}, ...]',
        "}",
        "Return ONLY a valid JSON object. No markdown, no extra explanation.",
        "",  # keeps the trailing newline of the prompt
    ]
    return "\n".join(prompt_lines)

# GPT grading helper.
def grade_with_gpt(question, steps, qid, subid, model="gpt-4o"):
    """Ask the model to grade ``steps`` as a solution to ``question``.

    Args:
        question: Text of the sub-question.
        steps: JSON-serialisable solution steps to be graded.
        qid: Question id; used only in the printed error message.
        subid: Sub-question id; used only in the printed error message.
        model: Chat model name passed to the API.

    Returns:
        The decoded grading object (a dict) on success. On any failure,
        including a reply that is not a JSON object, the fallback
        ``{"score": 0, "total": 5, "feedback": "Error: ...", "step_feedback": []}``.
    """
    prompt = build_grading_prompt(question, steps)
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        # The API types message content as optional; treat a missing body as
        # an empty reply so the failure is reported as a JSON decoding error.
        reply = (response.choices[0].message.content or "").strip()

        # Strip a surrounding markdown code fence (``` or ```json).
        if reply.startswith("```"):
            reply = (
                reply.removeprefix("```json")
                .removeprefix("```")
                .removesuffix("```")
                .strip()
            )

        grading = json.loads(reply)
        # Callers read the result with .get(); a JSON array or scalar would
        # crash them, so route it through the common fallback below instead.
        if not isinstance(grading, dict):
            raise ValueError(
                f"expected a JSON object, got {type(grading).__name__}"
            )
        return grading

    except Exception as e:
        # Best-effort batch run: report the failure and return a neutral record.
        print(f"❌ GPT 评分失败（{qid}-{subid}）：", e)
        return {"score": 0, "total": 5, "feedback": f"Error: {str(e)}", "step_feedback": []}

# Grade every solved sub-question in turn.
graded_results = []
for item in solutions:
    qid, subid = item["qid"], item["subid"]
    question, steps = item["subquestion"], item["steps"]

    print(f"📝 正在评分：{qid}-{subid}")
    grading = grade_with_gpt(question, steps, qid, subid)

    # Start from the identifying fields, then copy the grading fields,
    # falling back to neutral defaults for any key the grader omitted.
    record = {"qid": qid, "subid": subid, "subquestion": question}
    record["score"] = grading.get("score", 0)
    record["total"] = grading.get("total", 5)
    record["feedback"] = grading.get("feedback", "")
    record["step_feedback"] = grading.get("step_feedback", [])
    graded_results.append(record)

    # Fixed pause between consecutive API calls.
    time.sleep(2)

# Persist all grading records.
with open("gpt4o_grading_results.json", "w") as f:
    json.dump(graded_results, f, indent=2, ensure_ascii=False)

print("✅ 所有子问评分完成，保存为 gpt4o_grading_results.json")


📝 正在评分：Problemone-None
📝 正在评分：Problemtwo-1
📝 正在评分：Problemtwo-2
📝 正在评分：Problemtwo-3
📝 正在评分：Problemthree-a
📝 正在评分：Problemthree-b
📝 正在评分：Problemthree-c
📝 正在评分：Problemfour-a
📝 正在评分：Problemfour-b
📝 正在评分：Problemfour-c
📝 正在评分：Problemfour-d
📝 正在评分：Problemfour-e
📝 正在评分：Problemfour-f
📝 正在评分：Problemfour-g
📝 正在评分：Problemfour-h


In [None]:
# Load the grading results.
with open("gpt4o_grading_results.json") as f:
    graded = json.load(f)

# Keep every sub-question that did not reach full marks; a missing score
# counts as 0 and a missing total as 5.
wrong = [
    entry
    for entry in graded
    if entry.get("total", 5) > entry.get("score", 0)
]

# Save as JSON (for later re-grading or display).
with open("wrong_subquestions.json", "w") as f:
    json.dump(wrong, f, indent=2, ensure_ascii=False)

print(f"✅ 筛选出 {len(wrong)} 道错题，保存为 wrong_subquestions.json")