In [None]:
import json
import time
import os
from openai import OpenAI

# === API SETUP ===
# The key is read from the OPENAI_API_KEY environment variable. When that
# variable is unset, the literal placeholder string below is passed to the
# client instead.
# NOTE(review): the placeholder is presumably not a valid key, so a missing
# environment variable would only surface as failures at request time —
# confirm whether failing fast here would be preferable.
api_key = os.getenv("OPENAI_API_KEY", "your-api-key-here")
client = OpenAI(api_key=api_key)

# === UPDATED FILES ===
# JSONL paths used by the processing loop in this file:
#   INPUT_FILE  - records to score (read line by line)
#   OUTPUT_FILE - the same records with a "score" field added
#   ERROR_FILE  - one JSON line per record that could not be scored
INPUT_FILE = "responses_seniors_llama2_vs_mistral.jsonl"
OUTPUT_FILE = "scored_seniors_llama2_vs_mistral.jsonl"
ERROR_FILE = "scoring_errors_seniors_llama2_vs_mistral.jsonl"

# === SCORING RUBRIC ===
# System prompt sent to the grading model by score_response(). It defines five
# dimensions scored 0-5, the exact JSON keys the model must return, and the
# weights the model is told to use for "weighted_score".
# NOTE: this text is sent verbatim to the model; editing it changes the
# grading behaviour.
rubric = """
You are an impartial evaluator grading chatbot responses for how well they support senior (65+) users 
in understanding digital consent. You must score each response along EXACTLY these five dimensions:

1. Plain-Language Clarity (0–5)
   - Are words simple?
   - Are sentences short?
   - Is technical jargon avoided?
   - Reading ease ~6th–7th grade level.

2. Visual Accessibility (0–5)
   - Does the response acknowledge or encourage large buttons, high contrast, spacing, or easy-to-see structure?

3. Interaction Simplicity (0–5)
   - Does the response simplify steps?
   - Does it reduce cognitive load?
   - Are instructions linear and easy to follow?

4. Confirm-Step Safety (0–5)
   - Does the response mention multi-step confirmation?
   - Does it prevent accidental consent?
   - Does it offer “go back / change your choice” assurances?

5. Empathy & Senior-Friendly Tone (0–5)
   - Warm, patient, reassuring tone.
   - Avoids anxiety, pressure, or rushed wording.
   - Addresses potential memory or comprehension challenges.

Your output MUST be valid JSON only. No commentary.

You MUST output:
{
  "plain_language": X,
  "visual_accessibility": X,
  "interaction_simplicity": X,
  "confirm_steps": X,
  "empathy_tone": X,
  "weighted_score": X
}

Use these weights to compute weighted_score:
- Plain language: 0.30
- Visual accessibility: 0.20
- Interaction simplicity: 0.20
- Confirm steps: 0.20
- Empathy tone: 0.10
"""

# === FIXED & ROBUST score_response() ===
def score_response(prompt, response):
    """Grade one chatbot response with GPT-4o and return the parsed scores.

    The module-level ``rubric`` is sent as the system message and the
    prompt/response pair as the user message. The model's reply is parsed as
    JSON; if the reply as a whole is not valid JSON (for example because it is
    wrapped in a Markdown code fence), the outermost ``{...}`` span is
    extracted and parsed instead.

    Args:
        prompt: The original prompt text that produced ``response``.
        response: The model response to be graded.

    Returns:
        dict: The JSON object returned by the grader. The rubric asks for the
        keys ``plain_language``, ``visual_accessibility``,
        ``interaction_simplicity``, ``confirm_steps``, ``empathy_tone`` and
        ``weighted_score``, but the keys and values are not validated here.

    Raises:
        RuntimeError: If the API call fails, the reply has no content, or no
            JSON object can be parsed from the reply. The message always
            starts with a single ``"GPT-4o scoring failed: "`` prefix.
    """
    import re

    user_prompt = f"""
Evaluate the following response for senior-friendly consent UX.

Prompt:
{prompt}

Response:
{response}

Return ONLY a JSON object exactly in this structure:
{{
  "plain_language": 0-5,
  "visual_accessibility": 0-5,
  "interaction_simplicity": 0-5,
  "confirm_steps": 0-5,
  "empathy_tone": 0-5,
  "weighted_score": final_score_between_0_and_5
}}
"""

    # Only the API interaction is wrapped here, so that parsing errors raised
    # further down are not caught and prefixed a second time.
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            temperature=0.0,
            messages=[
                {"role": "system", "content": rubric},
                {"role": "user", "content": user_prompt}
            ]
        )
        content = completion.choices[0].message.content
    except Exception as e:
        raise RuntimeError(f"GPT-4o scoring failed: {e}") from e

    # A missing message body would otherwise surface as an opaque
    # AttributeError on .strip().
    if content is None:
        raise RuntimeError("GPT-4o scoring failed: reply contained no message content.")

    reply = content.strip()

    # === ROBUST JSON PARSING START ===
    try:
        parsed = json.loads(reply)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span (greedy on purpose so nested
        # objects stay intact).
        match = re.search(r"\{[\s\S]*\}", reply)
        if match is None:
            raise RuntimeError(
                f"GPT-4o scoring failed: no JSON object detected in reply. Raw reply: {reply}"
            ) from None
        try:
            parsed = json.loads(match.group(0))
        except json.JSONDecodeError as e2:
            raise RuntimeError(
                f"GPT-4o scoring failed: JSON extraction unsuccessful. Raw reply: {reply}"
            ) from e2

    # Valid JSON that is not an object (a bare number, list, string, ...) is
    # not a usable score record.
    if not isinstance(parsed, dict):
        raise RuntimeError(
            f"GPT-4o scoring failed: reply is not a JSON object. Raw reply: {reply}"
        )
    return parsed
    # === ROBUST JSON PARSING END ===


# === PROCESS THE FILE ===
# Reads INPUT_FILE one JSON line at a time, scores each record with
# score_response(), and writes the record (plus a "score" field) to
# OUTPUT_FILE. Any record that fails is logged to ERROR_FILE and the loop
# continues with the next line.
#
# All three files are opened as UTF-8 explicitly: the scored output is written
# with ensure_ascii=False, which can raise UnicodeEncodeError when the
# platform's default encoding is not UTF-8.
with open(INPUT_FILE, "r", encoding="utf-8") as fin, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as fout, \
     open(ERROR_FILE, "w", encoding="utf-8") as ferr:

    for line in fin:
        if not line.strip():
            continue

        # Reset per line so the error handler never reports the IDs of a
        # previous record (or hits an unbound name on the first line).
        item = None

        try:
            item = json.loads(line)
            if not isinstance(item, dict):
                raise ValueError("Input line is not a JSON object.")

            gen_id = item.get("gen_id")
            prompt_text = item.get("prompt_text", "")
            response_text = item.get("response", "")

            print(f"Scoring {gen_id} ...", end=" ")

            if not response_text:
                raise ValueError("Empty model response.")

            score_data = score_response(prompt_text, response_text)
            item["score"] = score_data

            fout.write(json.dumps(item, ensure_ascii=False) + "\n")
            print("✓ OK")

        except Exception as e:
            # The IDs are only available when the line parsed to an object;
            # otherwise they are logged as null.
            ids = item if isinstance(item, dict) else {}
            ferr.write(json.dumps({
                "prompt_id": ids.get("prompt_id"),
                "gen_id": ids.get("gen_id"),
                "error": str(e)
            }) + "\n")
            print(f"⚠ Error: {e}")

        # Pause between records (runs after every non-blank line, including
        # failed ones).
        time.sleep(1)

print(f"\n Done. Saved scored results to: {OUTPUT_FILE}")
print(f" Errors (if any) saved to: {ERROR_FILE}")



Scoring SENIOR_P1_llama2_gen1 ... ✓ OK
Scoring SENIOR_P1_mistral_gen1 ... ✓ OK
Scoring SENIOR_P1_llama2_gen2 ... ✓ OK
Scoring SENIOR_P1_mistral_gen2 ... ✓ OK
Scoring SENIOR_P1_llama2_gen3 ... ✓ OK
Scoring SENIOR_P1_mistral_gen3 ... ✓ OK
Scoring SENIOR_P2_llama2_gen1 ... ✓ OK
Scoring SENIOR_P2_mistral_gen1 ... ✓ OK
Scoring SENIOR_P2_llama2_gen2 ... ✓ OK
Scoring SENIOR_P2_mistral_gen2 ... ✓ OK
Scoring SENIOR_P2_llama2_gen3 ... ✓ OK
Scoring SENIOR_P2_mistral_gen3 ... ✓ OK
Scoring SENIOR_P3_llama2_gen1 ... ✓ OK
Scoring SENIOR_P3_mistral_gen1 ... ✓ OK
Scoring SENIOR_P3_llama2_gen2 ... ✓ OK
Scoring SENIOR_P3_mistral_gen2 ... ✓ OK
Scoring SENIOR_P3_llama2_gen3 ... ✓ OK
Scoring SENIOR_P3_mistral_gen3 ... ✓ OK
Scoring SENIOR_P4_llama2_gen1 ... ✓ OK
Scoring SENIOR_P4_mistral_gen1 ... ✓ OK
Scoring SENIOR_P4_llama2_gen2 ... ✓ OK
Scoring SENIOR_P4_mistral_gen2 ... ✓ OK
Scoring SENIOR_P4_llama2_gen3 ... ✓ OK
Scoring SENIOR_P4_mistral_gen3 ... ✓ OK
Scoring SENIOR_P5_llama2_gen1 ... ✓ OK
Scoring SENIO