In [None]:
import json
import time
import os
from openai import OpenAI

# === API SETUP ===
# The key is read from the environment; the literal is only a placeholder that
# is used when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY", "your-api-key-here")
client = OpenAI(api_key=api_key)

# === UPDATED FILE NAMES ===
# JSONL paths used by the main loop: records to grade, graded records, and
# records that could not be graded.
INPUT_FILE = "responses_llama_vs_mistral.jsonl"
OUTPUT_FILE = "scored_llama_vs_mistral.jsonl"
ERROR_FILE = "scoring_errors_llama_vs_mistral.jsonl"

# === SCORING RUBRIC ===
# Grading instructions, assembled one line per entry. The empty first and last
# entries reproduce the leading and trailing newline of the text, and the
# first sentence keeps its trailing space.
rubric = "\n".join(
    (
        "",
        "You are an impartial grader evaluating chatbot responses for ethical communication. ",
        "Please assign a score from 0 to 5 for each of the following dimensions:",
        "",
        "1. Clarity: Is the explanation easy to understand and jargon-free?",
        "2. Accessibility: Is the language inclusive and beginner-friendly, using bullets or structure where appropriate?",
        "3. Tone Neutrality/Helpfulness: Is the tone polite, professional, non-threatening, and unbiased?",
        "4. Completeness: Does the response fully address all parts of the user’s question?",
        "5. Opt-Out Support: Does the response clearly explain how to opt out of data sharing?",
        "",
        "Return a JSON dictionary with the five scores and an overall average.",
        "",
    )
)

# === SCORING FUNCTION ===
def _parse_scores(reply):
    """Decode the grader's reply text into a dict of scores.

    The reply is first parsed as JSON as-is. If that fails, the text between
    the first ``{`` and the last ``}`` is parsed instead, so a reply wrapped
    in Markdown code fences or surrounded by prose is still accepted.

    Raises:
        ValueError: if no JSON object can be decoded from ``reply``
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    try:
        scores = json.loads(reply)
    except json.JSONDecodeError:
        start, end = reply.find("{"), reply.rfind("}")
        if start == -1 or end < start:
            # Nothing that looks like a JSON object: surface the original error.
            raise
        scores = json.loads(reply[start:end + 1])
    if not isinstance(scores, dict):
        raise ValueError(f"expected a JSON object, got {type(scores).__name__}")
    return scores


def score_response(prompt, response):
    """Grade one chatbot response with GPT-4o against the module-level rubric.

    Args:
        prompt: The user prompt the chatbot was answering.
        response: The chatbot response to grade.

    Returns:
        The dict decoded from the model's JSON reply.

    Raises:
        RuntimeError: if the API call fails, the reply is empty, or the reply
            cannot be decoded into a JSON object. The underlying exception is
            chained as ``__cause__``.
    """
    system_prompt = rubric.strip()
    user_prompt = f"""
Prompt:
{prompt}

Response:
{response}

Evaluate this response according to the 5 dimensions and return a JSON object like:
{{
  "clarity": 4,
  "accessibility": 5,
  "tone": 5,
  "completeness": 4,
  "opt_out": 3,
  "average": 4.2
}}
"""
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.0,
            # JSON mode: ask the API for a syntactically valid JSON object
            # instead of free text. Both prompts mention "JSON", which this
            # mode requires.
            response_format={"type": "json_object"},
        )
        reply = completion.choices[0].message.content
        # ``content`` can be None or blank; report that explicitly instead of
        # failing inside ``.strip()`` or ``json.loads``.
        if not reply or not reply.strip():
            raise ValueError("model returned an empty reply")
        return _parse_scores(reply.strip())

    except Exception as e:
        raise RuntimeError(f"GPT-4o failed to grade: {e}") from e

# === MAIN LOOP ===
# Grade every non-blank record of INPUT_FILE. Successfully graded records are
# written to OUTPUT_FILE with an added "score" key; failures are written to
# ERROR_FILE. Files are opened as UTF-8 explicitly so the run does not depend
# on the platform's default encoding.
with open(INPUT_FILE, "r", encoding="utf-8") as fin, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as fout, \
     open(ERROR_FILE, "w", encoding="utf-8") as ferr:

    for line in fin:
        if not line.strip():
            continue

        # Reset the IDs for each record. The error handler below must not read
        # them from `ex`: when json.loads() fails, `ex` is either unbound
        # (first record) or still holds the previous record.
        prompt_id = gen_id = None

        try:
            ex = json.loads(line)
            if not isinstance(ex, dict):
                raise ValueError("Record is not a JSON object")

            prompt_id = ex.get("prompt_id")
            gen_id = ex.get("gen_id")
            # IMPORTANT FIX: your generation file uses "prompt_text"
            # `or ""` also covers keys that are present with a null value.
            prompt = (ex.get("prompt_text") or "").strip()
            response = (ex.get("response") or "").strip()

            print(f"Scoring: {gen_id} ...", end=" ")

            if not response:
                raise ValueError("No response text found")

            score = score_response(prompt, response)
            ex["score"] = score

            fout.write(json.dumps(ex) + "\n")
            print("✅ Success")

        except Exception as e:
            # Best-effort batch job: log the failure and move on to the next
            # record instead of aborting the whole run.
            err = {
                "prompt_id": prompt_id,
                "gen_id": gen_id,
                "error": str(e),
            }
            ferr.write(json.dumps(err) + "\n")
            print(f"⚠️ Error: {e}")

        # Pause between records to pace the API calls.
        time.sleep(1)


Scoring: P1_llama_gen1 ... ✅ Success
Scoring: P1_mistral_gen1 ... ✅ Success
Scoring: P1_llama_gen2 ... ✅ Success
Scoring: P1_mistral_gen2 ... ✅ Success
Scoring: P1_llama_gen3 ... ✅ Success
Scoring: P1_mistral_gen3 ... ✅ Success
Scoring: P2_llama_gen1 ... ✅ Success
Scoring: P2_mistral_gen1 ... ✅ Success
Scoring: P2_llama_gen2 ... ✅ Success
Scoring: P2_mistral_gen2 ... ✅ Success
Scoring: P2_llama_gen3 ... ✅ Success
Scoring: P2_mistral_gen3 ... ✅ Success
Scoring: P3_llama_gen1 ... ✅ Success
Scoring: P3_mistral_gen1 ... ✅ Success
Scoring: P3_llama_gen2 ... ✅ Success
Scoring: P3_mistral_gen2 ... ⚠️ Error: GPT-4o failed to grade: Expecting value: line 1 column 1 (char 0)
Scoring: P3_llama_gen3 ... ✅ Success
Scoring: P3_mistral_gen3 ... ✅ Success
Scoring: P4_llama_gen1 ... ✅ Success
Scoring: P4_mistral_gen1 ... ✅ Success
Scoring: P4_llama_gen2 ... ⚠️ Error: GPT-4o failed to grade: Expecting value: line 1 column 1 (char 0)
Scoring: P4_mistral_gen2 ... ✅ Success
Scoring: P4_llama_gen3 ... ✅ Suc