In [1]:
# STEP 2: Import required modules
import json
import pandas as pd

# STEP 3: Load all three JSON files
def _load_json(path):
    """Parse the JSON file at *path* and return its content.

    The file is opened explicitly as UTF-8 (the standard JSON encoding) so
    that non-ASCII text is decoded the same way on every platform instead of
    depending on the locale's default encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


basic_data = _load_json("basemodel_evaluation_data.json")
rag_data = _load_json("rag_responses_vanilla_evaluation_data.json")
rag_prompt_data = _load_json("rag_responses_prompt_evaluation_data.json")

# STEP 4: Sanity check — all must have same number of questions
# Raise instead of `assert`: assertions are stripped when Python runs with -O,
# and the rows are later paired by position, so a length mismatch must never
# pass silently.
if not (len(basic_data) == len(rag_data) == len(rag_prompt_data)):
    raise ValueError(
        "Mismatch in number of questions. "
        f"(basic={len(basic_data)}, rag={len(rag_data)}, "
        f"rag+prompt={len(rag_prompt_data)})"
    )

# STEP 5: Combine into DataFrame for 3-way comparison
# Column order of the resulting CSV; listing it explicitly also guarantees a
# proper header row when the input files are empty.
columns = [
    "question",
    "answer_a", "answer_b", "answer_c",
    "system_a", "system_b", "system_c",
]

merged_rows = []
for i, (b, r, rp) in enumerate(zip(basic_data, rag_data, rag_prompt_data)):
    # Entries are paired purely by position, so confirm that all three files
    # really hold the same question at this position; otherwise the answers
    # being compared would belong to different questions.
    if not (b["question"] == r["question"] == rp["question"]):
        raise ValueError(
            f"Question mismatch at index {i}: "
            f"{b['question']!r} / {r['question']!r} / {rp['question']!r}"
        )
    merged_rows.append({
        "question": b["question"],
        "answer_a": b["answer"],
        "answer_b": r["answer"],
        "answer_c": rp["answer"],
        "system_a": "Basic",
        "system_b": "RAG",
        "system_c": "RAG+Prompt",
    })

df = pd.DataFrame(merged_rows, columns=columns)

# STEP 6: Save CSV
# Keep the path in one variable so the success message always reports the
# file that was actually written (the name contains a space, not an
# underscore, after "LLM").
output_csv = "LLM three_way_answers.csv"
df.to_csv(output_csv, index=False)
print(f"✅ CSV created: {output_csv}")



✅ CSV created: LLM_three_way_answers.csv


In [4]:
# STEP 1: Install and import packages
!pip install --upgrade openai pandas

import openai
import pandas as pd

# STEP 2: Set your OpenAI API key
openai.api_key = "sk-proj-"

# STEP 3: Load CSV with format: question, answer_a, answer_b, answer_c, system_a, system_b, system_c
input_path = "LLM three_way_answers.csv"
df = pd.read_csv(input_path)

# STEP 4: Define precision-focused 3-way judge function
def judge_best_of_three(question, answer_a, answer_b, answer_c, *,
                        system_a="Basic", system_b="RAG", system_c="RAG+Prompt",
                        model="gpt-4", temperature=0.2):
    """Ask an LLM judge which of three answers is the most precise and factual.

    Args:
        question: The question all three answers respond to.
        answer_a, answer_b, answer_c: The candidate answers.
        system_a, system_b, system_c: Names of the systems that produced the
            answers, shown to the judge as "(from <name>)". The defaults are
            the labels this pipeline writes into its CSV. Pass None (or an
            empty string) to hide a system's identity from the judge.
        model: Chat model used as the judge.
        temperature: Sampling temperature for the judge.

    Returns:
        The judge's reply with surrounding whitespace removed; an empty
        string if the API returned no text content.
    """
    # Labels are explicit parameters: the function must not depend on a
    # global loop variable being defined by whoever happens to call it.
    label_a = f"Answer A (from {system_a})" if system_a else "Answer A"
    label_b = f"Answer B (from {system_b})" if system_b else "Answer B"
    label_c = f"Answer C (from {system_c})" if system_c else "Answer C"

    prompt = f"""
You are an impartial evaluator. Given a question and three answers (A, B, and C), choose the one that provides the most precise, accurate, and specific factual information. Avoid generic or vague answers. Assume the question is being asked by someone who wants a clear, concrete response and already understands the context (e.g., the university, country, or program involved).

Question: {question}

{label_a}: {answer_a}
{label_b}: {answer_b}
{label_c}: {answer_c}

Which answer is best? Respond only with "A", "B", or "C" and provide a short justification based strictly on precision and factual specificity.
"""

    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful and impartial evaluator."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature
    )

    # `content` may be None (e.g. a refusal), so guard before stripping.
    content = response.choices[0].message.content
    return (content or "").strip()

# STEP 5: Run judgment
import re

# The judge is asked to lead with "A", "B" or "C", but replies often arrive
# as '"B" - ...', '**C**' or 'Answer B ...'. Accept optional leading
# punctuation and an optional "Answer" prefix, and require the letter to be a
# standalone token so that words like "Answer" or "Both" are not mistaken
# for a verdict.
_WINNER_PATTERN = re.compile(r'\W*(?:(?i:answer)\W*)?([ABC])\b')


def _extract_winner_letter(judgment):
    """Return "A", "B" or "C" if *judgment* clearly leads with one, else None."""
    match = _WINNER_PATTERN.match(judgment or "")
    return match.group(1) if match else None


judgments = []
winners = []

for index, row in df.iterrows():
    judgment = judge_best_of_three(row['question'], row['answer_a'], row['answer_b'], row['answer_c'])
    judgments.append(judgment)

    # Map the winning letter back to the system name stored in the same row;
    # an empty or unparseable reply is recorded as "Unclear" instead of
    # raising or being attributed to the wrong system.
    winner_letter = _extract_winner_letter(judgment)
    winner = row[f'system_{winner_letter.lower()}'] if winner_letter else "Unclear"
    winners.append(winner)

# STEP 6: Add results to DataFrame
df['judgment'] = judgments
df['winner_system'] = winners

# STEP 7: Write the judged DataFrame to disk (without the row index) and
# report where it was saved
output_path = "three_way_judged_output.csv"
df.to_csv(path_or_buf=output_path, index=False)
print("✅ Judging complete. Results saved to {}".format(output_path))




AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}