In [1]:
import json
import random

In [2]:
def clean(text):
    """Return `text` trimmed, lowercased, and with every whitespace run collapsed to one space.

    Used to build lookup keys so that ids/questions match despite
    differences in casing or spacing.
    """
    lowered = text.strip().lower()
    tokens = lowered.split()
    return " ".join(tokens)

In [3]:
# === Load test.jsonl (Qasper test set) ===
# JSON Lines: one JSON object per line, collected into the list `papers`.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default, and blank lines (e.g. a trailing newline) are skipped instead of
# crashing json.loads.
with open("raw_dataset/test.jsonl", "r", encoding="utf-8") as f:
    papers = [json.loads(line) for line in f if line.strip()]

In [4]:
import re

def clean_output(text):
    """Tidy a raw model generation into a single-line answer string.

    Steps:
      1. Trim the text and split it into lines.
      2. If the first line ends with "?" AND more lines follow, drop it — it
         looks like the model echoing the question before answering.
      3. Join the remaining lines with spaces and collapse runs of
         spaces/tabs/newlines into a single space.

    Args:
        text: Raw generated text.

    Returns:
        The cleaned answer; "" when `text` is empty or whitespace only.
    """
    lines = text.strip().splitlines()

    # Only strip the leading "question" when something comes after it.
    # Without this guard, a one-line answer that merely ends in "?" would be
    # discarded entirely and the prediction would become an empty string.
    if len(lines) > 1 and lines[0].strip().endswith("?"):
        lines = lines[1:]

    # Rejoin and normalize whitespace
    cleaned = " ".join(lines).strip()
    cleaned = re.sub(r"[ \t\r\n]+", " ", cleaned)

    return cleaned

In [5]:
# === Load QA predictions ===
# Maps (cleaned paper id, cleaned question) -> cleaned QA prediction text.
# Created empty here; the following cell fills it from the predictions file.
qa_preds = {}

In [6]:
# Fill qa_preds from the QA predictions JSONL file (one record per line).
# UTF-8 is set explicitly so decoding does not depend on the platform default.
with open("../qa_finetune/qwen_qa_test_prediction_with_ids_and_questions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing json.loads
        item = json.loads(line)
        # Normalized (id, question) pair is the join key across all inputs.
        key = (clean(item["id"]), clean(item["question"]))
        qa_preds[key] = clean_output(item["output"])

In [16]:
# === Load Full-text predictions ===
# Both dicts are keyed by (cleaned paper id, cleaned question):
#   full_preds   -> cleaned generated answer
#   gold_answers -> the record's "expected_output" ("" when the field is absent)
full_preds = {}
gold_answers = {}
# UTF-8 is set explicitly so decoding does not depend on the platform default.
with open("../full_text_finetune/full_text_predictions.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of crashing json.loads
        item = json.loads(line)
        key = (clean(item["id"]), clean(item["question"]))
        full_preds[key] = clean_output(item["generated_answer"])
        gold_answers[key] = item.get("expected_output", "")

In [17]:
# Holds one record (dict) per question that has both a QA and a full-text
# prediction; populated by the loop in the next cell.
final_dataset = []

In [None]:
# Build one evaluation record per question that has BOTH a QA prediction and a
# full-text prediction. The two answers are shown to the judge as "A" and "B"
# in random order; label_a/label_b record which system produced which.

# Dedicated, seeded generator so the A/B assignment is reproducible across
# runs and independent of any other use of the global `random` state.
AB_SHUFFLE_SEED = 42
ab_rng = random.Random(AB_SHUFFLE_SEED)

# Start from an empty list so re-running this cell never appends duplicates.
final_dataset = []

for paper in papers:
    paper_id = paper["id"]

    for question in paper["qas"]["question"]:
        key = (clean(paper_id), clean(question))
        # Skip questions that are missing from either prediction file.
        if key not in qa_preds or key not in full_preds:
            continue

        answer_qa = qa_preds[key]
        answer_full = full_preds[key]
        # gold_answers is filled together with full_preds, so the key exists here.
        gold_answer = gold_answers[key]

        # Randomize order
        if ab_rng.random() < 0.5:
            answer_a, answer_b = answer_qa, answer_full
            label_a, label_b = "QA", "Full"
        else:
            answer_a, answer_b = answer_full, answer_qa
            label_a, label_b = "Full", "QA"

        prompt = f"""
You are evaluating two AI-generated answers to a question about a scientific research paper. The answers may differ in length, formality, or style.

Please focus on the content quality — not formatting (gold_answers might not always be well formatted) — and follow the instructions below.

Question:
{question}

Ground Truth Answer:
{gold_answer}

Answer A:
{answer_a}

Answer B:
{answer_b}

Tasks:
1. Choose the better answer (A, B, or Tie).
2. Rate each answer from 1–5 based on relevance, correctness, and completeness.
3. Briefly comment on how different the two answers are.
4. Briefly comment on how each compares to the ground truth.
""".strip()

        final_dataset.append({
            "paper_id": paper_id,
            "question": question,
            "gold_answer": gold_answer,
            "answer_a": answer_a,
            "answer_b": answer_b,
            "label_a": label_a,
            "label_b": label_b,
            "prompt": prompt
        })

In [19]:
# === Save output ===
# The path lives in one variable so the confirmation message always names the
# file that was actually written (previously the message named a different file).
output_path = "llm_eval_input2.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(final_dataset, f, indent=2)

print(f"✅ Saved {len(final_dataset)} examples to {output_path}")

✅ Saved 1150 examples to llm_eval_input.json
