In [40]:
import pandas as pd
import random
import json

# Load dataset
# Fetch the corpus named below through the project's loader and keep the
# entries stored under its "test" key as a plain list for the cells that follow.
from agents.dataloader import load_dataset_by_name
name = "webnlg"
data = load_dataset_by_name(name)
# NOTE(review): later cells collect each example's "id" field and then use it
# as a position into this list (dataset[idx]), so they appear to assume ids are
# 0-based list positions — confirm against the loader.
dataset = list(data["test"])

Loading dataset: webnlg


In [41]:
import random
from collections import Counter, defaultdict

# Stratified sample: draw `total_samples` examples, allocated across categories
# in proportion to category size, then split them into a small pilot set and
# the main human-evaluation set.

# Shallow copy so the per-category lists built below can be shuffled freely.
all_examples = list(dataset)

# 1. Count examples per category
category_counts = Counter(x['category'] for x in all_examples)

# 2. Proportional allocation of the samples across categories.
#    max(1, ...) guarantees every category is represented; because of that
#    floor and of rounding, the per-category quotas may add up to slightly
#    more or slightly less than the target. Both cases are handled in step 5.
total_samples = 105
pilot_size = 5
category_samples = {
    cat: max(1, round(total_samples * count / len(all_examples)))
    for cat, count in category_counts.items()
}

# 3. Group by category
by_category = defaultdict(list)
for x in all_examples:
    by_category[x['category']].append(x)

# 4. Randomly sample from each category (seeded for reproducibility).
#    Examples that were not picked are kept aside for a possible top-up.
random.seed(42)
sampled = []
not_selected = []
for cat, items in by_category.items():
    n = category_samples[cat]
    random.shuffle(items)
    sampled.extend(items[:n])
    not_selected.extend(items[n:])

# 5. Reconcile with the target size.
if len(sampled) > total_samples:
    # Oversampled: keep a random subset (random.sample also randomises order).
    sampled = random.sample(sampled, total_samples)
else:
    # Undersampled: top up from examples not yet selected so the evaluation
    # set is not silently smaller than planned. If the dataset itself has
    # fewer than `total_samples` examples, every example ends up selected.
    shortfall = total_samples - len(sampled)
    if shortfall > 0:
        sampled.extend(random.sample(not_selected, min(shortfall, len(not_selected))))
    # `sampled` is still grouped by category at this point; shuffle so the
    # pilot slice below is not taken only from the first categories.
    random.shuffle(sampled)

# 6. Get the indices (IDs) for the sampled examples
all_sampled_indices = [item["id"] for item in sampled]

# 7. Divide into pilot (first `pilot_size`) and human evaluation (the rest)
pilot_indices = all_sampled_indices[:pilot_size]
human_eval_indices = all_sampled_indices[pilot_size:total_samples]


In [42]:
# Build the pilot-study sheet: for every pilot example, emit one row per
# available text (a human reference, the E2E prediction, the StructGPT4
# prediction) with empty rating columns for annotators to fill in.

# Column order of the sheet. Defined here so that this cell runs on a fresh
# kernel without depending on a later cell having been executed first.
columns = ["index", "model", "triples", "reference", "Fluency", "Grammaticality", "Addition", "Omission"]

# 1. Load E2E predictions (one JSON object per line, keyed by "index")
e2e_preds = {}
with open("results/webnlg_e2e.json", "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank or trailing empty lines
        entry = json.loads(line)
        e2e_preds[entry["index"]] = entry["prediction"]

# 2. Load StructGPT4 predictions (one prediction per line, looked up by id)
with open("results/factual_struct_gpt_base.txt", "r", encoding="utf-8") as f:
    struct_lines = [line.strip() for line in f]

pilot_rows = []
for idx in pilot_indices:
    # NOTE(review): idx is an example "id" used as a list position — confirm
    # that ids are 0-based positions in `dataset`.
    item = dataset[idx]
    triples = "\n".join(item.get("triples", item.get("input", [])))

    # Pick one human reference at random; fall back to an empty string when
    # the example has none instead of raising IndexError.
    references = item.get("references", [])
    ref = random.choice(references) if references else ""

    # (model label, text) pairs to emit for this example, in sheet order.
    outputs = [("reference", ref)]
    if item["id"] in e2e_preds:
        outputs.append(("E2E", e2e_preds[item["id"]]))
    # Lower bound prevents a negative id from indexing from the end of the list.
    if 0 <= item["id"] < len(struct_lines):
        outputs.append(("StructGPT4", struct_lines[item["id"]]))

    for model, text in outputs:
        pilot_rows.append({
            "index": item["id"],
            "model": model,
            "triples": triples,
            "reference": text,
            "Fluency": "",
            "Grammaticality": "",
            "Addition": "",
            "Omission": ""
        })

df_pilot = pd.DataFrame(pilot_rows, columns=columns)


In [43]:
import pandas as pd
import random
import json
from pathlib import Path

# Build the "Human Evaluation" sheet: for every sampled example, one row per
# text to be rated (a human reference, E2E, StructGPT4 when available) plus an
# AGENT row whose text is left blank, then write the sheet to the workbook.
#
# ---- Load or reuse dataset, human_eval_indices, e2e_preds, struct_lines ----
# (Reuse as in previous cells, or define as needed)

columns = ["index", "model", "triples", "reference", "Fluency", "Grammaticality", "Addition", "Omission"]

eval_rows = []
for idx in human_eval_indices:
    # NOTE(review): idx is an example "id" used as a list position — confirm
    # that ids are 0-based positions in `dataset`.
    item = dataset[idx]
    triples = "\n".join(item.get("triples", item.get("input", [])))

    # 1. Reference: one human reference picked at random; empty string when
    #    the example has none, instead of raising IndexError.
    references = item.get("references", [])
    reference_text = random.choice(references) if references else ""
    outputs = [("reference", reference_text)]
    # 2. E2E
    if item["id"] in e2e_preds:
        outputs.append(("E2E", e2e_preds[item["id"]]))
    # 3. StructGPT4 (lower bound prevents a negative id from indexing from the end)
    if 0 <= item["id"] < len(struct_lines):
        outputs.append(("StructGPT4", struct_lines[item["id"]]))
    # 4. AGENT (reference blank)
    outputs.append(("AGENT", ""))

    for model, text in outputs:
        eval_rows.append({
            "index": item["id"],
            "model": model,
            "triples": triples,
            "reference": text,
            "Fluency": "",
            "Grammaticality": "",
            "Addition": "",
            "Omission": ""
        })

df_eval = pd.DataFrame(eval_rows, columns=columns)

# Save or append as a sheet in your Excel.
# Append mode needs an existing workbook (and the openpyxl engine), so fall
# back to creating the file when it is not there yet; `if_sheet_exists` is only
# accepted by pandas in append mode.
workbook_path = "webnlg_human_eval_study.xlsx"
if Path(workbook_path).exists():
    writer_options = {"mode": "a", "if_sheet_exists": "replace", "engine": "openpyxl"}
else:
    writer_options = {"mode": "w"}
with pd.ExcelWriter(workbook_path, **writer_options) as writer:
    df_eval.to_excel(writer, sheet_name="Human Evaluation", index=False)

print("Human Evaluation sheet populated with all models and references (AGENT left blank for reference).")


Human Evaluation sheet populated with all models and references (AGENT left blank for reference).


In [44]:
# (Re)create the study workbook from scratch with its three sheets, written
# in the order listed below.
workbook_sheets = (
    ("Sheet1", df_blank),
    ("Pilot Study", df_pilot),
    ("Human Evaluation", df_eval),
)
with pd.ExcelWriter("webnlg_human_eval_study.xlsx") as writer:
    for sheet_title, frame in workbook_sheets:
        frame.to_excel(writer, sheet_name=sheet_title, index=False)
print("Excel file created: webnlg_human_eval_study.xlsx")


Excel file created: webnlg_human_eval_study.xlsx
