In [18]:
import json
import random
import pandas as pd
from collections import Counter, defaultdict
from agents.dataloader import load_dataset_by_name

# Load dataset
# `load_dataset_by_name` is a project helper; only its "test" split is used.
name = "webnlg"
data = load_dataset_by_name(name)
dataset = list(data["test"])  # materialized as a list: it is indexed and iterated several times below

# Prediction files, relative to the notebook's working directory.
# - structgpt: plain text, read line by line (one prediction per line).
# - struct / e2e / agent: read with load_json_preds, i.e. one JSON object per
#   line carrying "index" and "prediction" keys, despite the .json extension.
structgpt = "results/factual_struct_gpt_base.txt"
struct = "results/factual_struct_gpt_base.json"
e2e = "results/webnlg_e2e.json"
agent = "results/webnlg_agent.json"

Loading dataset: webnlg


In [19]:
def sample_indices_by_category_and_triple_size(dataset, total_samples=105, seed=42):
    """Pick example ids stratified by (category, number of triples).

    Examples whose triple list (``"triples"``, falling back to ``"input"``)
    has at most one entry are ignored. Each remaining stratum contributes
    ``max(1, int(total_samples * stratum_size / eligible_size))`` examples,
    taken from the front of a shuffled copy of the stratum. The selection is
    then de-duplicated by id, topped up at random from unused eligible
    examples if it is short, and randomly cut down if it is too long.

    Args:
        dataset: iterable of dict-like examples with "id", "category" and
            "triples" (or "input") entries.
        total_samples: target number of ids to return.
        seed: value passed to ``random.seed``; note this reseeds the global
            ``random`` module state.

    Returns:
        List of example ids. Fewer than ``total_samples`` ids are returned
        when there are not enough eligible examples.
    """
    def triple_count(example):
        return len(example.get("triples", example.get("input", [])))

    # Keep only multi-triple examples.
    eligible = [ex for ex in dataset if triple_count(ex) > 1]

    # Bucket by stratum; dict order is first-seen order, which fixes the
    # order in which the RNG is consumed below.
    buckets = defaultdict(list)
    for ex in eligible:
        buckets[(ex['category'], triple_count(ex))].append(ex)

    # Proportional quota per stratum, never less than one.
    population = len(eligible)
    quotas = {
        key: max(1, int(total_samples * len(members) / population))
        for key, members in buckets.items()
    }

    random.seed(seed)
    picked = []
    for key, members in buckets.items():
        random.shuffle(members)
        picked += members[:quotas[key]]

    # Drop repeated ids, keeping the first occurrence.
    seen_ids = set()
    unique = []
    for ex in picked:
        ex_id = ex["id"]
        if ex_id in seen_ids:
            continue
        seen_ids.add(ex_id)
        unique.append(ex)

    # Top up from eligible examples that have not been used yet.
    leftovers = [ex for ex in eligible if ex["id"] not in seen_ids]
    shortfall = total_samples - len(unique)
    if shortfall > 0:
        if len(leftovers) >= shortfall:
            unique.extend(random.sample(leftovers, shortfall))
        else:
            unique.extend(leftovers)

    # Too many (the per-stratum minimum of one can overshoot): cut at random.
    if len(unique) > total_samples:
        unique = random.sample(unique, total_samples)

    return [ex["id"] for ex in unique]


In [20]:
# def sample_indices_by_category(dataset, total_samples=105, seed=42):
#     # Exclude triple size 1
#     filtered = [x for x in dataset if len(x.get("triples", x.get("input", []))) > 1]
#     category_counts = Counter(x['category'] for x in filtered)
#     # Floor division for initial allocation
#     category_samples = {
#         cat: max(1, int(total_samples * count / len(filtered)))
#         for cat, count in category_counts.items()
#     }
#     by_category = defaultdict(list)
#     for x in filtered:
#         by_category[x['category']].append(x)
#     random.seed(seed)
#     sampled = []
#     # Category-based sampling
#     for cat, items in by_category.items():
#         n = category_samples[cat]
#         random.shuffle(items)
#         sampled.extend(items[:n])
#     # Remove duplicates if any (just in case)
#     seen = set()
#     unique_sampled = []
#     for x in sampled:
#         if x["id"] not in seen:
#             unique_sampled.append(x)
#             seen.add(x["id"])
#     sampled = unique_sampled
#     # Fill up to total_samples
#     remaining = [x for x in filtered if x["id"] not in seen]
#     needed = total_samples - len(sampled)
#     if needed > 0:
#         sampled.extend(random.sample(remaining, needed))
#     # If oversampled, randomly drop extras
#     if len(sampled) > total_samples:
#         sampled = random.sample(sampled, total_samples)
#     return [item["id"] for item in sampled]


def load_json_preds(file_path):
    """Load predictions from a JSON Lines file.

    Each non-blank line must be a JSON object with "index" and "prediction"
    keys. Blank or whitespace-only lines (for example a trailing newline at
    the end of the file) are skipped rather than passed to ``json.loads``.

    Args:
        file_path: path to the UTF-8 encoded JSON Lines file.

    Returns:
        Dict mapping each entry's "index" to its "prediction". If an index
        occurs more than once, the last occurrence wins.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry lacks "index" or "prediction".
    """
    preds = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            entry = json.loads(line)
            preds[entry["index"]] = entry["prediction"]
    return preds

def _eval_row(index, model, triples, output):
    """Return one annotation-sheet row with the four rating cells left blank."""
    return {
        "Index": index,
        "Model": model,
        "Triples": triples,
        "Output": output,
        "Fluency": "",
        "Grammaticality": "",
        "Addition": "",
        "Omission": "",
    }


def build_rows(indices, dataset, e2e_preds, agent_preds, struct_lines, columns):
    """Build the annotation DataFrame for the given example ids.

    For every id in ``indices`` (in that order) up to four rows are emitted,
    in the order reference, E2E, AGENT, StructGPT4. A row is left out when
    its text is unavailable: no references on the item, id missing from the
    prediction dict, or id beyond the end of ``struct_lines``.

    Args:
        indices: iterable of example ids to include.
        dataset: iterable of dict-like examples with "id", "triples" (or
            "input") and optionally "references".
        e2e_preds: mapping id -> E2E prediction text.
        agent_preds: mapping id -> AGENT prediction text.
        struct_lines: sequence of StructGPT4 predictions, indexed by id.
        columns: column names (and order) of the resulting DataFrame.

    Returns:
        pandas.DataFrame with one row per (example, system) pair.

    Raises:
        KeyError: if an id in ``indices`` does not occur in ``dataset``.
    """
    # One pass to index the dataset instead of a linear scan per id.
    # setdefault keeps the first item for a repeated id, matching a
    # front-to-back search.
    items_by_id = {}
    for item in dataset:
        items_by_id.setdefault(item["id"], item)

    rows = []
    for idx in indices:
        if idx not in items_by_id:
            raise KeyError(f"id {idx!r} from `indices` not found in dataset")
        item = items_by_id[idx]
        item_id = item["id"]
        triples = "\n".join(item.get("triples", item.get("input", [])))

        # Reference: one reference picked with the global `random` state, so
        # the choice depends on whatever seeded `random` beforehand.
        references = item.get("references", [])
        if references:
            rows.append(_eval_row(item_id, "reference", triples, random.choice(references)))
        # E2E
        if item_id in e2e_preds:
            rows.append(_eval_row(item_id, "E2E", triples, e2e_preds[item_id]))
        # AGENT
        if item_id in agent_preds:
            rows.append(_eval_row(item_id, "AGENT", triples, agent_preds[item_id]))
        # StructGPT4: the id is used as a position in struct_lines.
        # NOTE(review): assumes ids are 0-based line numbers of the StructGPT
        # prediction file — confirm against how that file was produced.
        if item_id < len(struct_lines):
            rows.append(_eval_row(item_id, "StructGPT4", triples, struct_lines[item_id]))
    return pd.DataFrame(rows, columns=columns)

In [21]:
# Sample indices
# Call the sampler that is defined in this notebook: `sample_indices_by_category`
# exists only as commented-out code, so calling it raises NameError on a
# fresh kernel (Restart & Run All).
# NOTE(review): unless the sampler's final down-sampling step runs, ids come
# back grouped by stratum, so the 5 pilot items come from the first strata
# rather than being a random subset — confirm this is acceptable.
all_sampled_indices = sample_indices_by_category_and_triple_size(dataset, total_samples=105, seed=42)
pilot_indices = all_sampled_indices[:5]
human_eval_indices = all_sampled_indices[5:105]

# Load predictions
e2e_preds = load_json_preds(e2e)
agent_preds = load_json_preds(agent)
with open(structgpt, "r", encoding="utf-8") as f:
    struct_lines = [line.strip() for line in f]

# Define columns
columns = ["Index", "Model", "Triples", "Output", "Fluency", "Grammaticality", "Addition", "Omission"]

# Build DataFrames
# build_rows draws references with the global `random` state that the sampler
# seeded above, so keep the pilot frame before the eval frame to reproduce
# the same reference choices on every run.
df_pilot = build_rows(pilot_indices, dataset, e2e_preds, agent_preds, struct_lines, columns)
df_eval = build_rows(human_eval_indices, dataset, e2e_preds, agent_preds, struct_lines, columns)
guidelines_columns = ["Criteria", "Definition", "Evaluation Points", "Scoring Scales", "Examples Triples", "Example Output", "Score", "Notes"]
df_blank = pd.DataFrame(columns=guidelines_columns)  # header-only sheet, filled in by hand

# Save to Excel
with pd.ExcelWriter("webnlg_human_eval_study.xlsx") as writer:
    df_blank.to_excel(writer, sheet_name="Evaluation Guidelines", index=False)
    df_pilot.to_excel(writer, sheet_name="Pilot Study", index=False)
    df_eval.to_excel(writer, sheet_name="Human Evaluation", index=False)


print("Excel file created: webnlg_human_eval_study.xlsx")
print(f"Total examples in test set: {len(dataset)}")
filtered = [x for x in dataset if len(x.get("triples", x.get("input", []))) > 1]
print(f"Total after removing triple size 1: {len(filtered)}")
print(f"Total sampled indices: {len(all_sampled_indices)}")
print(f"Pilot indices: {len(pilot_indices)}")
print(f"Human evaluation indices: {len(human_eval_indices)}")

Excel file created: webnlg_human_eval_study.xlsx
Total examples in test set: 1779
Total after removing triple size 1: 1410
Total sampled indices: 105
Pilot indices: 5
Human evaluation indices: 100


In [None]:
# import json
# from agents.dataloader import load_dataset_by_name

# # Load WebNLG test set
# name = "webnlg"
# data = load_dataset_by_name(name)
# dataset = list(data["test"])

# Load predictions (one StructGPT prediction per line)
with open(structgpt, "r", encoding="utf-8") as f:
    predictions = [line.strip() for line in f]

# A length mismatch means some examples receive an empty prediction below (or
# some prediction lines are never used); report it instead of padding silently.
if len(predictions) != len(dataset):
    print(f"Warning: {len(predictions)} prediction lines for {len(dataset)} dataset items")

# Build json objects
# Predictions are matched to examples by position in `dataset`.
# NOTE(review): other cells index the same text file by item["id"]; both agree
# only if ids are 0-based positions — confirm.
results = []
for i, item in enumerate(dataset):
    output = predictions[i] if i < len(predictions) else ""
    result = {
        "index": item["id"],
        "prediction": output
    }
    results.append(result)

# Save as JSON Lines (one object per line), the format load_json_preds reads.
# Written to `struct` so this writer and its readers share a single path.
with open(struct, "w", encoding="utf-8") as f:
    for result in results:
        f.write(json.dumps(result, ensure_ascii=False) + "\n")

print(f"Done! Saved to {struct}")


Loading dataset: webnlg
Done! Saved to results/factual_struct_gpt_base.json


In [32]:
# Reload every system's predictions as {index: prediction} dicts.
# `struct` points at the JSON Lines file produced from the StructGPT text
# predictions, so all three systems are now available in the same form.
e2e_preds = load_json_preds(e2e)
agent_preds = load_json_preds(agent)
struct_preds = load_json_preds(struct)

def load_texts(file_path, text_dict):
    """
    Save each value from a dict as a line in a text file.

    Despite its name, this function writes rather than loads. The output path
    is ``file_path`` with a trailing ".json" removed (no new extension is
    added); a path that does not end in ".json" is used unchanged. Only the
    suffix is stripped, so a ".json" occurring elsewhere in the path (for
    example in a directory name) is left intact.

    Args:
        file_path: path the output name is derived from; it is not read.
        text_dict: mapping whose values are written in iteration order. Each
            value is converted with ``str()``, newlines inside it become
            spaces, and surrounding whitespace is stripped.
    """
    suffix = ".json"
    if file_path.endswith(suffix):
        txt_path = file_path[:-len(suffix)]
    else:
        txt_path = file_path
    with open(txt_path, "w", encoding="utf-8") as f:
        for text in text_dict.values():
            # str() first so non-string values do not fail on .replace().
            f.write(str(text).replace("\n", " ").strip() + "\n")
    print(f"Texts saved to {txt_path}")

# Write one plain-text file per system (one prediction per line). Each file
# lands at the system's predictions path with the ".json" part removed.
load_texts(e2e, e2e_preds)
load_texts(agent, agent_preds)
load_texts(struct, struct_preds)


Texts saved to results/webnlg_e2e
Texts saved to results/webnlg_agent
Texts saved to results/factual_struct_gpt_base


In [41]:
# Peek at the first test example to check the record schema
# (id, triples, num_triples, category, references).
dataset[0]

{'id': 0,
 'triples': ['Estádio_Municipal_Coaracy_da_Mata_Fonseca | location | Arapiraca',
  'Agremiação_Sportiva_Arapiraquense | league | Campeonato_Brasileiro_Série_C',
  'Campeonato_Brasileiro_Série_C | country | Brazil',
  'Agremiação_Sportiva_Arapiraquense | nickname | "\'\'Alvinegro"',
  'Agremiação_Sportiva_Arapiraquense | ground | Estádio_Municipal_Coaracy_da_Mata_Fonseca'],
 'num_triples': '5',
 'category': 'SportsTeam',
 'references': ['Estádio Municipal Coaracy da Mata Fonseca is the name of the ground of Agremiação Sportiva Arapiraquense in Arapiraca. Agremiação Sportiva Arapiraquense, nicknamed "Alvinegro", lay in the Campeonato Brasileiro Série C league from Brazil.',
  'Estádio Municipal Coaracy da Mata Fonseca is the name of the ground of Agremiação Sportiva Arapiraquense in Arapiraca. Alvinegro, the nickname of Agremiação Sportiva Arapiraquense, play in the Campeonato Brasileiro Série C league from Brazil.']}

In [44]:
import json
from agents.dataloader import load_dataset_by_name

# Load the "webnlg_hf" test split.
# NOTE(review): this rebinds `name`, `data` and `dataset`, replacing the
# "webnlg" dataset loaded at the top of the notebook for any cell run after
# this one — confirm that is intended.
name = "webnlg_hf"
data = load_dataset_by_name(name)
dataset = list(data["test"])

# Write one line per example: its triples joined by ", ", with newlines inside
# a triple flattened to spaces.
with open("results/triples", "w", encoding="utf-8") as f:
    for item in dataset:
        # "triples" takes priority; "input" is used when it is missing or empty.
        triples = item.get("triples", None) or item.get("input", None)
        if not triples:
            # NOTE(review): examples without triples produce no line at all, so
            # line numbers can drift from dataset positions — confirm that
            # consumers of this file do not rely on line alignment.
            continue
        # A bare (non-list) value is handled as a single triple.
        if not isinstance(triples, list):
            triples = [triples]
        line = ", ".join(str(t).replace("\n", " ").strip() for t in triples)
        f.write(line + "\n")


Loading dataset: webnlg_hf


In [4]:
# # Convert your dataset to a list if not already done
# all_examples = list(dataset)

# # 1. Count examples per category
# category_counts = Counter(x['category'] for x in all_examples)

# # 2. Proportional allocation of 105 samples across categories
# total_samples = 105
# category_samples = {
#     cat: max(1, round(total_samples * count / len(all_examples)))
#     for cat, count in category_counts.items()
# }

# # 3. Group by category
# by_category = defaultdict(list)
# for x in all_examples:
#     by_category[x['category']].append(x)

# # 4. Randomly sample from each category
# random.seed(42)
# sampled = []
# for cat, items in by_category.items():
#     n = category_samples[cat]
#     random.shuffle(items)
#     sampled.extend(items[:n])

# # 5. If oversampled, randomly drop extra to get exactly 105
# if len(sampled) > total_samples:
#     sampled = random.sample(sampled, total_samples)

# # 6. Get the indices (IDs) for these 105 samples
# all_sampled_indices = [item["id"] for item in sampled]

# # 7. Divide into pilot (first 5) and human evaluation (next 100)
# pilot_indices = all_sampled_indices[:5]
# human_eval_indices = all_sampled_indices[5:105]

# columns = ["Index", "Model", "Triples", "Output", "Fluency", "Grammaticality", "Addition", "Omission"]
# print(f"pilot_indices: {pilot_indices}")
# print(f"human_eval_indices: {human_eval_indices}")

In [5]:
# # --- Load E2E and Agent predictions ---
# def load_jsonl(file_path):
#     preds = {}
#     with open(file_path, "r", encoding="utf-8") as f:
#         for line in f:
#             entry = json.loads(line)
#             preds[entry["index"]] = entry["prediction"]
#     return preds

# e2e_preds = load_jsonl("results/webnlg_e2e.json") 
# agent_preds = load_jsonl("results/webnlg_agent.json")

# # --- Load StructGPT4 predictions ---
# with open("results/factual_struct_gpt_base.txt", "r", encoding="utf-8") as f:
#     struct_lines = [line.strip() for line in f.readlines()]

In [6]:
# pilot_rows = []
# for idx in pilot_indices:
#     item = dataset[idx]
#     triples = "\n".join(item.get("triples", item.get("input", [])))
#     # Reference row
#     ref = random.choice(item.get("references", []))
#     pilot_rows.append({
#         "Index": item["id"],
#         "Model": "reference",
#         "Triples": triples,
#         "Output": ref,
#         "Fluency": "",
#         "Grammaticality": "",
#         "Addition": "",
#         "Omission": ""
#     })
#     # E2E row
#     if item["id"] in e2e_preds:
#         pilot_rows.append({
#             "Index": item["id"],
#             "Model": "E2E",
#             "Triples": triples,
#             "Output": e2e_preds[item["id"]],
#             "Fluency": "",
#             "Grammaticality": "",
#             "Addition": "",
#             "Omission": ""
#         })
#         # AGENT row
#     if item["id"] in e2e_preds:
#         pilot_rows.append({
#             "Index": item["id"],
#             "Model": "E2E",
#             "Triples": triples,
#             "Output": agent_preds[item["id"]],
#             "Fluency": "",
#             "Grammaticality": "",
#             "Addition": "",
#             "Omission": ""
#         })
#     # StructGPT4 row (check if line exists)
#     if item["id"] < len(struct_lines):
#         pilot_rows.append({
#             "Index": item["id"],
#             "Model": "StructGPT4",
#             "Triples": triples,
#             "Output": struct_lines[item["id"]],
#             "Fluency": "",
#             "Grammaticality": "",
#             "Addition": "",
#             "Omission": ""
#         })

# df_pilot = pd.DataFrame(pilot_rows, columns=columns)


In [7]:
# # Save or append as a sheet in your Excel
# eval_rows = []
# for idx in human_eval_indices:
#     item = dataset[idx]
#     triples = "\n".join(item.get("triples", item.get("input", [])))
#     # 1. Reference
#     reference_text = random.choice(item.get("references", []))
#     eval_rows.append({
#         "Index": item["id"],
#         "Model": "reference",
#         "Triples": triples,
#         "Output": reference_text,
#         "Fluency": "",
#         "Grammaticality": "",
#         "Addition": "",
#         "Omission": ""
#     })
#     # 2. E2E
#     if item["id"] in e2e_preds:
#         eval_rows.append({
#             "Index": item["id"],
#             "Model": "E2E",
#             "Triples": triples,
#             "Output": e2e_preds[item["id"]],
#             "Fluency": "",
#             "Grammaticality": "",
#             "Addition": "",
#             "Omission": ""
#         })
#     # 3. StructGPT4
#     if item["id"] < len(struct_lines):
#         eval_rows.append({
#             "Index": item["id"],
#             "Model": "StructGPT4",
#             "Triples": triples,
#             "Output": struct_lines[item["id"]],
#             "Fluency": "",
#             "Grammaticality": "",
#             "Addition": "",
#             "Omission": ""
#         })
#     # 4. AGENT (reference blank)
#     eval_rows.append({
#         "Index": item["id"],
#         "Model": "AGENT",
#         "Triples": triples,
#         "Output": agent_preds[item["id"]],
#         "Fluency": "",
#         "Grammaticality": "",
#         "Addition": "",
#         "Omission": ""
#     })

# df_eval = pd.DataFrame(eval_rows, columns=columns)

# guidlines_columns = ["Criteria", "Definition", "Evaluation Points", "Scoring Scales", "Examples Triples", "Example Output", "Score", "Notes"]
# df_blank = pd.DataFrame(columns=guidlines_columns)

# # # Save or append as a sheet in your Excel
# # with pd.ExcelWriter("webnlg_human_eval_study.xlsx", mode="a", if_sheet_exists="replace") as writer:
# #     df_eval.to_excel(writer, sheet_name="Human Evaluation", index=False)

# # print("Human Evaluation sheet populated with all models and references (AGENT left blank for reference).")


In [8]:
# with pd.ExcelWriter("webnlg_human_eval_study.xlsx") as writer:
#     df_blank.to_excel(writer, sheet_name="Evluation Guidelines", index=False)
#     df_pilot.to_excel(writer, sheet_name="Pilot Study", index=False)
#     df_eval.to_excel(writer, sheet_name="Human Evaluation", index=False)
# print("Excel file created: webnlg_human_eval_study.xlsx")
