## Annotation Datasets for Evaluation of LLM: GPT output
- Dataframe Format: cid, text, assigned_label, annotation, mode, topic

- Point: Check whether LLM-assigned labels (e.g., Politics, Waste) for individual sampled posts are correct.

- 6 different sets: for each mode, set_A, set_B, agreement set (2x3), 400 total

- Hides `assigned_label` so as not to influence the annotator's decision.

- Used for further Inter-Annotator Agreement (Kappa score)...

In [17]:
import pandas as pd
import random
from pathlib import Path
from collections import defaultdict


# --- Input / output locations ---
BASE_PATH = Path("../../../DS_BachelorProject_PH/data/llm_subtopic/datasets")
MULTI_PATH = BASE_PATH / "BERTopic_posts_with_labels_multi_label.json"
SINGLE_PATH = BASE_PATH / "BERTopic_posts_with_labels_single_label.json"

# Annotation sheets are written here; create the folder (and parents) up front.
OUT_DIR = BASE_PATH / "annotations"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Parameters ---
# Only rows whose assigned_label is one of these are eligible for sampling.
labels_to_sample = [
    "Politics",
    "Renewable",
    "Nature",
    "Activism",
    "Fossil",
    "Waste",
    "Lifestyle",
    "Weather",
    "Disaster",
    "Agriculture",
    "Transportation",
    "Electricity",
    "Construction",
    "Climate",
    "Technology",
]

# Upper bound on the number of rows drawn per label.
samples_per_label = 25
# Total size of the shared agreement set; half is drawn from each mode.
agreement_size = 50

# --- Load JSON data, tagging every row with the mode it came from ---
df_multi = pd.read_json(MULTI_PATH).assign(mode="multi_label")
df_single = pd.read_json(SINGLE_PATH).assign(mode="single_label")

# --- Helper: stratified sampling by assigned_label ---
def stratify_samples(df, labels=None, n_per_label=None, seed=None):
    """Draw a label-stratified random subset of ``df``.

    Rows are grouped by their ``assigned_label`` value; for every label that
    is in ``labels``, at most ``n_per_label`` rows are drawn without
    replacement.

    Args:
        df: DataFrame with an ``assigned_label`` column.
        labels: Labels eligible for sampling. Defaults to the module-level
            ``labels_to_sample``.
        n_per_label: Maximum number of rows per label. Defaults to the
            module-level ``samples_per_label``.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the draw is reproducible; when ``None`` the shared
            ``random`` module state is used.

    Returns:
        A DataFrame holding the selected rows of ``df`` with their original
        index labels, columns and dtypes. Rows are grouped by label in order
        of first appearance. If nothing matches, the result is empty but
        still has the columns of ``df``.

    Raises:
        KeyError: if ``df`` has no ``assigned_label`` column.
    """
    wanted = labels_to_sample if labels is None else labels
    cap = samples_per_label if n_per_label is None else n_per_label
    rng = random if seed is None else random.Random(seed)

    # Bucket row *positions* (not row copies) per label. Iterating only the
    # label column avoids iterrows(), which builds a Series per row and
    # upcasts mixed dtypes.
    positions_by_label = defaultdict(list)
    for position, label in enumerate(df["assigned_label"]):
        if label in wanted:
            positions_by_label[label].append(position)

    chosen = []
    for positions in positions_by_label.values():
        chosen.extend(rng.sample(positions, min(cap, len(positions))))

    # Positional selection keeps dtypes and is safe with duplicate index labels.
    return df.iloc[chosen]

# --- Stratify datasets ---
# stratify_samples draws through the `random` module. Seed it here so the
# stratified draw is reproducible, like the random_state-seeded pandas draws below.
random.seed(42)
df_multi_sampled = stratify_samples(df_multi)
df_single_sampled = stratify_samples(df_single)

# --- Build agreement set (agreement_size // 2 rows from each mode) ---
df_agreement_multi = df_multi_sampled.sample(n=agreement_size // 2, random_state=42)
df_agreement_single = df_single_sampled.sample(n=agreement_size // 2, random_state=43)

# --- Remove agreement rows from the main sets to avoid duplication ---
df_multi_sampled = df_multi_sampled.drop(df_agreement_multi.index)
df_single_sampled = df_single_sampled.drop(df_agreement_single.index)

# --- Split into A/B annotators ---
def split_annotators(df, seed):
    """Shuffle ``df`` and cut it into two parts for annotators A and B.

    Args:
        df: DataFrame to divide.
        seed: ``random_state`` used for the shuffle.

    Returns:
        Tuple ``(first, second)`` of slices of the shuffled, re-indexed frame.
        ``first`` holds ``len(df) // 2`` rows and ``second`` the remainder,
        so ``second`` gets the extra row when the length is odd.
    """
    shuffled = df.sample(frac=1, random_state=seed)
    shuffled = shuffled.reset_index(drop=True)
    midpoint = len(shuffled) // 2
    first_part = shuffled.iloc[:midpoint]
    second_part = shuffled.iloc[midpoint:]
    return first_part, second_part

# Divide the non-agreement rows of each mode between annotators A and B.
multi_A, multi_B = split_annotators(df_multi_sampled, seed=1)
single_A, single_B = split_annotators(df_single_sampled, seed=2)

# --- Add agreement samples to both annotators ---
# Both annotators of a mode get the same agreement rows appended after their
# own rows; the index is renumbered.
multi_A, multi_B = [
    pd.concat([own_rows, df_agreement_multi], ignore_index=True)
    for own_rows in (multi_A, multi_B)
]
single_A, single_B = [
    pd.concat([own_rows, df_agreement_single], ignore_index=True)
    for own_rows in (single_A, single_B)
]

def format_for_annotation(df, task_type):
    """Build the sheet handed to an annotator.

    Only ``cid``, ``text``, ``topic`` and ``mode`` are carried over from
    ``df``; every other column is left out. Empty ``annotation`` and
    ``notes`` columns are added, followed by an ``instruction`` column
    listing the allowed answers.

    Args:
        df: DataFrame containing at least ``cid``, ``text``, ``topic``, ``mode``.
        task_type: ``"multi"`` selects the three-way answer scale; any other
            value selects the two-way scale.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    if task_type == "multi":
        allowed_answers = "Correct / Partial / Incorrect"
    else:
        allowed_answers = "Correct / Incorrect"

    carried_over = ["cid", "text", "topic", "mode"]
    return df[carried_over].assign(
        annotation="",
        notes="",
        instruction=allowed_answers,
    )

# --- Write annotation sheets ---
# Every sheet is written as CSV so the contents match the ".csv" file names.
# Previously five of the six were written with to_json(..., index=False),
# which pandas rejects for the default orient.
annotation_sheets = [
    ("multi_label_set_Abel.csv", multi_A, "multi"),
    ("multi_label_set_Tobias.csv", multi_B, "multi"),
    ("multi_label_agreement.csv", df_agreement_multi, "multi"),
    ("single_label_set_A.csv", single_A, "single"),
    ("single_label_set_B.csv", single_B, "single"),
    ("single_label_agreement.csv", df_agreement_single, "single"),
]
for file_name, frame, task_type in annotation_sheets:
    # index=False: the row index carries no information for annotators.
    format_for_annotation(frame, task_type).to_csv(OUT_DIR / file_name, index=False)

print("Annotation sets generated in:", OUT_DIR)


Annotation sets generated in: ../../../DS_BachelorProject_PH/data/llm_subtopic/datasets/annotations
