# About

Simple baselines for GhostWriter dataset (e.g. linear regression).

In [281]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [314]:
!pip install xgboost lightgbm tabulate

Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [320]:
import re, math, string, numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from IPython.display import display, Markdown
from datasets import DatasetDict, load_dataset
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.utils import check_random_state

rng = check_random_state(42)

In [284]:
# Helpers
_punct_tbl = str.maketrans("", "", "")
punct_set = set(string.punctuation)
url_pat = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
sent_pat = re.compile(r"[.!?]")
nonspace_pat = re.compile(r"\S")
alnum_pat = re.compile(r"[A-Za-z0-9]")

def safe_div(a, b):
    return a / b if b else 0.0

def tokenize_whitespace_strip_punct(text):
    toks = []
    for tok in text.split():
        # strip leading/trailing punctuation
        tok = tok.strip(string.punctuation)
        if tok and alnum_pat.search(tok):
            toks.append(tok)
    return toks

def extract_metrics_batch(batch):
    texts = batch["text"]
    out = {
        "n_chars": [],
        "n_chars_nospace": [],
        "n_words": [],
        "avg_word_len": [],
        "n_sents": [],
        "n_punct": [],
        "punct_ratio": [],
        "upper_ratio": [],
        "digit_ratio": [],
        "url_count": [],
        "type_token_ratio": [],
        "hapax_ratio": [],
    }
    for t in texts:
        t = t if isinstance(t, str) else ""
        n_chars = len(t)
        n_chars_nospace = len(re.findall(r"\S", t))
        n_punct = sum(1 for ch in t if (not ch.isalnum()) and (not ch.isspace()))
        n_upper = sum(1 for ch in t if ch.isupper())
        n_digit = sum(1 for ch in t if ch.isdigit())
        url_count = len(url_pat.findall(t))

        # sentence count (min 1 for non-empty text)
        sent_splits = [s for s in sent_pat.split(t) if s.strip()]
        n_sents = max(1, len(sent_splits)) if t.strip() else 0

        toks = tokenize_whitespace_strip_punct(t)
        n_words = len(toks)
        avg_word_len = safe_div(sum(len(w) for w in toks), n_words)

        # lexical diversity
        toks_lower = [w.lower() for w in toks]
        vocab = set(toks_lower)
        type_token_ratio = safe_div(len(vocab), n_words)

        # hapax legomena ratio
        from collections import Counter
        cnt = Counter(toks_lower)
        hapax_ratio = safe_div(sum(1 for w, c in cnt.items() if c == 1), n_words)

        punct_ratio = safe_div(n_punct, n_chars_nospace)
        upper_ratio = safe_div(n_upper, n_chars_nospace)
        digit_ratio = safe_div(n_digit, n_chars_nospace)

        out["n_chars"].append(n_chars)
        out["n_chars_nospace"].append(n_chars_nospace)
        out["n_words"].append(n_words)
        out["avg_word_len"].append(avg_word_len)
        out["n_sents"].append(n_sents)
        out["n_punct"].append(n_punct)
        out["punct_ratio"].append(punct_ratio)
        out["upper_ratio"].append(upper_ratio)
        out["digit_ratio"].append(digit_ratio)
        out["url_count"].append(url_count)
        out["type_token_ratio"].append(type_token_ratio)
        out["hapax_ratio"].append(hapax_ratio)

    return out

In [285]:
FEATURES = [
    "n_chars","n_chars_nospace","n_words","avg_word_len","n_sents","n_punct",
    "punct_ratio","upper_ratio","digit_ratio","url_count","type_token_ratio","hapax_ratio",
]

## Dataset & Feature Extraction

In [286]:
dataset = load_dataset("TheItCrOw/GhostWriter")

README.md:   0%|          | 0.00/846 [00:00<?, ?B/s]

data/train-00000-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

data/train-00001-of-00005.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

data/train-00002-of-00005.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

data/train-00003-of-00005.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

data/train-00004-of-00005.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

data/eval-00000-of-00001.parquet:   0%|          | 0.00/193M [00:00<?, ?B/s]

data/test-00000-of-00002.parquet:   0%|          | 0.00/197M [00:00<?, ?B/s]

data/test-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/382535 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/54648 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/109296 [00:00<?, ? examples/s]

In [299]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 382535
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 54648
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 109296
    })
})

In [None]:
# Remove empty texts
dataset01 = DatasetDict({
    split: ds.filter(lambda ex: isinstance(ex["text"], str) and ex["text"].strip() != "")
    for split, ds in dataset.items()
})

def clean(batch):
    cleaned = []
    for t in batch["text"]:
        # Collapse multiple spaces and trim
        t = re.sub(r"\s+", " ", t).strip()
        cleaned.append(t)
    return {"text": cleaned}

dataset01 = DatasetDict({
    split: ds.map(clean, batched=True, num_proc=32)
    for split, ds in dataset01.items()
})


Filter:   0%|          | 0/382535 [00:00<?, ? examples/s]

Filter:   0%|          | 0/54648 [00:00<?, ? examples/s]

Filter:   0%|          | 0/109296 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/382520 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/54647 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/109292 [00:00<?, ? examples/s]

In [None]:
# Feature extraction 
dataset_feats = DatasetDict({
    split: ds.map(extract_metrics_batch, batched=True, num_proc=32)
    for split, ds in dataset01.items()
})
dataset_feats['train'][0]

Map (num_proc=32):   0%|          | 0/382520 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/54647 [00:00<?, ? examples/s]

Map (num_proc=32):   0%|          | 0/109292 [00:00<?, ? examples/s]

{'id': '41a1cb62-9edd-4aa2-8083-e26165fcc9c2',
 'text': 'In today’s world, technology has woven itself into the very fabric of our daily lives, and its impact on understanding human emotions is both fascinating and concerning. As someone who has grown up with smartphones and social media, I can’t help but wonder how this tech shapes our ability to recognize genuine feelings, especially in educational settings. For instance, tools that analyze facial expressions or tone of voice can be incredibly useful for teachers trying to gauge student engagement. However, I often feel that relying too heavily on these technologies can lead to misunderstandings. Emotions are complex and nuanced; a smile doesn’t always mean happiness, and a frown doesn’t always signify sadness. By placing too much trust in algorithms, we risk oversimplifying the rich tapestry of human emotion. I believe that while technology can aid our understanding, it shouldn’t replace our instinctual ability to connect with other

In [None]:
# Prepare arrays
def to_xy(ds):
    X = np.column_stack([np.array(ds[f], dtype=np.float32) for f in FEATURES])
    y = np.array(ds["label"], dtype=np.int64)
    return X, y

In [291]:
X_train, y_train = to_xy(dataset_feats["train"])
X_test,  y_test  = to_xy(dataset_feats["test"])

print("Train size:", X_train.shape, " Test size:", X_test.shape)
print("Class balance (train):", {c:int((y_train==c).sum()) for c in [0,1]})
print("Class balance (test):",  {c:int((y_test==c).sum()) for c in [0,1]})

Train size: (382520, 12)  Test size: (109292, 12)
Class balance (train): {0: 104303, 1: 151962}
Class balance (test): {0: 29801, 1: 43418}


## Models

In [306]:
NUM_CLASSES = 3
LABELS = [0, 1, 2]

In [None]:
def _safe_tpr_fpr(cm):
    # We'll compute FPR_i via positives predicted for class i vs true negatives.
    tp = np.diag(cm).astype(float)
    row_sum = cm.sum(axis=1).astype(float)
    col_sum = cm.sum(axis=0).astype(float)
    total = cm.sum().astype(float)

    tpr = np.divide(tp, np.maximum(row_sum, 1), out=np.zeros_like(tp), where=row_sum>0)

    fp = col_sum - tp
    tn = total - row_sum - col_sum + tp
    denom = fp + tn
    fpr = np.divide(fp, np.maximum(denom, 1), out=np.zeros_like(fp), where=denom>0)

    return tpr, fpr

def compute_metrics_multiclass(y_true, y_pred, y_proba):
    """
    Computes macro F1, macro AUROC, per-class TPR/FPR, and macro TPR/FPR averages.
    """
    f1_macro = f1_score(y_true, y_pred, labels=LABELS, average='macro', zero_division=0)

    # AUROC (macro one-vs-rest)
    uniq = np.unique(y_true)
    if len(uniq) >= 2:
        y_true_bin = label_binarize(y_true, classes=LABELS)
        auroc_macro = roc_auc_score(y_true_bin, y_proba, average='macro', multi_class='ovr')
    else:
        auroc_macro = float('nan')

    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    tpr, fpr = _safe_tpr_fpr(cm)

    # Macro-averaged TPR/FPR
    tpr_macro = float(np.mean(tpr))
    fpr_macro = float(np.mean(fpr))

    # Flatten per-class metrics into a dict
    per_class = {}
    for i, lab in enumerate(LABELS):
        per_class[f"TPR_{lab}"] = float(tpr[i])
        per_class[f"FPR_{lab}"] = float(fpr[i])

    out = {
        "F1_macro": float(f1_macro),
        "AUROC_macro": float(auroc_macro),
        "TPR_macro": tpr_macro,
        "FPR_macro": fpr_macro,
    }
    out.update(per_class)
    return out

def evaluate_by_group(y_true, y_pred, y_proba, group_values, min_samples=50):
    """
    Returns a pandas DataFrame with metrics per group.
    - Skips AUROC for groups that contain <2 classes (sets AUROC_macro=nan)
    - Drops groups with < min_samples rows
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_proba = np.asarray(y_proba)
    group_values = np.asarray(group_values)

    rows = []
    for g in np.unique(group_values):
        idx = np.where(group_values == g)[0]
        if idx.size < min_samples:
            continue

        yt = y_true[idx]
        yp = y_pred[idx]
        probs = y_proba[idx]

        metrics = compute_metrics_multiclass(yt, yp, probs)
        # Also include size and per-class counts for context
        counts = {f"count_label_{lab}": int((yt == lab).sum()) for lab in LABELS}
        row = {"group": g, "n": int(idx.size)}
        row.update(counts)
        row.update(metrics)
        rows.append(row)

    if not rows:
        return pd.DataFrame(columns=["group", "n"] + [f"count_label_{l}" for l in LABELS]
                                      + ["F1_macro", "AUROC_macro"]
                                      + [f"TPR_{l}" for l in LABELS] + [f"FPR_{l}" for l in LABELS])
    df = pd.DataFrame(rows).sort_values("n", ascending=False).reset_index(drop=True)
    return df

In [322]:
test_domains = np.array(dataset01["test"]["domain"])
test_agents  = np.array(dataset01["test"]["agent"])

results = {}

In [323]:
# Logistic Regression (multinomial)
logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, multi_class="multinomial"))
])
logreg.fit(X_train, y_train)
logreg_pred  = logreg.predict(X_test)
logreg_proba = logreg.predict_proba(X_test)
results["LogReg_overall"] = compute_metrics_multiclass(y_test, logreg_pred, logreg_proba)
logreg_by_domain = evaluate_by_group(y_test, logreg_pred, logreg_proba, test_domains, min_samples=50)
logreg_by_agent  = evaluate_by_group(y_test, logreg_pred, logreg_proba, test_agents,  min_samples=50)



In [324]:
# XGBoost (softmax)
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=NUM_CLASSES,
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
)
xgb.fit(X_train, y_train)
xgb_proba = xgb.predict_proba(X_test)
xgb_pred  = np.argmax(xgb_proba, axis=1)
results["XGBoost_overall"] = compute_metrics_multiclass(y_test, xgb_pred, xgb_proba)
xgb_by_domain = evaluate_by_group(y_test, xgb_pred, xgb_proba, test_domains, min_samples=50)
xgb_by_agent  = evaluate_by_group(y_test, xgb_pred, xgb_proba, test_agents,  min_samples=50)



In [325]:
# LightGBM (multiclass)
lgbm = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    num_class=NUM_CLASSES,
    random_state=42,
    n_jobs=-1,
)
lgbm.fit(X_train, y_train)
lgbm_proba = lgbm.predict_proba(X_test)
lgbm_pred  = np.argmax(lgbm_proba, axis=1)
results["LightGBM_overall"] = compute_metrics_multiclass(y_test, lgbm_pred, lgbm_proba)
lgbm_by_domain = evaluate_by_group(y_test, lgbm_pred, lgbm_proba, test_domains, min_samples=50)
lgbm_by_agent  = evaluate_by_group(y_test, lgbm_pred, lgbm_proba, test_agents,  min_samples=50)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2857
[LightGBM] [Info] Number of data points in the train set: 382520, number of used features: 12
[LightGBM] [Info] Start training from score -1.299481
[LightGBM] [Info] Start training from score -0.923150
[LightGBM] [Info] Start training from score -1.108477




In [None]:

def df_to_markdown(df, title=None, max_rows=15, floatfmt=".3f"):
    """Render a Pandas DataFrame as a Markdown table."""
    df_ = df.copy()
    # Round floats
    for col in df_.select_dtypes(include=[float]).columns:
        df_[col] = df_[col].map(lambda x: f"{x:{floatfmt}}" if pd.notnull(x) else "")
    md = df_.head(max_rows).to_markdown(index=False)
    if title:
        display(Markdown(f"### {title}\n\n{md}"))
    else:
        display(Markdown(md))

In [327]:
display(Markdown("## Overall Model Results"))
display(Markdown("----"))
overall_df = pd.DataFrame([
    {"Model": model, **metrics} for model, metrics in results.items()
])
df_to_markdown(overall_df, title="Overall Results")

display(Markdown("## Per-Domain Results"))
display(Markdown("----"))
df_to_markdown(logreg_by_domain, title="Logistic Regression — Per Domain")
df_to_markdown(xgb_by_domain,  title="XGBoost — Per Domain")
df_to_markdown(lgbm_by_domain, title="LightGBM — Per Domain")

display(Markdown("## Per-Agent Results"))
display(Markdown("----"))
df_to_markdown(logreg_by_agent, title="Logistic Regression — Per Agent")
df_to_markdown(xgb_by_agent,  title="XGBoost — Per Agent")
df_to_markdown(lgbm_by_agent, title="LightGBM — Per Agent")

## Overall Model Results

----

### Overall Results

| Model            |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| LogReg_overall   |      0.571 |         0.772 |       0.577 |       0.204 |   0.463 |   0.121 |   0.873 |   0.311 |   0.395 |   0.182 |
| XGBoost_overall  |      0.691 |         0.881 |       0.691 |       0.145 |   0.576 |   0.118 |   0.89  |   0.13  |   0.606 |   0.187 |
| LightGBM_overall |      0.706 |         0.892 |       0.705 |       0.138 |   0.601 |   0.118 |   0.895 |   0.118 |   0.62  |   0.177 |

## Per-Domain Results

----

### Logistic Regression — Per Domain

| group                  |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------------|------:|----------------:|----------------:|----------------:|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| student_essays         | 29044 |            8451 |           10736 |            9857 |      0.567 |         0.772 |       0.605 |       0.2   |   0.745 |   0.167 |   0.859 |   0.35  |   0.211 |   0.083 |
| spiegel_articles       | 14164 |            3294 |            5579 |            5291 |      0.418 |         0.682 |       0.448 |       0.265 |   0.085 |   0.039 |   0.771 |   0.412 |   0.488 |   0.343 |
| bundestag              | 13117 |            3726 |            5012 |            4379 |      0.335 |         0.678 |       0.406 |       0.292 |   0.063 |   0.016 |   0.939 |   0.692 |   0.215 |   0.169 |
| cnn_news               | 12362 |            3773 |            5485 |            3104 |      0.663 |         0.858 |       0.664 |       0.142 |   0.668 |   0.147 |   0.913 |   0.121 |   0.412 |   0.157 |
| blog_authorship_corpus | 12270 |            4032 |            4754 |            3484 |      0.457 |         0.654 |       0.479 |       0.251 |   0.418 |   0.21  |   0.829 |   0.413 |   0.189 |   0.131 |
| house_of_commons       | 10001 |            2590 |            4052 |            3359 |      0.644 |         0.848 |       0.647 |       0.158 |   0.39  |   0.085 |   0.951 |   0.143 |   0.6   |   0.247 |
| arxiv_papers           |  7844 |            1628 |            3113 |            3103 |      0.673 |         0.862 |       0.671 |       0.136 |   0.389 |   0.152 |   0.94  |   0.011 |   0.684 |   0.247 |
| euro_court_cases       |  7435 |            1829 |            2811 |            2795 |      0.704 |         0.88  |       0.697 |       0.14  |   0.513 |   0.123 |   0.815 |   0.01  |   0.763 |   0.287 |
| gutenberg              |  3055 |             478 |            1876 |             701 |      0.628 |         0.897 |       0.639 |       0.109 |   0.416 |   0.122 |   0.874 |   0.016 |   0.626 |   0.189 |

### XGBoost — Per Domain

| group                  |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------------|------:|----------------:|----------------:|----------------:|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| student_essays         | 29044 |            8451 |           10736 |            9857 |      0.782 |         0.928 |       0.787 |       0.105 |   0.821 |   0.123 |   0.907 |   0.082 |   0.632 |   0.11  |
| spiegel_articles       | 14164 |            3294 |            5579 |            5291 |      0.552 |         0.778 |       0.55  |       0.214 |   0.33  |   0.112 |   0.731 |   0.182 |   0.59  |   0.35  |
| bundestag              | 13117 |            3726 |            5012 |            4379 |      0.569 |         0.815 |       0.587 |       0.195 |   0.265 |   0.065 |   0.911 |   0.257 |   0.585 |   0.264 |
| cnn_news               | 12362 |            3773 |            5485 |            3104 |      0.675 |         0.876 |       0.682 |       0.133 |   0.767 |   0.175 |   0.917 |   0.111 |   0.361 |   0.114 |
| blog_authorship_corpus | 12270 |            4032 |            4754 |            3484 |      0.568 |         0.799 |       0.576 |       0.202 |   0.531 |   0.185 |   0.857 |   0.284 |   0.341 |   0.138 |
| house_of_commons       | 10001 |            2590 |            4052 |            3359 |      0.726 |         0.899 |       0.724 |       0.124 |   0.561 |   0.101 |   0.922 |   0.064 |   0.689 |   0.206 |
| arxiv_papers           |  7844 |            1628 |            3113 |            3103 |      0.721 |         0.916 |       0.721 |       0.104 |   0.319 |   0.07  |   0.987 |   0.003 |   0.858 |   0.24  |
| euro_court_cases       |  7435 |            1829 |            2811 |            2795 |      0.751 |         0.912 |       0.749 |       0.113 |   0.587 |   0.132 |   0.932 |   0.019 |   0.729 |   0.189 |
| gutenberg              |  3055 |             478 |            1876 |             701 |      0.613 |         0.912 |       0.649 |       0.09  |   0.136 |   0.048 |   0.93  |   0.019 |   0.882 |   0.204 |

### LightGBM — Per Domain

| group                  |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------------|------:|----------------:|----------------:|----------------:|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| student_essays         | 29044 |            8451 |           10736 |            9857 |      0.813 |         0.946 |       0.817 |       0.09  |   0.852 |   0.11  |   0.923 |   0.068 |   0.675 |   0.092 |
| spiegel_articles       | 14164 |            3294 |            5579 |            5291 |      0.564 |         0.788 |       0.562 |       0.209 |   0.365 |   0.121 |   0.75  |   0.18  |   0.571 |   0.327 |
| bundestag              | 13117 |            3726 |            5012 |            4379 |      0.588 |         0.824 |       0.601 |       0.188 |   0.299 |   0.074 |   0.902 |   0.226 |   0.604 |   0.264 |
| cnn_news               | 12362 |            3773 |            5485 |            3104 |      0.681 |         0.879 |       0.688 |       0.13  |   0.779 |   0.178 |   0.915 |   0.103 |   0.369 |   0.11  |
| blog_authorship_corpus | 12270 |            4032 |            4754 |            3484 |      0.578 |         0.806 |       0.585 |       0.198 |   0.54  |   0.188 |   0.849 |   0.261 |   0.366 |   0.145 |
| house_of_commons       | 10001 |            2590 |            4052 |            3359 |      0.731 |         0.903 |       0.729 |       0.122 |   0.573 |   0.101 |   0.917 |   0.06  |   0.697 |   0.204 |
| arxiv_papers           |  7844 |            1628 |            3113 |            3103 |      0.734 |         0.918 |       0.732 |       0.101 |   0.351 |   0.07  |   0.989 |   0.003 |   0.857 |   0.229 |
| euro_court_cases       |  7435 |            1829 |            2811 |            2795 |      0.764 |         0.918 |       0.763 |       0.107 |   0.613 |   0.131 |   0.943 |   0.017 |   0.732 |   0.174 |
| gutenberg              |  3055 |             478 |            1876 |             701 |      0.639 |         0.914 |       0.661 |       0.088 |   0.213 |   0.059 |   0.936 |   0.018 |   0.833 |   0.187 |

## Per-Agent Results

----

### Logistic Regression — Per Agent

| group            |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro | AUROC_macro   |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------|------:|----------------:|----------------:|----------------:|-----------:|:--------------|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| gemma2:9b        | 40289 |               0 |           21645 |           18644 |      0.458 |               |       0.437 |       0.18  |   0     |   0.132 |   0.893 |   0.339 |   0.418 |   0.07  |
| gpt-4o-mini      | 33489 |               0 |           18627 |           14862 |      0.42  |               |       0.407 |       0.22  |   0     |   0.106 |   0.856 |   0.46  |   0.364 |   0.095 |
| human            | 29801 |           29801 |               0 |               0 |      0.211 |               |       0.154 |       0.179 |   0.463 |   0     |   0     |   0.214 |   0     |   0.323 |
| phi3:3.8b        |  2601 |               0 |            1402 |            1199 |      0.42  |               |       0.405 |       0.217 |   0     |   0.118 |   0.837 |   0.401 |   0.378 |   0.133 |
| nemotron         |   920 |               0 |             560 |             360 |      0.395 |               |       0.36  |       0.237 |   0     |   0.177 |   0.657 |   0.311 |   0.422 |   0.223 |
| deepseek-r1:1.5b |   899 |               0 |             500 |             399 |      0.477 |               |       0.442 |       0.159 |   0     |   0.165 |   0.87  |   0.213 |   0.456 |   0.098 |
| deepseek-r1:32b  |   633 |               0 |             326 |             307 |      0.474 |               |       0.46  |       0.171 |   0     |   0.101 |   0.917 |   0.342 |   0.463 |   0.071 |
| o3-mini          |   474 |               0 |             259 |             215 |      0.44  |               |       0.433 |       0.203 |   0     |   0.078 |   0.919 |   0.47  |   0.381 |   0.062 |
| gpt-4-turbo      |   186 |               0 |              99 |              87 |      0.437 |               |       0.428 |       0.213 |   0     |   0.07  |   0.848 |   0.437 |   0.437 |   0.131 |

### XGBoost — Per Agent

| group            |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro | AUROC_macro   |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------|------:|----------------:|----------------:|----------------:|-----------:|:--------------|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| gemma2:9b        | 40289 |               0 |           21645 |           18644 |      0.523 |               |       0.49  |       0.126 |   0     |   0.134 |   0.892 |   0.161 |   0.577 |   0.084 |
| gpt-4o-mini      | 33489 |               0 |           18627 |           14862 |      0.543 |               |       0.516 |       0.112 |   0     |   0.098 |   0.892 |   0.151 |   0.656 |   0.086 |
| human            | 29801 |           29801 |               0 |               0 |      0.244 |               |       0.192 |       0.141 |   0.576 |   0     |   0     |   0.09  |   0     |   0.334 |
| phi3:3.8b        |  2601 |               0 |            1402 |            1199 |      0.476 |               |       0.454 |       0.169 |   0     |   0.115 |   0.884 |   0.299 |   0.479 |   0.093 |
| nemotron         |   920 |               0 |             560 |             360 |      0.476 |               |       0.431 |       0.164 |   0     |   0.179 |   0.78  |   0.214 |   0.514 |   0.1   |
| deepseek-r1:1.5b |   899 |               0 |             500 |             399 |      0.528 |               |       0.484 |       0.116 |   0     |   0.168 |   0.894 |   0.12  |   0.559 |   0.06  |
| deepseek-r1:32b  |   633 |               0 |             326 |             307 |      0.525 |               |       0.503 |       0.13  |   0     |   0.098 |   0.905 |   0.215 |   0.603 |   0.077 |
| o3-mini          |   474 |               0 |             259 |             215 |      0.505 |               |       0.482 |       0.149 |   0     |   0.091 |   0.861 |   0.228 |   0.586 |   0.127 |
| gpt-4-turbo      |   186 |               0 |              99 |              87 |      0.487 |               |       0.463 |       0.163 |   0     |   0.108 |   0.838 |   0.241 |   0.552 |   0.141 |

### LightGBM — Per Agent

| group            |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro | AUROC_macro   |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------|------:|----------------:|----------------:|----------------:|-----------:|:--------------|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| gemma2:9b        | 40289 |               0 |           21645 |           18644 |      0.524 |               |       0.49  |       0.124 |   0     |   0.139 |   0.889 |   0.147 |   0.581 |   0.087 |
| gpt-4o-mini      | 33489 |               0 |           18627 |           14862 |      0.557 |               |       0.53  |       0.1   |   0     |   0.091 |   0.907 |   0.134 |   0.685 |   0.074 |
| human            | 29801 |           29801 |               0 |               0 |      0.25  |               |       0.2   |       0.133 |   0.601 |   0     |   0     |   0.082 |   0     |   0.317 |
| phi3:3.8b        |  2601 |               0 |            1402 |            1199 |      0.481 |               |       0.457 |       0.164 |   0     |   0.123 |   0.889 |   0.287 |   0.482 |   0.081 |
| nemotron         |   920 |               0 |             560 |             360 |      0.499 |               |       0.453 |       0.145 |   0     |   0.172 |   0.798 |   0.175 |   0.561 |   0.089 |
| deepseek-r1:1.5b |   899 |               0 |             500 |             399 |      0.528 |               |       0.485 |       0.116 |   0     |   0.166 |   0.892 |   0.115 |   0.564 |   0.066 |
| deepseek-r1:32b  |   633 |               0 |             326 |             307 |      0.534 |               |       0.51  |       0.122 |   0     |   0.098 |   0.899 |   0.176 |   0.632 |   0.092 |
| o3-mini          |   474 |               0 |             259 |             215 |      0.507 |               |       0.482 |       0.145 |   0     |   0.103 |   0.873 |   0.223 |   0.572 |   0.108 |
| gpt-4-turbo      |   186 |               0 |              99 |              87 |      0.5   |               |       0.477 |       0.152 |   0     |   0.102 |   0.879 |   0.253 |   0.552 |   0.101 |