# About

Simple baselines for GhostWriter dataset (e.g. linear regression).

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
!pip install xgboost lightgbm tabulate



In [13]:
import re, math, string, numpy as np
import pandas as pd
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from tqdm.auto import tqdm
from typing import List, Optional, Tuple
from IPython.display import display, Markdown
from datasets import DatasetDict, load_dataset
from sklearn.pipeline import Pipeline
from luminar.utils import get_best_device
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import label_binarize
from sklearn.utils import check_random_state

rng = check_random_state(42)

In [14]:
# Helpers
_punct_tbl = str.maketrans("", "", "")
punct_set = set(string.punctuation)
url_pat = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
sent_pat = re.compile(r"[.!?]")
nonspace_pat = re.compile(r"\S")
alnum_pat = re.compile(r"[A-Za-z0-9]")

def safe_div(a, b):
    return a / b if b else 0.0

def tokenize_whitespace_strip_punct(text):
    toks = []
    for tok in text.split():
        # strip leading/trailing punctuation
        tok = tok.strip(string.punctuation)
        if tok and alnum_pat.search(tok):
            toks.append(tok)
    return toks

def extract_metrics_batch(batch):
    texts = batch["text"]
    out = {
        "n_chars": [],
        "n_chars_nospace": [],
        "n_words": [],
        "avg_word_len": [],
        "n_sents": [],
        "n_punct": [],
        "punct_ratio": [],
        "upper_ratio": [],
        "digit_ratio": [],
        "url_count": [],
        "type_token_ratio": [],
        "hapax_ratio": [],
    }
    for t in texts:
        t = t if isinstance(t, str) else ""
        n_chars = len(t)
        n_chars_nospace = len(re.findall(r"\S", t))
        n_punct = sum(1 for ch in t if (not ch.isalnum()) and (not ch.isspace()))
        n_upper = sum(1 for ch in t if ch.isupper())
        n_digit = sum(1 for ch in t if ch.isdigit())
        url_count = len(url_pat.findall(t))

        # sentence count (min 1 for non-empty text)
        sent_splits = [s for s in sent_pat.split(t) if s.strip()]
        n_sents = max(1, len(sent_splits)) if t.strip() else 0

        toks = tokenize_whitespace_strip_punct(t)
        n_words = len(toks)
        avg_word_len = safe_div(sum(len(w) for w in toks), n_words)

        # lexical diversity
        toks_lower = [w.lower() for w in toks]
        vocab = set(toks_lower)
        type_token_ratio = safe_div(len(vocab), n_words)

        # hapax legomena ratio
        from collections import Counter
        cnt = Counter(toks_lower)
        hapax_ratio = safe_div(sum(1 for w, c in cnt.items() if c == 1), n_words)

        punct_ratio = safe_div(n_punct, n_chars_nospace)
        upper_ratio = safe_div(n_upper, n_chars_nospace)
        digit_ratio = safe_div(n_digit, n_chars_nospace)

        out["n_chars"].append(n_chars)
        out["n_chars_nospace"].append(n_chars_nospace)
        out["n_words"].append(n_words)
        out["avg_word_len"].append(avg_word_len)
        out["n_sents"].append(n_sents)
        out["n_punct"].append(n_punct)
        out["punct_ratio"].append(punct_ratio)
        out["upper_ratio"].append(upper_ratio)
        out["digit_ratio"].append(digit_ratio)
        out["url_count"].append(url_count)
        out["type_token_ratio"].append(type_token_ratio)
        out["hapax_ratio"].append(hapax_ratio)

    return out

In [15]:
FEATURES = [
    "n_chars","n_chars_nospace","n_words","avg_word_len","n_sents","n_punct",
    "punct_ratio","upper_ratio","digit_ratio","url_count","type_token_ratio","hapax_ratio",
]

## Dataset & Feature Extraction

In [16]:
dataset = load_dataset("TheItCrOw/GhostWriter")

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 382535
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 54648
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type'],
        num_rows: 109296
    })
})

In [18]:
# Remove empty texts
dataset01 = DatasetDict({
    split: ds.filter(lambda ex: isinstance(ex["text"], str) and ex["text"].strip() != "")
    for split, ds in dataset.items()
})

def clean(batch):
    cleaned = []
    for t in batch["text"]:
        # Collapse multiple spaces and trim
        t = re.sub(r"\s+", " ", t).strip()
        cleaned.append(t)
    return {"text": cleaned}

dataset01 = DatasetDict({
    split: ds.map(clean, batched=True, num_proc=32)
    for split, ds in dataset01.items()
})


In [19]:
# Feature extraction 
dataset_feats = DatasetDict({
    split: ds.map(extract_metrics_batch, batched=True, num_proc=32)
    for split, ds in dataset01.items()
})
dataset_feats['train'][0]

{'id': '41a1cb62-9edd-4aa2-8083-e26165fcc9c2',
 'text': 'In today’s world, technology has woven itself into the very fabric of our daily lives, and its impact on understanding human emotions is both fascinating and concerning. As someone who has grown up with smartphones and social media, I can’t help but wonder how this tech shapes our ability to recognize genuine feelings, especially in educational settings. For instance, tools that analyze facial expressions or tone of voice can be incredibly useful for teachers trying to gauge student engagement. However, I often feel that relying too heavily on these technologies can lead to misunderstandings. Emotions are complex and nuanced; a smile doesn’t always mean happiness, and a frown doesn’t always signify sadness. By placing too much trust in algorithms, we risk oversimplifying the rich tapestry of human emotion. I believe that while technology can aid our understanding, it shouldn’t replace our instinctual ability to connect with other

In [20]:
# Prepare arrays
def to_xy(ds):
    X = np.column_stack([np.array(ds[f], dtype=np.float32) for f in FEATURES])
    y = np.array(ds["label"], dtype=np.int64)
    return X, y

In [21]:
X_train, y_train = to_xy(dataset_feats["train"])
X_test,  y_test  = to_xy(dataset_feats["test"])
train_texts = dataset01["train"]["text"]
test_texts  = dataset01["test"]["text"]

print("Train size:", X_train.shape, " Test size:", X_test.shape)
print("Class balance (train):", {c:int((y_train==c).sum()) for c in [0, 1, 2]})
print("Class balance (test):",  {c:int((y_test==c).sum()) for c in [0, 1, 2]})

Train size: (382520, 12)  Test size: (109292, 12)
Class balance (train): {0: 104303, 1: 151962, 2: 126255}
Class balance (test): {0: 29801, 1: 43418, 2: 36073}


## Models

In [22]:
NUM_CLASSES = 3
LABELS = [0, 1, 2]

In [23]:
def _safe_tpr_fpr(cm):
    # We'll compute FPR_i via positives predicted for class i vs true negatives.
    tp = np.diag(cm).astype(float)
    row_sum = cm.sum(axis=1).astype(float)
    col_sum = cm.sum(axis=0).astype(float)
    total = cm.sum().astype(float)

    tpr = np.divide(tp, np.maximum(row_sum, 1), out=np.zeros_like(tp), where=row_sum>0)

    fp = col_sum - tp
    tn = total - row_sum - col_sum + tp
    denom = fp + tn
    fpr = np.divide(fp, np.maximum(denom, 1), out=np.zeros_like(fp), where=denom>0)

    return tpr, fpr

def compute_metrics_multiclass(y_true, y_pred, y_proba):
    """
    Computes macro F1, macro AUROC, per-class TPR/FPR, and macro TPR/FPR averages.
    """
    f1_macro = f1_score(y_true, y_pred, labels=LABELS, average='macro', zero_division=0)

    # AUROC (macro one-vs-rest)
    uniq = np.unique(y_true)
    if len(uniq) >= 2:
        y_true_bin = label_binarize(y_true, classes=LABELS)
        auroc_macro = roc_auc_score(y_true_bin, y_proba, average='macro', multi_class='ovr')
    else:
        auroc_macro = float('nan')

    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    tpr, fpr = _safe_tpr_fpr(cm)

    # Macro-averaged TPR/FPR
    tpr_macro = float(np.mean(tpr))
    fpr_macro = float(np.mean(fpr))

    # Flatten per-class metrics into a dict
    per_class = {}
    for i, lab in enumerate(LABELS):
        per_class[f"TPR_{lab}"] = float(tpr[i])
        per_class[f"FPR_{lab}"] = float(fpr[i])

    out = {
        "F1_macro": float(f1_macro),
        "AUROC_macro": float(auroc_macro),
        "TPR_macro": tpr_macro,
        "FPR_macro": fpr_macro,
    }
    out.update(per_class)
    return out

def evaluate_by_group(y_true, y_pred, y_proba, group_values, min_samples=50):
    """
    Returns a pandas DataFrame with metrics per group.
    - Skips AUROC for groups that contain <2 classes (sets AUROC_macro=nan)
    - Drops groups with < min_samples rows
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    y_proba = np.asarray(y_proba)
    group_values = np.asarray(group_values)

    rows = []
    for g in np.unique(group_values):
        idx = np.where(group_values == g)[0]
        if idx.size < min_samples:
            continue

        yt = y_true[idx]
        yp = y_pred[idx]
        probs = y_proba[idx]

        metrics = compute_metrics_multiclass(yt, yp, probs)
        # Also include size and per-class counts for context
        counts = {f"count_label_{lab}": int((yt == lab).sum()) for lab in LABELS}
        row = {"group": g, "n": int(idx.size)}
        row.update(counts)
        row.update(metrics)
        rows.append(row)

    if not rows:
        return pd.DataFrame(columns=["group", "n"] + [f"count_label_{l}" for l in LABELS]
                                      + ["F1_macro", "AUROC_macro"]
                                      + [f"TPR_{l}" for l in LABELS] + [f"FPR_{l}" for l in LABELS])
    df = pd.DataFrame(rows).sort_values("n", ascending=False).reset_index(drop=True)
    return df

In [24]:
test_domains = np.array(dataset01["test"]["domain"])
test_agents  = np.array(dataset01["test"]["agent"])

results = {}

In [15]:
# Logistic Regression (multinomial)
logreg = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=1000, multi_class="multinomial"))
])
logreg.fit(X_train, y_train)
logreg_pred  = logreg.predict(X_test)
logreg_proba = logreg.predict_proba(X_test)
results["LogReg_overall"] = compute_metrics_multiclass(y_test, logreg_pred, logreg_proba)
logreg_by_domain = evaluate_by_group(y_test, logreg_pred, logreg_proba, test_domains, min_samples=50)
logreg_by_agent  = evaluate_by_group(y_test, logreg_pred, logreg_proba, test_agents,  min_samples=50)



In [19]:
# XGBoost (softmax)
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multi:softprob",
    num_class=NUM_CLASSES,
    eval_metric="mlogloss",
    n_jobs=-1,
    random_state=42,
)
xgb.fit(X_train, y_train)
xgb_proba = xgb.predict_proba(X_test)
xgb_pred  = np.argmax(xgb_proba, axis=1)
results["XGBoost_overall"] = compute_metrics_multiclass(y_test, xgb_pred, xgb_proba)
xgb_by_domain = evaluate_by_group(y_test, xgb_pred, xgb_proba, test_domains, min_samples=50)
xgb_by_agent  = evaluate_by_group(y_test, xgb_pred, xgb_proba, test_agents,  min_samples=50)



In [20]:
# LightGBM (multiclass)
lgbm = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="multiclass",
    num_class=NUM_CLASSES,
    random_state=42,
    n_jobs=-1,
)
lgbm.fit(X_train, y_train)
lgbm_proba = lgbm.predict_proba(X_test)
lgbm_pred  = np.argmax(lgbm_proba, axis=1)
results["LightGBM_overall"] = compute_metrics_multiclass(y_test, lgbm_pred, lgbm_proba)
lgbm_by_domain = evaluate_by_group(y_test, lgbm_pred, lgbm_proba, test_domains, min_samples=50)
lgbm_by_agent  = evaluate_by_group(y_test, lgbm_pred, lgbm_proba, test_agents,  min_samples=50)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028591 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2857
[LightGBM] [Info] Number of data points in the train set: 382520, number of used features: 12
[LightGBM] [Info] Start training from score -1.299481
[LightGBM] [Info] Start training from score -0.923150
[LightGBM] [Info] Start training from score -1.108477


KeyboardInterrupt: 

In [25]:
train_texts_list = []
test_texts_list = []
for t in train_texts:
    train_texts_list.append(t)
for t in test_texts:
    test_texts_list.append(t)

train_texts_list[0]
test_texts_list[0]

'With permission, Mr. Speaker, I should like to make a personal statement. During the heat of debate, strong feelings are expressed on both sides of the House. I hope that, in my time here, I have always shown proper respect for the occupant of the Chair and observed his or her rulings. As you will be aware, Mr. Speaker, I was not asked by the Deputy Speaker in Westminster Hall yesterday to withdraw my remarks when they were made. However, on reflection, I accept that it would have been better if I had not used the phrase that I applied to the hon. Member for Glasgow, Kelvin , and I am sorry for the offence that was caused. With permission, Mr. Speaker, I should like to make a personal statement. In the debate in Westminster Hall yesterday, exchanges became frank to the point of being unacceptable, and I should like to apologise to the Deputy Speaker in Westminster Hall, my hon. Friend the Member for Blaydon , to you, Mr. Speaker, and to the House for my part in that. The issues under 

Following taken from a kaggle challenge of mine.

In [26]:
# Helpers unchanged
class _TokDataset(Dataset):
    """Pre-tokenized dataset (unsupervised) with optional progress for big inputs."""
    def __init__(
        self,
        texts: List[str],
        tokenizer,
        max_len: int,
        show_progress: bool = False,
        tokenize_batch_size: Optional[int] = None,
    ):
        if tokenize_batch_size and len(texts) > tokenize_batch_size:
            pieces = []
            rng = range(0, len(texts), tokenize_batch_size)
            iterator = tqdm(rng, desc="Tokenizing", unit="chunk", total=len(rng)) if show_progress else rng
            for start in iterator:
                chunk = texts[start:start + tokenize_batch_size]
                enc = tokenizer(
                    chunk,
                    padding=True,
                    truncation=True,
                    max_length=max_len,
                    return_tensors="pt",
                )
                pieces.append(enc)
            keys = pieces[0].keys()
            self.enc = {k: torch.cat([p[k] for p in pieces], dim=0) for k in keys}
        else:
            self.enc = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=max_len,
                return_tensors="pt",
            )

    def __len__(self) -> int:
        return self.enc["input_ids"].size(0)

    def __getitem__(self, idx: int):
        return {k: v[idx] for k, v in self.enc.items()}


class _TokClsDataset(_TokDataset):
    """Pre-tokenized dataset (supervised)."""
    def __init__(self, texts: List[str], labels: np.ndarray, tokenizer, max_len: int,
                 show_progress: bool = False, tokenize_batch_size: Optional[int] = None):
        super().__init__(texts, tokenizer, max_len, show_progress, tokenize_batch_size)
        self.labels = torch.as_tensor(labels, dtype=torch.long)

    def __getitem__(self, idx: int):
        item = super().__getitem__(idx)
        item["labels"] = self.labels[idx]
        return item


# Finetune head
class _ClsHead(nn.Module):
    def __init__(self, hidden_size: int, num_labels: int, dropout: float = 0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(hidden_size, num_labels)

    def forward(self, pooled):
        return self.out(self.dropout(pooled))


class BERTClassifier:
    """
    Dual-mode classifier:
      - Frozen mode (default): Transformer as feature extractor -> sklearn LogisticRegression.
      - Finetune mode: End-to-end training of transformer + torch classification head.

    Public API:
      - fit(texts, labels)
      - predict_proba(texts)
      - predict(texts)
      - predict_proba_and_pred(texts)
    """

    def __init__(
        self,
        model_name: str = "roberta-base",
        pooling: str = "mean",                # 'mean' or 'cls'
        max_len: int = 512,
        batch_size: int = 128,
        use_scaler: bool = True,              # used in frozen mode only

        # LogisticRegression params (frozen mode)
        lr_max_iter: int = 1000,
        lr_C: float = 1.0,
        lr_n_jobs: int = -1,
        lr_solver: str = "lbfgs",

        random_state: int = 42,

        # System / perf
        device: Optional[str] = None,
        num_workers: int = 4,
        use_fp16: bool = True,                # IGNORED (kept for API compatibility)
        allow_tf32: bool = True,

        # Progress
        show_progress: bool = True,
        tokenize_batch_size: Optional[int] = 4096,

        # --- Finetune controls ---
        finetune: bool = False,               # True -> train transformer end-to-end
        ft_epochs: int = 3,
        ft_lr: float = 2e-5,
        ft_weight_decay: float = 0.01,
        ft_warmup_ratio: float = 0.06,
        ft_max_grad_norm: float = 1.0,
        ft_gradient_accumulation: int = 1,
        ft_dropout: float = 0.1,
    ):
        # config
        self.model_name = model_name
        self.pooling = pooling.lower()
        assert self.pooling in ("mean", "cls"), "pooling must be 'mean' or 'cls'"
        self.max_len = int(max_len)
        self.batch_size = int(batch_size)
        self.use_scaler = use_scaler
        self.random_state = random_state
        self.num_workers = int(num_workers)
        self.use_fp16 = False                 # hard-disable AMP/FP16
        self.allow_tf32 = bool(allow_tf32)
        self.show_progress = bool(show_progress)
        self.tokenize_batch_size = tokenize_batch_size

        self.finetune = bool(finetune)
        self.ft_epochs = int(ft_epochs)
        self.ft_lr = float(ft_lr)
        self.ft_weight_decay = float(ft_weight_decay)
        self.ft_warmup_ratio = float(ft_warmup_ratio)
        self.ft_max_grad_norm = float(ft_max_grad_norm)
        self.ft_gradient_accumulation = int(ft_gradient_accumulation)
        self.ft_dropout = float(ft_dropout)

        self.device = str(get_best_device())

        # tokenizer & backbone
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
        self.backbone = AutoModel.from_pretrained(model_name).to(self.device)
        self.backbone.eval()

        if self.device.startswith("cuda") and self.allow_tf32:
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

        # sklearn pipeline for frozen mode
        self.clf: Optional[Pipeline] = None
        if not self.finetune:
            steps = []
            if self.use_scaler:
                steps.append(("scaler", StandardScaler(with_mean=True, with_std=True)))
            steps.append(("lr", LogisticRegression(
                max_iter=lr_max_iter,
                multi_class="multinomial",
                n_jobs=lr_n_jobs,
                C=lr_C,
                solver=lr_solver,
                random_state=random_state
            )))
            self.clf = Pipeline(steps)

        # torch head for finetune mode (created lazily in fit once num_labels known)
        self.num_labels: Optional[int] = None
        self.ft_head: Optional[_ClsHead] = None
        self.criterion = nn.CrossEntropyLoss()

    def _pool(self, last_hidden_state, attention_mask):
        if self.pooling == "mean":
            mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
            summed = (last_hidden_state * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            return summed / counts
        else:
            return last_hidden_state[:, 0, :]

    # Frozen path: embed -> sklearn (FP32 only)
    @torch.inference_mode()
    def _embed_loader(self, loader: DataLoader) -> np.ndarray:
        outs = []
        iterator = tqdm(loader, total=len(loader), desc="Embedding", unit="batch") if self.show_progress else loader
        for batch in iterator:
            batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
            out = self.backbone(**batch).last_hidden_state
            pooled = self._pool(out, batch["attention_mask"])
            outs.append(pooled.detach().cpu())
        return torch.cat(outs, dim=0).to(dtype=torch.float32).numpy()

    @torch.inference_mode()
    def embed(self, texts: List[str]) -> np.ndarray:
        if len(texts) == 0:
            h = getattr(self.backbone.config, "hidden_size", None) or 768
            return np.zeros((0, h), dtype=np.float32)

        ds = _TokDataset(
            texts, self.tokenizer, self.max_len,
            show_progress=self.show_progress,
            tokenize_batch_size=self.tokenize_batch_size,
        )
        loader = DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.device.startswith("cuda"),
            persistent_workers=(self.num_workers > 0),
        )
        return self._embed_loader(loader)

    def fit(self, texts: List[str], labels: np.ndarray):
        labels = np.asarray(labels)

        if not self.finetune:
            # Frozen mode: embed then sklearn
            X = self.embed(texts)
            assert self.clf is not None
            self.clf.fit(X, labels)
            return self

        unique_labels = np.unique(labels)
        self.num_labels = int(unique_labels.size)

        ds = _TokClsDataset(
            texts, labels,
            tokenizer=self.tokenizer,
            max_len=self.max_len,
            show_progress=self.show_progress,
            tokenize_batch_size=self.tokenize_batch_size,
        )
        loader = DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.device.startswith("cuda"),
            persistent_workers=(self.num_workers > 0),
        )

        hidden_size = getattr(self.backbone.config, "hidden_size", 768)
        self.ft_head = _ClsHead(hidden_size, self.num_labels, dropout=self.ft_dropout).to(self.device)

        # Trainables: backbone + head
        self.backbone.train()
        self.ft_head.train()

        # Ensure FP32 params
        self.backbone.float()
        self.ft_head.float()

        # Optimizer & scheduler
        optim = torch.optim.AdamW(
            list(self.backbone.parameters()) + list(self.ft_head.parameters()),
            lr=self.ft_lr,
            weight_decay=self.ft_weight_decay,
        )

        steps_per_epoch = math.ceil(len(ds) / self.batch_size)
        total_steps = (steps_per_epoch * self.ft_epochs) // max(1, self.ft_gradient_accumulation)
        warmup_steps = int(self.ft_warmup_ratio * total_steps)
        scheduler = get_linear_schedule_with_warmup(optim, warmup_steps, total_steps)

        grad_accum = max(1, self.ft_gradient_accumulation)

        for epoch in range(self.ft_epochs):
            it = tqdm(loader, desc=f"Finetuning (epoch {epoch+1}/{self.ft_epochs})", unit="batch") \
                if self.show_progress else loader

            optim.zero_grad(set_to_none=True)

            for step, batch in enumerate(it, start=1):
                labels_t = batch.pop("labels").to(self.device, non_blocking=True)
                batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}

                out = self.backbone(**batch)
                pooled = self._pool(out.last_hidden_state, batch["attention_mask"])
                logits = self.ft_head(pooled)
                loss = self.criterion(logits, labels_t)
                loss = loss / grad_accum  # normalize for accumulation

                loss.backward()

                # Step optimizer/scheduler every grad_accum steps or at the end
                if (step % grad_accum == 0) or (step == len(loader)):
                    # Optional gradient clipping
                    if self.ft_max_grad_norm and self.ft_max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(
                            list(self.backbone.parameters()) + list(self.ft_head.parameters()),
                            self.ft_max_grad_norm
                        )
                    optim.step()
                    scheduler.step()
                    optim.zero_grad(set_to_none=True)

                if self.show_progress and hasattr(it, "set_postfix"):
                    it.set_postfix(loss=float(loss.detach().cpu()) * grad_accum)

        # back to eval for inference
        self.backbone.eval()
        self.ft_head.eval()
        return self

    @torch.inference_mode()
    def _infer_logits(self, texts: List[str]) -> torch.Tensor:
        assert self.finetune and self.ft_head is not None, "Only for finetune mode."
        if len(texts) == 0:
            return torch.zeros((0, self.num_labels), dtype=torch.float32)

        ds = _TokDataset(
            texts, self.tokenizer, self.max_len,
            show_progress=self.show_progress,
            tokenize_batch_size=self.tokenize_batch_size,
        )
        loader = DataLoader(
            ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.device.startswith("cuda"),
            persistent_workers=(self.num_workers > 0),
        )

        iterator = tqdm(loader, total=len(loader), desc="Infer (finetune)", unit="batch") \
                   if self.show_progress else loader

        all_logits = []
        for batch in iterator:
            batch = {k: v.to(self.device, non_blocking=True) for k, v in batch.items()}
            last_hid = self.backbone(**batch).last_hidden_state
            pooled = self._pool(last_hid, batch["attention_mask"])
            logits = self.ft_head(pooled)
            all_logits.append(logits.detach().cpu())
        return torch.cat(all_logits, dim=0).to(dtype=torch.float32)

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        if not self.finetune:
            X = self.embed(texts)
            assert self.clf is not None
            return self.clf.predict_proba(X)
        logits = self._infer_logits(texts)
        proba = torch.softmax(logits, dim=-1).numpy()
        return proba

    def predict(self, texts: List[str]) -> np.ndarray:
        return self.predict_proba(texts).argmax(axis=1)

    def predict_proba_and_pred(self, texts: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        proba = self.predict_proba(texts)
        pred = proba.argmax(axis=1)
        return proba, pred


In [27]:
# Finetune
clf = BERTClassifier(
    model_name="xlm-roberta-base",
    pooling="mean",
    finetune=True,
    ft_epochs=5,
    ft_lr=2e-5,
    batch_size=128,
    show_progress=True
)
clf.fit(train_texts, y_train)
proba, pred = clf.predict_proba_and_pred(test_texts)
results["RoBERTa+Finetune_overall"] = compute_metrics_multiclass(y_test, pred, proba)
ro_ft_by_domain = evaluate_by_group(y_test, pred, proba, test_domains, min_samples=50)
ro_ft_by_agent  = evaluate_by_group(y_test, pred, proba, test_agents,  min_samples=50)

Tokenizing:   0%|          | 0/94 [00:00<?, ?chunk/s]

Finetuning (epoch 1/5):   0%|          | 0/2989 [00:08<?, ?batch/s]

Finetuning (epoch 2/5):   0%|          | 0/2989 [00:00<?, ?batch/s]

Finetuning (epoch 3/5):   0%|          | 0/2989 [00:00<?, ?batch/s]

Finetuning (epoch 4/5):   0%|          | 0/2989 [00:00<?, ?batch/s]

Finetuning (epoch 5/5):   0%|          | 0/2989 [00:00<?, ?batch/s]

Tokenizing:   0%|          | 0/27 [00:00<?, ?chunk/s]

Infer (finetune):   0%|          | 0/854 [00:07<?, ?batch/s]



In [19]:
# Frozen
ro_baseline = BERTClassifier(
    model_name="xlm-roberta-base",
    pooling="mean",
    max_len=512,
    batch_size=512,
    finetune=False
)
ro_baseline.fit(train_texts_list, y_train)
ro_proba, ro_pred = ro_baseline.predict_proba_and_pred(test_texts_list)
results["RoBERTa+LR_overall"] = compute_metrics_multiclass(y_test, ro_pred, ro_proba)
ro_by_domain = evaluate_by_group(y_test, ro_pred, ro_proba, test_domains, min_samples=50)
ro_by_agent  = evaluate_by_group(y_test, ro_pred, ro_proba, test_agents,  min_samples=50)

Tokenizing:   0%|          | 0/94 [00:00<?, ?texts/s]

Embedding:   0%|          | 0/748 [00:05<?, ?batch/s]

  with torch.cuda.amp.autocast(enabled=amp_enabled):


Tokenizing:   0%|          | 0/27 [00:00<?, ?texts/s]

Embedding:   0%|          | 0/214 [00:05<?, ?batch/s]

  with torch.cuda.amp.autocast(enabled=amp_enabled):


In [28]:

def df_to_markdown(df, title=None, max_rows=15, floatfmt=".3f"):
    df_ = df.copy()
    for col in df_.select_dtypes(include=[float]).columns:
        df_[col] = df_[col].map(lambda x: f"{x:{floatfmt}}" if pd.notnull(x) else "")
    md = df_.head(max_rows).to_markdown(index=False)
    if title:
        display(Markdown(f"### {title}\n\n{md}"))
    else:
        display(Markdown(md))

In [29]:
display(Markdown("## Overall Model Results"))
display(Markdown("---"))

overall_df = pd.DataFrame([
    {"Model": model, **metrics} for model, metrics in results.items()
])
df_to_markdown(overall_df.sort_values("F1_macro", ascending=False).reset_index(drop=True),
               title="Overall Results (sorted by F1_macro)")

display(Markdown("## Per-Domain Results"))
display(Markdown("---"))
df_to_markdown(logreg_by_domain,  title="Logistic Regression — Per Domain")
df_to_markdown(xgb_by_domain,     title="XGBoost — Per Domain")
df_to_markdown(lgbm_by_domain,    title="LightGBM — Per Domain")
df_to_markdown(ro_by_domain,      title="RoBERTa+LR — Per Domain")
df_to_markdown(ro_ft_by_domain,      title="RoBERTa+Finetune — Per Domain")

display(Markdown("## Per-Agent Results"))
display(Markdown("---"))
df_to_markdown(logreg_by_agent,  title="Logistic Regression — Per Agent")
df_to_markdown(xgb_by_agent,     title="XGBoost — Per Agent")
df_to_markdown(lgbm_by_agent,    title="LightGBM — Per Agent")
df_to_markdown(ro_by_agent,      title="RoBERTa+LR — Per Agent")
df_to_markdown(ro_ft_by_agent,      title="RoBERTa+Finetune — Per Agent")

## Overall Model Results

---

### Overall Results (sorted by F1_macro)

| Model                    |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-------------------------|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| RoBERTa+Finetune_overall |      0.892 |         0.983 |       0.891 |       0.047 |   0.797 |   0.053 |   0.997 |   0.006 |   0.879 |   0.081 |

## Per-Domain Results

---

### RoBERTa+Finetune — Per Domain

| group                  |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro |   AUROC_macro |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------------|------:|----------------:|----------------:|----------------:|-----------:|--------------:|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| student_essays         | 29044 |            8451 |           10736 |            9857 |      0.965 |         0.997 |       0.965 |       0.016 |   0.945 |   0.023 |   0.999 |   0.001 |   0.951 |   0.024 |
| spiegel_articles       | 14164 |            3294 |            5579 |            5291 |      0.861 |         0.977 |       0.858 |       0.059 |   0.714 |   0.06  |   1     |   0.014 |   0.861 |   0.103 |
| bundestag              | 13117 |            3726 |            5012 |            4379 |      0.853 |         0.959 |       0.857 |       0.065 |   0.848 |   0.127 |   0.997 |   0.004 |   0.725 |   0.064 |
| cnn_news               | 12362 |            3773 |            5485 |            3104 |      0.956 |         0.996 |       0.956 |       0.017 |   0.937 |   0.024 |   1     |   0.001 |   0.932 |   0.025 |
| blog_authorship_corpus | 12270 |            4032 |            4754 |            3484 |      0.892 |         0.983 |       0.894 |       0.049 |   0.813 |   0.048 |   0.985 |   0.022 |   0.883 |   0.077 |
| house_of_commons       | 10001 |            2590 |            4052 |            3359 |      0.834 |         0.963 |       0.833 |       0.069 |   0.678 |   0.081 |   1     |   0.003 |   0.82  |   0.124 |
| arxiv_papers           |  7844 |            1628 |            3113 |            3103 |      0.596 |         0.869 |       0.663 |       0.117 |   0.003 |   0.003 |   0.997 |   0.003 |   0.989 |   0.344 |
| euro_court_cases       |  7435 |            1829 |            2811 |            2795 |      0.81  |         0.951 |       0.811 |       0.083 |   0.669 |   0.117 |   1     |   0     |   0.765 |   0.13  |
| gutenberg              |  3055 |             478 |            1876 |             701 |      0.959 |         0.996 |       0.958 |       0.013 |   0.941 |   0.013 |   0.996 |   0.014 |   0.936 |   0.013 |

## Per-Agent Results

---

### RoBERTa+Finetune — Per Agent

| group            |     n |   count_label_0 |   count_label_1 |   count_label_2 |   F1_macro | AUROC_macro   |   TPR_macro |   FPR_macro |   TPR_0 |   FPR_0 |   TPR_1 |   FPR_1 |   TPR_2 |   FPR_2 |
|:-----------------|------:|----------------:|----------------:|----------------:|-----------:|:--------------|------------:|------------:|--------:|--------:|--------:|--------:|--------:|--------:|
| gemma2:9b        | 40289 |               0 |           21645 |           18644 |      0.639 |               |       0.618 |       0.024 |   0     |   0.063 |   0.996 |   0.006 |   0.859 |   0.002 |
| gpt-4o-mini      | 33489 |               0 |           18627 |           14862 |      0.649 |               |       0.635 |       0.015 |   0     |   0.041 |   1     |   0.003 |   0.904 |   0     |
| human            | 29801 |           29801 |               0 |               0 |      0.296 |               |       0.266 |       0.068 |   0.797 |   0     |   0     |   0.006 |   0     |   0.197 |
| phi3:3.8b        |  2601 |               0 |            1402 |            1199 |      0.64  |               |       0.623 |       0.024 |   0     |   0.05  |   0.991 |   0.019 |   0.878 |   0.004 |
| nemotron         |   920 |               0 |             560 |             360 |      0.654 |               |       0.642 |       0.01  |   0     |   0.028 |   0.996 |   0     |   0.931 |   0.002 |
| deepseek-r1:1.5b |   899 |               0 |             500 |             399 |      0.637 |               |       0.62  |       0.026 |   0     |   0.049 |   0.99  |   0.023 |   0.87  |   0.008 |
| deepseek-r1:32b  |   633 |               0 |             326 |             307 |      0.641 |               |       0.623 |       0.023 |   0     |   0.057 |   0.997 |   0.01  |   0.873 |   0.003 |
| o3-mini          |   474 |               0 |             259 |             215 |      0.608 |               |       0.57  |       0.047 |   0     |   0.127 |   0.992 |   0.009 |   0.716 |   0.004 |
| gpt-4-turbo      |   186 |               0 |              99 |              87 |      0.653 |               |       0.64  |       0.013 |   0     |   0.038 |   1     |   0     |   0.92  |   0     |