# Luminar

In [None]:
import warnings

warnings.filterwarnings("ignore", message=r".*Please note that with a fast tokenizer.*")
warnings.filterwarnings(
    "ignore",
    message=r".*Using the `WANDB_DISABLED` environment variable is deprecated.*",
)
warnings.filterwarnings(
    "ignore",
    message=r".*Was asked to gather along dimension \d+, but all input tensors were scalars.*",
)

In [None]:
import gc
import json
from pathlib import Path
from typing import TypedDict

import evaluate
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from numpy.typing import NDArray
from tqdm.auto import tqdm
from transformers.trainer_utils import EvalPrediction

## Baselines: Neural Network Models

### Setup

In [None]:
from abc import ABC, abstractmethod

import torch
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.tokenization_utils_base import BatchEncoding

from luminar.utils import compute_metrics


class PredictionResults(TypedDict):
    prediction: list[float]


class DetectorABC(ABC):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
        device: str | torch.device = ("cuda" if torch.cuda.is_available() else "cpu"),
    ) -> None:
        super().__init__()
        self.device = torch.device(device)
        self.tokenizer = tokenizer

    @abstractmethod
    def tokenize(self, texts: list[str]) -> BatchEncoding: ...

    @abstractmethod
    def process(self, inputs: dict) -> PredictionResults: ...


def run_detector(
    detector,
    datasets: dict[str, DatasetDict],
    batch_size=32,
    threshold: float = 0.5,
    sigmoid: bool = True,
    greater: bool = True,
):
    scores = {}
    for config_name, dataset in tqdm(datasets.items(), desc="Predicting on Datasets"):
        dataset: Dataset = dataset["test"].map(
            detector.tokenize,
            input_columns=["text"],
            batched=True,
            batch_size=1024,
            desc="Tokenizing",
        )
        dataset = dataset.sort("length")

        labels = []
        predictions = []
        for batch in tqdm(
            dataset.batch(batch_size), desc=f"Processing {config_name}", position=1
        ):
            labels.extend(batch["labels"])
            predictions.extend(detector.process(batch)["prediction"])

        scores[config_name] = compute_metrics(
            (np.array(predictions), np.array(labels)),
            threshold=threshold,
            sigmoid=sigmoid,
            greater=greater,
        )
    return scores


def run_detector_tokenized(
    detector,
    datasets: dict[str, DatasetDict],
    batch_size=32,
    threshold: float = 0.5,
    sigmoid: bool = True,
    greater: bool = True,
):
    scores = {}
    for config_name, dataset in tqdm(datasets.items(), desc="Predicting on Datasets"):
        labels = []
        predictions = []
        for batch in tqdm(
            dataset["test"].batch(batch_size),
            desc=f"Processing {config_name}",
            position=1,
        ):
            labels.extend(batch["labels"])
            predictions.extend(detector.process(batch)["prediction"])

        scores[config_name] = compute_metrics(
            (np.array(predictions), np.array(labels)),
            threshold=threshold,
            sigmoid=sigmoid,
            greater=greater,
        )
    return scores


## Data

In [4]:
from pathlib import Path

from luminar.utils import get_matched_datasets

HF_TOKEN = (Path.home() / ".hf_token").read_text().strip()

agent = "gpt_4o_mini"
other_agents = "gemma2_9b"
datasets = {}
num_proc = 32
for domain in tqdm(
    [
        "blog_authorship_corpus",
        "student_essays",
        "cnn_news",
        "euro_court_cases",
        "house_of_commons",
        "arxiv_papers",
        "gutenberg_en",
        "bundestag",
        "spiegel_articles",
        # "gutenberg_de",
        "en",
        "de",
    ]
):
    datset_config_name = f"{domain}-fulltext"
    dataset_split_name = f"human+{agent}+{other_agents}"
    dataset: Dataset = (
        load_dataset(
            "liberi-luminaris/PrismAI",
            datset_config_name,
            split=dataset_split_name,
            token=HF_TOKEN,
        )  # type: ignore
        .rename_column("label", "labels")
        .filter(
            lambda text: len(text.strip()) > 0,
            input_columns=["text"],
            num_proc=num_proc,
        )
    )
    datasets_matched, dataset_unmatched = get_matched_datasets(
        dataset, agent, num_proc=num_proc
    )
    datasets_matched["unmatched"] = dataset_unmatched
    datasets[domain] = datasets_matched
del dataset
datasets

{'blog_authorship_corpus': DatasetDict({
     train: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 6406
     })
     eval: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 914
     })
     test: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 1832
     })
     unmatched: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 24280
     })
 }),
 'student_essays': DatasetDict({
     train: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 50734
     })
     eval: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 7248
     })
     test: Dataset({
         features: ['agent', 'id_sample', 'id_source', 'labels', 'text'],
         num_rows: 14496
     })
     unmatched: Dataset

### RoBERTa

In [19]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/chatgpt_roberta_detector/chatgpt_detector.py
import traceback

import torch
from datasets import DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import BatchEncoding

accuracy = evaluate.load("accuracy")


def compute_metrics_acc(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


class RoBERTaClassifier(DetectorABC):
    def __init__(
        self,
        model_name="roberta-base",
        tokenizer_name=None,
        device="cuda" if torch.cuda.is_available() else "cpu",
    ):
        super().__init__(
            AutoTokenizer.from_pretrained(tokenizer_name or model_name),
            device=device,
        )
        self.device = torch.device(device)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model = self.model.to(self.device)
        self.model.eval()

    def reset(self):
        model_name = self.model.name_or_path
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model = self.model.to(self.device)

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=512,
            return_length=True,
        )

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        encoding = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        outputs = self.model(**encoding)
        probs = outputs.logits
        return probs[:, 1].detach().cpu().flatten().tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

    def train(
        self,
        dataset: DatasetDict,
        training_args: TrainingArguments,
        save_path: str | Path | None = None,
    ):
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        trainer = Trainer(
            self.model,
            training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["eval"],
            data_collator=data_collator,
            compute_metrics=compute_metrics_acc,  # type: ignore
        )

        trainer.train()
        if save_path:
            try:
                trainer.save_model(str(save_path))
            except Exception:
                traceback.print_exc()

        self.model = trainer.model.to(self.device)

        del trainer

    @torch.inference_mode()
    def process_texts(self, texts: list[str]) -> list[float]:
        encoding = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(self.device)
        outputs = self.model(**encoding)
        probs = outputs.logits
        return probs[:, 1].detach().cpu().flatten().tolist()

In [15]:
def run_roberta(model_name: str):
    detector = RoBERTaClassifier(model_name, device="cuda:0")
    try:
        return run_detector(detector, datasets)
    finally:
        detector.model.to("cpu")
        del detector
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()


scores_roberta = run_roberta("Hello-SimpleAI/chatgpt-detector-roberta")
print(json.dumps(scores_roberta, indent=4))
with open("../logs/roberta-Hello-SimpleAI.json", "w") as fp:
    json.dump(scores_roberta, fp, indent=4)

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/1832 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/1832 [00:00<?, ? examples/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/1506 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/1506 [00:00<?, ? examples/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/2386 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2386 [00:00<?, ? examples/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/2870 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2870 [00:00<?, ? examples/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/594 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/594 [00:00<?, ? examples/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

{
    "blog_authorship_corpus": {
        "n_samples": 1832,
        "f1_score": 0.6784643164114139,
        "precision": 0.7889491629184345,
        "recall": 0.7025109170305677,
        "accuracy": 0.7025109170305677,
        "roc_auc": 0.7025109170305676,
        "fpr": 0.024017467248908297,
        "tpr": 0.42903930131004364,
        "f1_human": 0.7663951993141878,
        "f1_ai": 0.5905334335086401,
        "f1_score_median": 0.8231441048034934,
        "precision_median": 0.8231441048034934,
        "recall_median": 0.8231441048034934,
        "accuracy_median": 0.8231441048034934,
        "roc_auc_median": 0.8231441048034934,
        "fpr_median": 0.17685589519650655,
        "tpr_median": 0.8231441048034934,
        "f1_human_median": 0.8231441048034934,
        "f1_ai_median": 0.8231441048034934,
        "threshold_median": 0.11454159047796125,
        "f1_score_mean": 0.7693066743268053,
        "precision_mean": 0.8177356786601261,
        "recall_mean": 0.7767467248908297,

In [None]:
def run_roberta_ft(model_name="roberta-base", only_eval=False, cross_eval=False):
    model = RoBERTaClassifier(model_name, device="cuda:0")

    try:
        model_str = model_name.replace("/", "--").replace(":", "--")
        output_path = Path("../models/roberta-ft/") / model_str

        logs_path = Path("../logs/roberta-ft/") / model_str
        logs_path.mkdir(parents=True, exist_ok=True)

        datasets_tokenized = {
            config: dataset.map(
                model.tokenize,
                input_columns=["text"],
                batched=True,
                batch_size=1024,
                desc="Tokenizing",
            ).sort("length")
            for config, dataset in (
                datasets.items()
                if not only_eval
                else (
                    # We can omit training & dev splits if we are only evaluating
                    (config, DatasetDict({"test": dataset["test"]}))
                    for config, dataset in datasets.items()
                )
            )
        }

        tq = tqdm(
            datasets_tokenized.items(),
            desc="Evaluating" if only_eval else "Finetuning",
        )
        scores_roberta_ft = {}
        for config, dataset in tq:
            tq.set_postfix_str(config)

            output_dir = output_path / config
            final_model_path = output_dir / "final"
            if not only_eval:
                model.reset()
                training_args = TrainingArguments(
                    output_dir=str(output_dir),
                    seed=42,
                    #
                    learning_rate=1e-5,
                    num_train_epochs=1,
                    #
                    per_device_train_batch_size=15,
                    per_device_eval_batch_size=30,
                    #
                    logging_steps=50,
                    logging_strategy="steps",
                    eval_steps=50,
                    eval_strategy="steps",
                    save_strategy="epoch",
                    save_total_limit=2,
                )
                model.train(
                    dataset,
                    training_args,
                    save_path=str(final_model_path),
                )
            else:
                model.model.to("cpu")
                del model
                model = RoBERTaClassifier(
                    final_model_path, tokenizer_name=model_name, device="cuda:0"
                )

            scores_roberta_ft[config] = run_detector_tokenized(
                model,
                datasets_tokenized if cross_eval else {config: dataset},
            )
            with (logs_path / f"{config}.json").open("w") as fp:
                json.dump(scores_roberta_ft[config], fp, indent=4)

        return scores_roberta_ft
    finally:
        model.model.to("cpu")
        del model
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()


In [None]:
scores_roberta_base_ft = run_roberta_ft("roberta-base", only_eval=True, cross_eval=True)
print(json.dumps(scores_roberta_base_ft, indent=4))
with open("../logs/roberta-ft/roberta-base-ft.json", "w") as fp:
    json.dump(scores_roberta_base_ft, fp, indent=4)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing:   0%|          | 0/1832 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1506 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2386 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2870 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/594 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Evaluating:   0%|          | 0/11 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/1832 [00:00<?, ? examples/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/1506 [00:00<?, ? examples/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/2386 [00:00<?, ? examples/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/2870 [00:00<?, ? examples/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/594 [00:00<?, ? examples/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

{
    "blog_authorship_corpus": {
        "blog_authorship_corpus": {
            "n_samples": 1832,
            "f1_score": 0.980893551275576,
            "precision": 0.9810608905691329,
            "recall": 0.9808951965065502,
            "accuracy": 0.9808951965065502,
            "roc_auc": 0.9808951965065502,
            "fpr": 0.028384279475982533,
            "tpr": 0.990174672489083,
            "f1_human": 0.9807162534435262,
            "f1_ai": 0.9810708491076258,
            "f1_score_median": 0.9847161572052402,
            "precision_median": 0.9847161572052402,
            "recall_median": 0.9847161572052402,
            "accuracy_median": 0.9847161572052402,
            "roc_auc_median": 0.9847161572052402,
            "fpr_median": 0.015283842794759825,
            "tpr_median": 0.9847161572052402,
            "f1_human_median": 0.9847161572052402,
            "f1_ai_median": 0.9847161572052402,
            "threshold_median": 0.832700160697793,
            "f1_score

### RADAR

In [7]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/radar/radar.py

import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class Radar(DetectorABC):
    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__(
            AutoTokenizer.from_pretrained("TrustSafeAI/RADAR-Vicuna-7B"),
            device=device,
        )
        self.device = torch.device(device)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "TrustSafeAI/RADAR-Vicuna-7B",
        )
        self.model.eval()
        self.model.to(self.device)

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=512,
            return_length=True,
        )

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        encoding = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        outputs = self.model(**encoding)
        output_probs = F.log_softmax(outputs.logits, -1)[:, 1].exp().tolist()
        return output_probs

    def process(self, inputs: dict) -> dict[str, list[float]]:
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

    @torch.inference_mode()
    def process_texts(self, texts: list[str]) -> list[float]:
        encoding = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(self.device)
        outputs = self.model(**encoding)
        output_probs = F.log_softmax(outputs.logits, -1)[:, 1].exp().tolist()
        return output_probs

In [8]:
def run_radar():
    detector = Radar(device="cuda")
    try:
        return run_detector(detector, datasets, sigmoid=False)
    finally:
        detector.model.to("cpu")
        del detector
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()


scores_radar = run_radar()
print(json.dumps(scores_radar, indent=4))
with open("../logs/radar.json", "w") as fp:
    json.dump(scores_radar, fp, indent=4)

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/1832 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/1832 [00:00<?, ? examples/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/1506 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/1506 [00:00<?, ? examples/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/2386 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2386 [00:00<?, ? examples/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/2870 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2870 [00:00<?, ? examples/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/594 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/594 [00:00<?, ? examples/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

{
    "blog_authorship_corpus": {
        "n_samples": 1832,
        "f1_score": 0.4128822786708428,
        "precision": 0.4205433211175738,
        "recall": 0.42958515283842796,
        "accuracy": 0.42958515283842796,
        "roc_auc": 0.4295851528384279,
        "fpr": 0.4017467248908297,
        "tpr": 0.2609170305676856,
        "f1_human": 0.5119103222793088,
        "f1_ai": 0.31385423506237686,
        "f1_score_median": 0.4606986899563319,
        "precision_median": 0.4606986899563319,
        "recall_median": 0.4606986899563319,
        "accuracy_median": 0.4606986899563319,
        "roc_auc_median": 0.4606986899563319,
        "fpr_median": 0.5393013100436681,
        "tpr_median": 0.4606986899563319,
        "f1_human_median": 0.4606986899563319,
        "f1_ai_median": 0.4606986899563319,
        "threshold_median": 0.036310071125626564,
        "f1_score_mean": 0.4254277580222148,
        "precision_mean": 0.4303833120233188,
        "recall_mean": 0.43504366812227074

### Binoculars

In [9]:
# Source: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/binoculars/utils/metrics.py

ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
softmax_fn = torch.nn.Softmax(dim=-1)


def perplexity(
    encoding: BatchEncoding,
    logits: torch.Tensor,
    median: bool = False,
    temperature: float = 1.0,
):
    shifted_logits = logits[..., :-1, :].contiguous() / temperature
    shifted_labels = encoding.input_ids[..., 1:].contiguous()
    shifted_attention_mask = encoding.attention_mask[..., 1:].contiguous()

    if median:
        ce_nan = ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels).masked_fill(
            ~shifted_attention_mask.bool(), float("nan")
        )
        ppl = np.nanmedian(ce_nan.cpu().float().numpy(), 1)

    else:
        ppl = (
            ce_loss_fn(shifted_logits.transpose(1, 2), shifted_labels)
            * shifted_attention_mask
        ).sum(1) / shifted_attention_mask.sum(1)
        ppl = ppl.to("cpu").float().numpy()

    return ppl


def entropy(
    p_logits: torch.Tensor,
    q_logits: torch.Tensor,
    encoding: BatchEncoding,
    pad_token_id: int,
    median: bool = False,
    sample_p: bool = False,
    temperature: float = 1.0,
):
    vocab_size = p_logits.shape[-1]
    total_tokens_available = q_logits.shape[-2]

    if not temperature:
        p_scores, q_scores = p_logits, q_logits
    else:
        p_scores, q_scores = p_logits / temperature, q_logits / temperature

    p_proba = softmax_fn(p_scores).view(-1, vocab_size)

    if sample_p:
        p_proba = torch.multinomial(
            p_proba.view(-1, vocab_size), replacement=True, num_samples=1
        ).view(-1)

    q_scores = q_scores.view(-1, vocab_size)

    ce = ce_loss_fn(input=q_scores, target=p_proba).view(-1, total_tokens_available)
    padding_mask = (encoding.input_ids != pad_token_id).type(torch.uint8)

    if median:
        ce_nan = ce.masked_fill(~padding_mask.bool(), float("nan"))
        agg_ce = np.nanmedian(ce_nan.cpu().float().numpy(), 1)
    else:
        agg_ce = (
            ((ce * padding_mask).sum(1) / padding_mask.sum(1)).to("cpu").float().numpy()
        )

    return agg_ce

In [12]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/binoculars/binoculars.py

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.set_grad_enabled(False)

GLOBAL_BINOCULARS_THRESHOLD = (
    0.9015310749276843  # selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
)
DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1


class Binoculars(DetectorABC):
    def __init__(
        self,
        observer_name_or_path: str = "tiiuae/falcon-7b",
        performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
        use_bfloat16: bool = True,
        max_token_observed: int = 512,
    ) -> None:
        super().__init__(AutoTokenizer.from_pretrained(observer_name_or_path))
        self.observer_model = AutoModelForCausalLM.from_pretrained(
            observer_name_or_path,
            device_map={"": DEVICE_1},
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if use_bfloat16 else torch.float32,
        )
        self.performer_model = AutoModelForCausalLM.from_pretrained(
            performer_name_or_path,
            device_map={"": DEVICE_2},
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if use_bfloat16 else torch.float32,
        )

        self.observer_model.eval()
        self.performer_model.eval()

        if not self.tokenizer.pad_token:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.max_token_observed = max_token_observed

        tokenizer = self.tokenizer

        def _tokenize(texts: list[str]) -> BatchEncoding:
            return tokenizer(
                texts,
                padding=False,
                truncation=True,
                max_length=max_token_observed,
                return_length=True,
                return_token_type_ids=False,
            )

        self.tokenize = _tokenize

    def tokenize(self):
        pass

    @torch.inference_mode()
    def _get_logits(
        self, encodings: BatchEncoding
    ) -> tuple[torch.Tensor, torch.Tensor]:
        observer_logits = self.observer_model(
            **encodings.to(self.observer_model.device)
        ).logits
        performer_logits = self.performer_model(
            **encodings.to(self.performer_model.device)
        ).logits
        if DEVICE_1 != "cpu":
            torch.cuda.synchronize()
        return observer_logits, performer_logits

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        encodings = self.tokenizer.pad(inputs, return_tensors="pt")
        observer_logits, performer_logits = self._get_logits(encodings)
        ppl = perplexity(encodings, performer_logits)
        x_ppl = entropy(
            observer_logits.to(DEVICE_1),
            performer_logits.to(DEVICE_1),
            encodings.to(DEVICE_1),
            self.tokenizer.pad_token_id,  # type: ignore
        )
        binoculars_scores = ppl / x_ppl
        return binoculars_scores.tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

    @torch.inference_mode()
    def process_texts(self, texts: list[str]) -> list[float]:
        encodings = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_token_observed,
            return_token_type_ids=False,
        ).to(self.device)
        observer_logits, performer_logits = self._get_logits(encodings)
        ppl = perplexity(encodings, performer_logits)
        x_ppl = entropy(
            observer_logits.to(DEVICE_1),
            performer_logits.to(DEVICE_1),
            encodings.to(DEVICE_1),
            self.tokenizer.pad_token_id,  # type: ignore
        )
        binoculars_scores = ppl / x_ppl
        return binoculars_scores.tolist()

In [13]:
def run_binoculars():
    detector = Binoculars("tiiuae/falcon-7b", "tiiuae/falcon-7b-instruct")
    try:
        return run_detector(
            detector,
            datasets,
            batch_size=16,
            threshold=GLOBAL_BINOCULARS_THRESHOLD,
            sigmoid=False,
            greater=False,
        )
    finally:
        detector.observer_model.to("cpu")
        detector.performer_model.to("cpu")
        del detector
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()


scores_binoculars = run_binoculars()
print(json.dumps(scores_binoculars, indent=4))
with open("../logs/binoculars.json", "w") as fp:
    json.dump(scores_binoculars, fp, indent=4)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/115 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Processing student_essays:   0%|          | 0/906 [00:00<?, ?it/s]

Processing cnn_news:   0%|          | 0/296 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/95 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/150 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/180 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/38 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Processing bundestag:   0%|          | 0/252 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Processing spiegel_articles:   0%|          | 0/307 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Processing en:   0%|          | 0/1776 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Processing de:   0%|          | 0/561 [00:00<?, ?it/s]

{
    "blog_authorship_corpus": {
        "n_samples": 1832,
        "f1_score": 0.8228650990099009,
        "precision": 0.844482584553256,
        "recall": 0.8253275109170306,
        "accuracy": 0.8253275109170306,
        "roc_auc": 0.8253275109170306,
        "fpr": 0.056768558951965066,
        "tpr": 0.7074235807860262,
        "f1_human": 0.84375,
        "f1_ai": 0.801980198019802,
        "f1_score_median": 0.8165938864628821,
        "precision_median": 0.8165938864628821,
        "recall_median": 0.8165938864628821,
        "accuracy_median": 0.8165938864628821,
        "roc_auc_median": 0.8165938864628821,
        "fpr_median": 0.18340611353711792,
        "tpr_median": 0.8165938864628821,
        "f1_human_median": 0.8165938864628821,
        "f1_ai_median": 0.8165938864628821,
        "threshold_median": 0.9553535878658295,
        "f1_score_mean": 0.8314709968925882,
        "precision_mean": 0.8351128098238656,
        "recall_mean": 0.8318777292576419,
        "accur

### E5-Small LoRA


In [5]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/radar/radar.py

import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class E5Lora(DetectorABC):
    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        super().__init__(
            AutoTokenizer.from_pretrained(
                "MayZhou/e5-small-lora-ai-generated-detector"
            ),
            device=device,
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            "MayZhou/e5-small-lora-ai-generated-detector"
        )
        self.model.eval()
        self.model.to(self.device)

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=512,
            return_length=True,
        )

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        encoding = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        outputs = self.model(**encoding)
        output_probs = F.log_softmax(outputs.logits, -1)[:, 1].exp().tolist()
        return output_probs

    def process(self, inputs: dict) -> dict[str, list[float]]:
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

    @torch.inference_mode()
    def process_texts(self, texts: list[str]) -> list[float]:
        encoding = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(self.device)
        outputs = self.model(**encoding)
        output_probs = F.log_softmax(outputs.logits, -1)[:, 1].exp().tolist()
        return output_probs

In [6]:
def run_e5_small_lora():
    detector = E5Lora(device="cuda:0")
    try:
        return run_detector(detector, datasets, sigmoid=False)
    finally:
        detector.model.to("cpu")
        del detector
        gc.collect()
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        gc.collect()


scores_e5 = run_e5_small_lora()

print(json.dumps(scores_e5, indent=4))
with open("../logs/e5-small-lora.json", "w") as fp:
    json.dump(scores_e5, fp, indent=4)

Predicting on Datasets:   0%|          | 0/11 [00:00<?, ?it/s]

Processing blog_authorship_corpus:   0%|          | 0/58 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Processing student_essays:   0%|          | 0/453 [00:00<?, ?it/s]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Processing cnn_news:   0%|          | 0/148 [00:00<?, ?it/s]

Processing euro_court_cases:   0%|          | 0/48 [00:00<?, ?it/s]

Processing house_of_commons:   0%|          | 0/75 [00:00<?, ?it/s]

Processing arxiv_papers:   0%|          | 0/90 [00:00<?, ?it/s]

Processing gutenberg_en:   0%|          | 0/19 [00:00<?, ?it/s]

Processing bundestag:   0%|          | 0/126 [00:00<?, ?it/s]

Processing spiegel_articles:   0%|          | 0/154 [00:00<?, ?it/s]

Processing en:   0%|          | 0/888 [00:00<?, ?it/s]

Processing de:   0%|          | 0/281 [00:00<?, ?it/s]

{
    "blog_authorship_corpus": {
        "n_samples": 1832,
        "f1_score": 0.623582435791218,
        "precision": 0.7709560851594541,
        "recall": 0.6615720524017468,
        "accuracy": 0.6615720524017468,
        "roc_auc": 0.6615720524017468,
        "fpr": 0.6561135371179039,
        "tpr": 0.9792576419213974,
        "f1_human": 0.504,
        "f1_ai": 0.7431648715824358,
        "f1_score_median": 0.7620087336244541,
        "precision_median": 0.7620087336244541,
        "recall_median": 0.7620087336244541,
        "accuracy_median": 0.7620087336244541,
        "roc_auc_median": 0.762008733624454,
        "fpr_median": 0.23799126637554585,
        "tpr_median": 0.7620087336244541,
        "f1_human_median": 0.7620087336244541,
        "f1_ai_median": 0.7620087336244541,
        "threshold_median": 0.8260228335857391,
        "f1_score_mean": 0.7437549397940648,
        "precision_mean": 0.7770039352402083,
        "recall_mean": 0.75,
        "accuracy_mean": 0.75,
 

In [33]:
domains = [
    "Web Blogs",
    "Essays",
    "CNN",
    "ECHR",
    "HoC",
    "arXiv",
    "Gutenberg$_{en}$",
    "Bundestag$_{de}$",
    "Spiegel$_{de}$",
    "Gutenberg$_{de}$",
    "All$_{en}$",
    "All$_{de}$",
]

name_map = {
    "blog_authorship_corpus": "Web Blogs",
    "student_essays": "Essays",
    "cnn_news": "CNN",
    "euro_court_cases": "ECHR",
    "house_of_commons": "HoC",
    "arxiv_papers": "arXiv",
    "gutenberg_en": "Gutenberg$_{en}$",
    "bundestag": "Bundestag$_{de}$",
    "spiegel_articles": "Spiegel$_{de}$",
    # "gutenberg_de": "Gutenberg$_{de}$",
    "en": "All$_{en}$",
    "de": "All$_{de}$",
}

In [34]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

results = defaultdict(dict)
for logs_path in Path("../logs/").iterdir():
    if logs_path.suffix == ".json":
        with logs_path.open("r") as fp:
            data = json.load(fp)
        model_name = logs_path.stem
        for domain, scores in data.items():
            results[name_map[domain]].update(
                {
                    model_name + "_f1_score": scores["f1_score"],
                    model_name + "_accuracy": scores["accuracy"],
                    model_name + "_roc_auc": scores["roc_auc"],
                }
            )

model_name = "roberta-base-ft"
for domain, name in name_map.items():
    logs_path = Path("../logs/roberta-ft/roberta-base/") / (domain + ".json")
    with (logs_path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1_score": data[domain]["f1_score"],
            model_name + "_accuracy": data[domain]["accuracy"],
            model_name + "_roc_auc": data[domain]["roc_auc"],
        }
    )

metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df

\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
 & binoculars_f1_score & binoculars_accuracy & binoculars_roc_auc & e5-small-lora_f1_score & e5-small-lora_accuracy & e5-small-lora_roc_auc & radar_f1_score & radar_accuracy & radar_roc_auc & roberta-Hello-SimpleAI_f1_score & roberta-Hello-SimpleAI_accuracy & roberta-Hello-SimpleAI_roc_auc & roberta-base-ft_f1_score & roberta-base-ft_accuracy & roberta-base-ft_roc_auc \\
domain &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
Web Blogs & 0.823 & 0.825 & 0.825 & 0.624 & 0.662 & 0.662 & 0.413 & 0.430 & 0.430 & 0.678 & 0.703 & 0.703 & 0.981 & 0.981 & 0.981 \\
Essays & 0.991 & 0.991 & 0.991 & 0.511 & 0.591 & 0.591 & 0.299 & 0.327 & 0.327 & 0.610 & 0.628 & 0.628 & 0.999 & 0.999 & 0.999 \\
CNN & 0.992 & 0.992 & 0.992 & 0.596 & 0.646 & 0.646 & 0.201 & 0.242 & 0.242 & 0.649 & 0.676 & 0.676 & 0.999 & 0.999 & 0.999 \\
ECHR & 0.945 & 0.946 & 0.946 & 0.372 & 0.518 & 0.518 & 0.416 & 0.420 & 0.420 & 0.445 & 0.555 & 0.555 & 1.000 & 1.000 & 1.000 \\
H

Unnamed: 0_level_0,binoculars_f1_score,binoculars_accuracy,binoculars_roc_auc,e5-small-lora_f1_score,e5-small-lora_accuracy,e5-small-lora_roc_auc,radar_f1_score,radar_accuracy,radar_roc_auc,roberta-Hello-SimpleAI_f1_score,roberta-Hello-SimpleAI_accuracy,roberta-Hello-SimpleAI_roc_auc,roberta-base-ft_f1_score,roberta-base-ft_accuracy,roberta-base-ft_roc_auc
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Web Blogs,0.822865,0.825328,0.825328,0.623582,0.661572,0.661572,0.412882,0.429585,0.429585,0.678464,0.702511,0.702511,0.980894,0.980895,0.980895
Essays,0.991445,0.991446,0.991446,0.510591,0.591336,0.591336,0.299054,0.327263,0.327263,0.609554,0.627897,0.627897,0.998896,0.998896,0.998896
CNN,0.991536,0.991536,0.991536,0.596457,0.646424,0.646424,0.20099,0.241642,0.241642,0.64876,0.675624,0.675624,0.999365,0.999365,0.999365
ECHR,0.945397,0.945551,0.945551,0.371981,0.517928,0.517928,0.416427,0.419655,0.419655,0.44533,0.555113,0.555113,1.0,1.0,1.0
HoC,0.980717,0.980721,0.980721,0.822936,0.828164,0.828164,0.158682,0.158843,0.158843,0.544177,0.611065,0.611065,0.997904,0.997904,0.997904
arXiv,0.996516,0.996516,0.996516,0.517223,0.595819,0.595819,0.273349,0.365854,0.365854,0.41122,0.537282,0.537282,0.998955,0.998955,0.998955
Gutenberg$_{en}$,0.991582,0.991582,0.991582,0.935817,0.936027,0.936027,0.269943,0.316498,0.316498,0.463553,0.563973,0.563973,0.993266,0.993266,0.993266
Bundestag$_{de}$,0.955699,0.955765,0.955765,0.377365,0.438121,0.438121,0.372983,0.504225,0.504225,0.333333,0.5,0.5,0.996024,0.996024,0.996024
Spiegel$_{de}$,0.907124,0.907793,0.907793,0.622337,0.630967,0.630967,0.335255,0.496328,0.496328,0.334692,0.500612,0.500612,0.977345,0.977356,0.977356
All$_{en}$,0.978102,0.978102,0.978102,0.568783,0.627834,0.627834,0.289779,0.309675,0.309675,0.591458,0.624313,0.624313,0.998169,0.998169,0.998169


In [35]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

results = defaultdict(dict)
for logs_path in Path("../logs/").iterdir():
    if logs_path.suffix == ".json":
        with logs_path.open("r") as fp:
            data = json.load(fp)
        model_name = logs_path.stem
        for domain, scores in data.items():
            results[name_map[domain]].update(
                {
                    model_name + "_f1_score": scores["f1_score_median"],
                    model_name + "_accuracy": scores["accuracy_median"],
                    model_name + "_roc_auc": scores["roc_auc_median"],
                }
            )

model_name = "roberta-base-ft"
for domain, name in name_map.items():
    logs_path = Path("../logs/roberta-ft/roberta-base/") / (domain + ".json")
    with (logs_path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1_score": data[domain]["f1_score_median"],
            model_name + "_accuracy": data[domain]["accuracy_median"],
            model_name + "_roc_auc": data[domain]["roc_auc_median"],
        }
    )

metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df

\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
 & binoculars_f1_score & binoculars_accuracy & binoculars_roc_auc & e5-small-lora_f1_score & e5-small-lora_accuracy & e5-small-lora_roc_auc & radar_f1_score & radar_accuracy & radar_roc_auc & roberta-Hello-SimpleAI_f1_score & roberta-Hello-SimpleAI_accuracy & roberta-Hello-SimpleAI_roc_auc & roberta-base-ft_f1_score & roberta-base-ft_accuracy & roberta-base-ft_roc_auc \\
domain &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
Web Blogs & 0.817 & 0.817 & 0.817 & 0.762 & 0.762 & 0.762 & 0.461 & 0.461 & 0.461 & 0.823 & 0.823 & 0.823 & 0.985 & 0.985 & 0.985 \\
Essays & 0.997 & 0.997 & 0.997 & 0.787 & 0.787 & 0.787 & 0.260 & 0.260 & 0.260 & 0.683 & 0.683 & 0.683 & 1.000 & 1.000 & 1.000 \\
CNN & 0.997 & 0.997 & 0.997 & 0.940 & 0.940 & 0.940 & 0.081 & 0.081 & 0.081 & 0.865 & 0.865 & 0.865 & 1.000 & 1.000 & 1.000 \\
ECHR & 0.989 & 0.989 & 0.989 & 0.919 & 0.919 & 0.919 & 0.405 & 0.405 & 0.405 & 0.757 & 0.757 & 0.757 & 1.000 & 1.000 & 1.000 \\
H

Unnamed: 0_level_0,binoculars_f1_score,binoculars_accuracy,binoculars_roc_auc,e5-small-lora_f1_score,e5-small-lora_accuracy,e5-small-lora_roc_auc,radar_f1_score,radar_accuracy,radar_roc_auc,roberta-Hello-SimpleAI_f1_score,roberta-Hello-SimpleAI_accuracy,roberta-Hello-SimpleAI_roc_auc,roberta-base-ft_f1_score,roberta-base-ft_accuracy,roberta-base-ft_roc_auc
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Web Blogs,0.816594,0.816594,0.816594,0.762009,0.762009,0.762009,0.460699,0.460699,0.460699,0.823144,0.823144,0.823144,0.984716,0.984716,0.984716
Essays,0.996551,0.996551,0.996551,0.786562,0.786562,0.786562,0.260348,0.260348,0.260348,0.682671,0.682671,0.682671,1.0,1.0,1.0
CNN,0.996614,0.996614,0.996614,0.94033,0.94033,0.94033,0.081253,0.081253,0.081253,0.864579,0.864579,0.864579,0.999788,0.999788,0.999788
ECHR,0.989376,0.989376,0.989376,0.918991,0.918991,0.918991,0.405046,0.405046,0.405046,0.756972,0.756972,0.756972,1.0,1.0,1.0
HoC,0.970243,0.970243,0.970243,0.966471,0.966471,0.966471,0.16513,0.16513,0.16513,0.857502,0.857502,0.857502,0.999162,0.999162,0.999162
arXiv,0.996516,0.996516,0.996516,0.935192,0.935192,0.935192,0.121254,0.121254,0.121254,0.802091,0.802091,0.802091,0.999303,0.999303,0.999303
Gutenberg$_{en}$,0.993266,0.993266,0.993266,0.969697,0.969697,0.969697,0.191919,0.191919,0.191919,0.878788,0.878788,0.878788,0.993266,0.993266,0.993266
Bundestag$_{de}$,0.963718,0.963718,0.963718,0.387177,0.387177,0.387177,0.553678,0.553678,0.553678,0.697316,0.697316,0.697316,0.996521,0.996521,0.996521
Spiegel$_{de}$,0.954712,0.954712,0.954712,0.651979,0.651979,0.651979,0.383925,0.383925,0.383925,0.5561,0.5561,0.5561,0.988984,0.988984,0.988984
All$_{en}$,0.978066,0.978066,0.978066,0.83967,0.83967,0.83967,0.242712,0.242712,0.242712,0.72518,0.72518,0.72518,0.998733,0.998733,0.998733


In [36]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

results = defaultdict(dict)
for logs_path in Path("../logs/").iterdir():
    if logs_path.suffix == ".json":
        with logs_path.open("r") as fp:
            data = json.load(fp)
        model_name = logs_path.stem
        for domain, scores in data.items():
            results[name_map[domain]].update(
                {
                    model_name + "_f1_score": scores["f1_score_mean"],
                    model_name + "_accuracy": scores["accuracy_mean"],
                    model_name + "_roc_auc": scores["roc_auc_mean"],
                }
            )

model_name = "roberta-base-ft"
for domain, name in name_map.items():
    logs_path = Path("../logs/roberta-ft/roberta-base/") / (domain + ".json")
    with (logs_path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1_score": data[domain]["f1_score_mean"],
            model_name + "_accuracy": data[domain]["accuracy_mean"],
            model_name + "_roc_auc": data[domain]["roc_auc_mean"],
        }
    )

metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df

\begin{tabular}{lrrrrrrrrrrrrrrr}
\toprule
 & binoculars_f1_score & binoculars_accuracy & binoculars_roc_auc & e5-small-lora_f1_score & e5-small-lora_accuracy & e5-small-lora_roc_auc & radar_f1_score & radar_accuracy & radar_roc_auc & roberta-Hello-SimpleAI_f1_score & roberta-Hello-SimpleAI_accuracy & roberta-Hello-SimpleAI_roc_auc & roberta-base-ft_f1_score & roberta-base-ft_accuracy & roberta-base-ft_roc_auc \\
domain &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
Web Blogs & 0.831 & 0.832 & 0.832 & 0.744 & 0.750 & 0.750 & 0.425 & 0.435 & 0.435 & 0.769 & 0.777 & 0.777 & 0.981 & 0.981 & 0.981 \\
Essays & 0.997 & 0.997 & 0.997 & 0.715 & 0.732 & 0.732 & 0.276 & 0.282 & 0.282 & 0.636 & 0.646 & 0.646 & 0.999 & 0.999 & 0.999 \\
CNN & 0.995 & 0.995 & 0.995 & 0.805 & 0.812 & 0.812 & 0.128 & 0.136 & 0.136 & 0.763 & 0.768 & 0.768 & 0.999 & 0.999 & 0.999 \\
ECHR & 0.985 & 0.985 & 0.985 & 0.763 & 0.772 & 0.772 & 0.408 & 0.408 & 0.408 & 0.680 & 0.701 & 0.701 & 1.000 & 1.000 & 1.000 \\
H

Unnamed: 0_level_0,binoculars_f1_score,binoculars_accuracy,binoculars_roc_auc,e5-small-lora_f1_score,e5-small-lora_accuracy,e5-small-lora_roc_auc,radar_f1_score,radar_accuracy,radar_roc_auc,roberta-Hello-SimpleAI_f1_score,roberta-Hello-SimpleAI_accuracy,roberta-Hello-SimpleAI_roc_auc,roberta-base-ft_f1_score,roberta-base-ft_accuracy,roberta-base-ft_roc_auc
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Web Blogs,0.831471,0.831878,0.831878,0.743755,0.75,0.75,0.425428,0.435044,0.435044,0.769307,0.776747,0.776747,0.980894,0.980895,0.980895
Essays,0.996551,0.996551,0.996551,0.714827,0.732133,0.732133,0.276357,0.282285,0.282285,0.636387,0.645902,0.645902,0.998896,0.998896,0.998896
CNN,0.995345,0.995345,0.995345,0.805323,0.81168,0.81168,0.127795,0.135844,0.135844,0.762697,0.767668,0.767668,0.999365,0.999365,0.999365
ECHR,0.985391,0.985392,0.985392,0.763402,0.77158,0.77158,0.407928,0.408367,0.408367,0.679952,0.701195,0.701195,1.0,1.0,1.0
HoC,0.975679,0.975692,0.975692,0.918151,0.918692,0.918692,0.159002,0.159262,0.159262,0.745657,0.756915,0.756915,0.997904,0.997904,0.997904
arXiv,0.996864,0.996864,0.996864,0.837638,0.841115,0.841115,0.195969,0.22892,0.22892,0.589704,0.638328,0.638328,0.998955,0.998955,0.998955
Gutenberg$_{en}$,0.994949,0.994949,0.994949,0.952828,0.952862,0.952862,0.233326,0.252525,0.252525,0.741316,0.755892,0.755892,0.993266,0.993266,0.993266
Bundestag$_{de}$,0.968929,0.968936,0.968936,0.384663,0.386183,0.386183,0.507895,0.562873,0.562873,0.645089,0.652336,0.652336,0.996024,0.996024,0.996024
Spiegel$_{de}$,0.954294,0.954304,0.954304,0.641101,0.641983,0.641983,0.389987,0.485924,0.485924,0.523051,0.548756,0.548756,0.977549,0.97756,0.97756
All$_{en}$,0.981654,0.981658,0.981658,0.754277,0.764505,0.764505,0.264748,0.270103,0.270103,0.669106,0.677792,0.677792,0.998169,0.998169,0.998169


In [None]:
results = defaultdict(dict)

model_name = "roberta-ft"
for domain, name in name_map.items():
    logs_path = Path("../logs/chatgpt-detector-roberta/") / (domain + ".json")
    with (logs_path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1": data[domain]["f1"],
            # model_name + "_accuracy": data[domain]["accuracy"],
            model_name + "_auroc": data[domain]["auroc"],
        }
    )

metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df