# Luminar

## Baselines: Neural Network Models

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# Source: Ghostbuster, Verma et al. (2024)
def get_scores(labels, probabilities, calibrated=False, precision=6):
    """Compute accuracy, F1 and AUROC for binary detection scores.

    Args:
        labels: Ground-truth binary labels (0/1); any sequence or array.
        probabilities: Score for the positive class, one entry per label.
            Plain Python lists are accepted as well as NumPy arrays.
        calibrated: If true, the decision threshold is taken from the sorted
            scores so that (barring ties) the number of predicted positives
            equals the number of positive labels. Otherwise 0.5 is used.
        precision: Number of decimals every returned metric is rounded to.

    Returns:
        A tuple ``(acc, f1, auroc)``. ``auroc`` is ``-1`` when only one class
        is present in ``labels``, because it is undefined in that case.
    """
    # Local import: the cell defining this function does not import numpy.
    import numpy as np

    # Element-wise comparison below needs arrays, not plain lists.
    labels = np.asarray(labels)
    probabilities = np.asarray(probabilities, dtype=float)
    assert len(labels) == len(probabilities)

    # int() keeps the count usable as an index even for float/bool labels.
    n_positive = int(labels.sum())
    n_negative = len(labels) - n_positive

    if not calibrated:
        threshold = 0.5
    elif n_negative == 0:
        # Without this guard the index would be -1, i.e. the largest score,
        # and every sample would be predicted negative.
        threshold = -np.inf
    else:
        # The n_negative-th smallest score: everything above it is positive.
        threshold = np.sort(probabilities)[n_negative - 1]

    predicted = probabilities > threshold
    acc = round(float(accuracy_score(labels, predicted)), precision)
    f1 = round(float(f1_score(labels, predicted)), precision)

    if n_positive == 0 or n_positive == len(labels):
        auroc = -1
    else:
        auroc = round(float(roc_auc_score(labels, probabilities)), precision)

    return acc, f1, auroc

## Data

In [None]:
import gc
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
from tqdm.auto import tqdm

from luminar.utils import get_matched_datasets

# Hugging Face access token, read from ~/.hf_token.
HF_TOKEN = Path.home().joinpath(".hf_token").read_text().strip()

agent = "gpt_4o_mini"
other_agents = "gemma2_9b"
# The split name does not depend on the domain, so it is built once.
dataset_split_name = f"human+{agent}+{other_agents}"

# Maps each domain name to the result of get_matched_datasets, extended with
# an "unmatched" entry.
datasets = {}
for domain in tqdm(
    [
        "blog_authorship_corpus",
        "student_essays",
        "cnn_news",
        "euro_court_cases",
        "house_of_commons",
        "arxiv_papers",
        "gutenberg_en",
        "bundestag",
        "spiegel_articles",
        # "gutenberg_de",
        "en",
        "de",
    ]
):
    datset_config_name = f"{domain}-fulltext"
    dataset: Dataset = load_dataset(
        "liberi-luminaris/PrismAI",
        datset_config_name,
        split=dataset_split_name,
        token=HF_TOKEN,
    )  # type: ignore
    dataset = dataset.rename_column("label", "labels")
    # Drop samples whose text is empty or whitespace-only.
    dataset = dataset.filter(
        lambda text: len(text.strip()) > 0,
        input_columns=["text"],
        num_proc=8,
    )

    datasets_matched, dataset_unmatched = get_matched_datasets(dataset, agent)
    datasets_matched["unmatched"] = dataset_unmatched
    datasets[domain] = datasets_matched
del dataset
datasets

### Setup

In [None]:
import numpy as np

from luminar.utils import compute_metrics


def run_detector(detector, datasets: dict[str, DatasetDict], batch_size=32):
    """Tokenize every ``"test"`` split with ``detector`` and score its output.

    Args:
        detector: Object exposing ``tokenize(texts)`` and ``process(batch)``;
            ``process`` must return a dict with a ``"prediction"`` list.
        datasets: Mapping from configuration name to a dataset dict that has
            a ``"test"`` split with ``"text"`` and ``"labels"`` columns.
        batch_size: Number of samples handed to ``detector.process`` at once.

    Returns:
        Mapping from configuration name to the metrics returned by
        ``compute_metrics``, with every value converted to ``float``.
    """
    results = {}
    progress = tqdm(datasets.items(), desc="Predicting on Datasets")
    for config_name, splits in progress:
        tokenized = splits["test"].map(
            detector.tokenize,
            input_columns=["text"],
            batched=True,
            batch_size=1024,
            desc="Tokenizing",
        )
        # Presumably sorted so each batch holds similar-length samples and
        # needs little padding -- TODO confirm.
        tokenized = tokenized.sort("length")

        gold_labels, scores = [], []
        for batch in tokenized.batch(batch_size):
            gold_labels += batch["labels"]
            scores += detector.process(batch)["prediction"]

        metrics = compute_metrics((np.array(scores), np.array(gold_labels)))
        results[config_name] = {
            name: float(value) for name, value in metrics.items()
        }
    return results


def run_detector_tokenized(detector, datasets: dict[str, DatasetDict], batch_size=32):
    """Score ``detector`` on ``"test"`` splits that are already tokenized.

    Unlike ``run_detector`` this performs no tokenization or sorting; the
    batches are passed to ``detector.process`` exactly as stored.

    Args:
        detector: Object whose ``process(batch)`` returns a dict with a
            ``"prediction"`` list.
        datasets: Mapping from configuration name to a dataset dict with a
            ``"test"`` split that contains a ``"labels"`` column.
        batch_size: Number of samples handed to ``detector.process`` at once.

    Returns:
        Mapping from configuration name to the metrics returned by
        ``compute_metrics``, with every value converted to ``float``.
    """
    results = {}
    progress = tqdm(datasets.items(), desc="Predicting on Datasets")
    for config_name, splits in progress:
        gold_labels, scores = [], []
        for batch in splits["test"].batch(batch_size):
            gold_labels += batch["labels"]
            scores += detector.process(batch)["prediction"]

        metrics = compute_metrics((np.array(scores), np.array(gold_labels)))
        results[config_name] = {
            name: float(value) for name, value in metrics.items()
        }
    return results


### RoBERTa

In [6]:
import json
from pathlib import Path

In [None]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/chatgpt_roberta_detector/chatgpt_detector.py

import evaluate
import numpy as np
import torch
from datasets import DatasetDict
from tqdm import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)
from transformers.tokenization_utils_base import BatchEncoding

# Accuracy metric shared by every fine-tuning run in this notebook.
accuracy = evaluate.load("accuracy")


def compute_metrics_acc(eval_preds):
    """Return the accuracy of arg-max predictions.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class is the
            arg-max of ``logits`` along the last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for these predictions.
    """
    logits, references = eval_preds
    return accuracy.compute(
        predictions=np.argmax(logits, axis=-1),
        references=references,
    )


class ChatGPTDetector:
    """Wrapper around the ``Hello-SimpleAI/chatgpt-detector-roberta`` classifier.

    Supports batched scoring of pre-tokenized inputs (``tokenize`` +
    ``process``), one-by-one scoring of raw texts (``inference``) and
    fine-tuning (``train``). ``reset`` restores the pretrained weights.
    """

    # Hugging Face checkpoint used for both the tokenizer and the classifier.
    MODEL_NAME = "Hello-SimpleAI/chatgpt-detector-roberta"

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load tokenizer and pretrained classifier onto ``device``."""
        self.device = torch.device(device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.model = self._load_model()

    def _load_model(self):
        """Return a freshly loaded pretrained classifier placed on ``self.device``."""
        model = AutoModelForSequenceClassification.from_pretrained(self.MODEL_NAME)
        return model.to(self.device)

    def reset(self):
        """Discard the current weights and reload the pretrained checkpoint."""
        self.model = self._load_model()

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        """Tokenize ``texts`` without padding, truncated to 512 tokens.

        ``return_length=True`` asks the tokenizer to also report the length
        of every sequence.
        """
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=512,
            return_length=True,
        )

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        """Return the class-1 logit for every sequence in ``inputs``.

        Args:
            inputs: Unpadded ``input_ids`` / ``attention_mask`` lists; they
                are padded to a common length here.

        Returns:
            One raw logit per sequence. NOTE(review): unlike ``inference``
            no softmax is applied, so these are not probabilities -- confirm
            that the downstream metric code expects logits.
        """
        # Fine-tuning via ``train`` may leave the model in training mode;
        # force eval mode so dropout cannot make the scores stochastic.
        self.model.eval()
        encoding = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        logits = self.model(**encoding).logits
        return logits[:, 1].detach().cpu().flatten().tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        """Score a batch, forwarding only the columns the model consumes."""
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

    @torch.inference_mode()
    def inference(self, texts: list) -> list:
        """Return the softmax probability of class 1 for each raw text.

        Texts are tokenized and scored one at a time.
        """
        # See ``predict``: guarantee deterministic, dropout-free scoring.
        self.model.eval()
        predictions = []
        for text in tqdm(texts):
            inputs = self.tokenizer(text, truncation=True, return_tensors="pt").to(
                self.device
            )
            probs = self.model(**inputs).logits.softmax(dim=-1)
            # Two-way output: index 0 is treated as "real", index 1 as "fake".
            _, fake = probs.detach().cpu().flatten().numpy().tolist()
            predictions.append(fake)
        return predictions

    def train(self, dataset: DatasetDict, training_args: TrainingArguments):
        """Fine-tune the classifier with the Hugging Face ``Trainer``.

        Args:
            dataset: Tokenized dataset dict providing ``"train"`` and
                ``"eval"`` splits.
            training_args: Configuration forwarded unchanged to ``Trainer``.
        """
        data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        trainer = Trainer(
            self.model,
            training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["eval"],
            data_collator=data_collator,
            compute_metrics=compute_metrics_acc,
        )

        trainer.train()
        # Keep the trained weights on the device this detector was built for.
        self.model = trainer.model.to(self.device)

In [7]:
def f():
    """Score the pretrained RoBERTa detector on all datasets, then free GPU memory."""
    detector = ChatGPTDetector(device="cuda:3")
    results = run_detector(detector, datasets)
    # Drop the only reference to the detector before collecting garbage so
    # that its weights can be released.
    del detector
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return results


scores_roberta = f()
print(json.dumps(scores_roberta, indent=4))

Predicting on Datasets:   0%|                                                                                                                                                             | 0/11 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Predicting on Datasets:   9%|█████████████▌                                                                                                                                       | 1/11 [00:04<00:46,  4.66s/it]

Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Predicting on Datasets:  18%|███████████████████████████                                                                                                                          | 2/11 [01:12<06:16, 41.86s/it]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Predicting on Datasets:  27%|████████████████████████████████████████▋                                                                                                            | 3/11 [01:42<04:51, 36.42s/it]

Tokenizing:   0%|          | 0/1506 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/1506 [00:00<?, ? examples/s]

Predicting on Datasets:  36%|██████████████████████████████████████████████████████▏                                                                                              | 4/11 [01:55<03:09, 27.03s/it]

Tokenizing:   0%|          | 0/2386 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2386 [00:00<?, ? examples/s]

Predicting on Datasets:  45%|███████████████████████████████████████████████████████████████████▋                                                                                 | 5/11 [02:19<02:35, 25.91s/it]

Tokenizing:   0%|          | 0/2870 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/2870 [00:00<?, ? examples/s]

Predicting on Datasets:  55%|█████████████████████████████████████████████████████████████████████████████████▎                                                                   | 6/11 [03:12<02:56, 35.25s/it]

Tokenizing:   0%|          | 0/594 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/594 [00:00<?, ? examples/s]

Predicting on Datasets:  64%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 7/11 [04:06<02:45, 41.35s/it]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Predicting on Datasets:  73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                        | 8/11 [04:32<01:49, 36.57s/it]

Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Predicting on Datasets:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 9/11 [05:02<01:08, 34.44s/it]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Predicting on Datasets:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 10/11 [08:51<01:34, 94.47s/it]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [09:51<00:00, 53.81s/it]


{
    "blog_authorship_corpus": {
        "f1_human": 0.7663951993141878,
        "f1_ai": 0.5905334335086401,
        "f1_weighted": 0.6784643164114139,
        "accuracy": 0.7025109170305677,
        "roc_auc": 0.7025109170305676,
        "f1_human_median": 0.8231441048034934,
        "f1_ai_median": 0.8231441048034934,
        "f1_weighted_median": 0.8231441048034934,
        "accuracy_median": 0.8231441048034934,
        "roc_auc_median": 0.8231441048034934,
        "threshold_median": 0.1145414453929881,
        "ground_truth_human": 916.0,
        "ground_truth_ai": 916.0
    },
    "student_essays": {
        "f1_human": 0.6941830139471595,
        "f1_ai": 0.5249251365157653,
        "f1_weighted": 0.6095540752314623,
        "accuracy": 0.6278973509933775,
        "roc_auc": 0.6278973509933774,
        "f1_human_median": 0.6826710816777042,
        "f1_ai_median": 0.6826710816777042,
        "f1_weighted_median": 0.6826710816777042,
        "accuracy_median": 0.682671081677704

In [8]:
def f():
    """Fine-tune the RoBERTa detector per dataset and score each fine-tuned model.

    Every dataset is tokenized up front. For each one the detector is reset
    to the pretrained weights, trained for one epoch, evaluated on that same
    dataset's test split, and the scores are written to
    ``../logs/chatgpt-detector-roberta/<config>.json``.

    Returns:
        Mapping from configuration name to the result of
        ``run_detector_tokenized``. NOTE(review): that result is itself keyed
        by the configuration name, so the structure is nested twice --
        confirm whether downstream code relies on this.
    """
    detector = ChatGPTDetector(device="cuda:0")

    # Tokenize all datasets before any training starts.
    tokenized_items = []
    for name, splits in datasets.items():
        tokenized = splits.map(
            detector.tokenize,
            input_columns=["text"],
            batched=True,
            batch_size=1024,
            desc="Tokenizing",
        )
        tokenized_items.append((name, tokenized.sort("length")))

    results = {}
    log_dir = Path("../logs/chatgpt-detector-roberta/")
    progress = tqdm(tokenized_items, desc="Finetuning")
    for config, dataset in progress:
        progress.set_postfix_str(config)
        # Start every run from the pretrained checkpoint.
        detector.reset()

        detector.train(
            dataset,
            TrainingArguments(
                output_dir=f"../models/chatgpt-detector-roberta/{config}",
                seed=42,
                num_train_epochs=1,
                per_device_train_batch_size=15,
                per_device_eval_batch_size=30,
                eval_strategy="steps",
                eval_steps=50,
                save_strategy="epoch",
                learning_rate=1e-5,
            ),
        )

        results[config] = run_detector_tokenized(detector, {config: dataset})

        log_dir.mkdir(parents=True, exist_ok=True)
        with (log_dir / f"{config}.json").open("w") as fp:
            json.dump(results[config], fp, indent=4)

    del detector
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return results


scores_roberta_ft = f()
print(json.dumps(scores_roberta_ft, indent=4))

Tokenizing:   0%|          | 0/6406 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/914 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1832 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/24280 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/50734 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/7248 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/14496 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/11222 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/16538 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2364 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4726 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/12901 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/5264 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/752 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1506 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/10732 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/8344 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1192 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2386 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/15237 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/10042 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1434 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2870 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2497 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2078 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/298 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/594 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/10552 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/14078 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2012 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4024 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/16165 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/17154 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2450 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4902 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/7524 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/99412 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/14202 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/28404 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/87421 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/31372 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4484 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/8964 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/24224 [00:00<?, ? examples/s]

Finetuning:   0%|                                                                                                                                                 | 0/11 [00:00<?, ?it/s, blog_authorship_corpus]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.22345,0.936543
100,No log,0.196102,0.93326




Batching examples:   0%|          | 0/1832 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.02s/it]
Finetuning:   9%|█████████████▏                                                                                                                                   | 1/11 [01:10<11:47, 70.74s/it, student_essays]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.269518,0.916391
100,No log,0.090071,0.980822
150,No log,0.090734,0.980132
200,No log,0.055652,0.989514
250,No log,0.034602,0.993791
300,No log,0.007964,0.998206
350,No log,0.00804,0.998482
400,No log,0.054871,0.984685
450,No log,0.012667,0.996137
500,0.061500,0.048985,0.987583




Batching examples:   0%|          | 0/14496 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:55<00:00, 55.58s/it]
Finetuning:  18%|███████████████████████████▎                                                                                                                          | 2/11 [08:53<45:10, 301.21s/it, cnn_news]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.06712,0.983503
100,No log,0.044294,0.989425
150,No log,0.033669,0.989425
200,No log,0.034233,0.99154
250,No log,0.036499,0.99154




Batching examples:   0%|          | 0/4726 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:21<00:00, 21.30s/it]
Finetuning:  27%|██████████████████████████████████████▋                                                                                                       | 3/11 [11:23<30:56, 232.06s/it, euro_court_cases]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.001585,1.0




Batching examples:   0%|          | 0/1506 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.91s/it]
Finetuning:  36%|███████████████████████████████████████████████████▋                                                                                          | 4/11 [12:32<19:35, 167.96s/it, house_of_commons]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.099287,0.979027
100,No log,0.053376,0.984899




Batching examples:   0%|          | 0/2386 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:10<00:00, 10.36s/it]
Finetuning:  45%|██████████████████████████████████████████████████████████████████▎                                                                               | 5/11 [14:01<13:56, 139.41s/it, arxiv_papers]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.313431,0.849372
100,No log,0.091,0.983264
150,No log,0.057966,0.98954




Batching examples:   0%|          | 0/2870 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:15<00:00, 15.10s/it]
Finetuning:  55%|███████████████████████████████████████████████████████████████████████████████▋                                                                  | 6/11 [15:46<10:37, 127.53s/it, gutenberg_en]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss




Batching examples:   0%|          | 0/594 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.82s/it]
Finetuning:  64%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 7/11 [16:39<06:53, 103.47s/it, bundestag]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.457674,0.842445
100,No log,0.113712,0.969682
150,No log,0.185112,0.951292
200,No log,0.209413,0.938867




Batching examples:   0%|          | 0/4024 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:18<00:00, 18.03s/it]
Finetuning:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 8/11 [18:50<05:36, 112.07s/it, spiegel_articles]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.707816,0.673878
100,No log,0.645539,0.802449
150,No log,0.433282,0.888571
200,No log,0.270944,0.920408
250,No log,0.264392,0.922857




Batching examples:   0%|          | 0/4902 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:22<00:00, 22.13s/it]
Finetuning:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 9/11 [21:25<04:10, 125.43s/it, en]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.234703,0.923039
100,No log,0.109619,0.973807
150,No log,0.099607,0.977257
200,No log,0.0838,0.98127
250,No log,0.140027,0.971905
300,No log,0.043529,0.990353
350,No log,0.033531,0.992748
400,No log,0.048332,0.98972
450,No log,0.050249,0.987607
500,0.073300,0.057815,0.9881




Batching examples:   0%|          | 0/28404 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:57<00:00, 117.23s/it]
Finetuning:  91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 10/11 [41:18<07:35, 455.24s/it, de]Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Accuracy
50,No log,0.979816,0.691347
100,No log,0.291511,0.890946
150,No log,0.42448,0.875558
200,No log,0.34451,0.88314
250,No log,0.122489,0.960749
300,No log,0.340472,0.91793
350,No log,0.287332,0.902542
400,No log,0.429497,0.875112
450,No log,0.320671,0.911463
500,0.193800,0.307393,0.916592




Batching examples:   0%|          | 0/8964 [00:00<?, ? examples/s]

Predicting on Datasets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:40<00:00, 40.43s/it]
Finetuning: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [46:01<00:00, 251.02s/it, de]


{
    "blog_authorship_corpus": {
        "blog_authorship_corpus": {
            "f1_human": 0.9453758788534343,
            "f1_ai": 0.9443526170798898,
            "f1_weighted": 0.944864247966662,
            "accuracy": 0.9448689956331878,
            "roc_auc": 0.9448689956331878,
            "f1_human_median": 0.9421397379912664,
            "f1_ai_median": 0.9421397379912664,
            "f1_weighted_median": 0.9421397379912664,
            "accuracy_median": 0.9421397379912664,
            "roc_auc_median": 0.9421397379912664,
            "threshold_median": 0.4127888748756461,
            "ground_truth_human": 916.0,
            "ground_truth_ai": 916.0
        }
    },
    "student_essays": {
        "student_essays": {
            "f1_human": 0.9997930320800276,
            "f1_ai": 0.9997930606332345,
            "f1_weighted": 0.999793046356631,
            "accuracy": 0.9997930463576159,
            "roc_auc": 0.999793046357616,
            "f1_human_median": 0.999724061

### RADAR

In [None]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/radar/radar.py

import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class Radar:
    """Wrapper around the ``TrustSafeAI/RADAR-Vicuna-7B`` sequence classifier.

    Every score is the softmax probability of class index 0 (presumably the
    "AI-generated" class -- confirm against the model card).
    """

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load classifier and tokenizer; the model is put in eval mode on ``device``."""
        checkpoint = "TrustSafeAI/RADAR-Vicuna-7B"
        self.device = torch.device(device)
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(self.device)

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        """Tokenize ``texts`` without padding, truncated to 512 tokens."""
        options = {
            "padding": False,
            "truncation": True,
            "max_length": 512,
            "return_length": True,
        }
        return self.tokenizer(texts, **options)

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        """Pad ``inputs`` to a common length and return the class-0 probabilities."""
        batch = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        logits = self.model(**batch).logits
        class_zero = F.log_softmax(logits, -1)[:, 0]
        return class_zero.exp().tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        """Score a batch, forwarding only the columns the model consumes."""
        model_inputs = {key: inputs[key] for key in ("input_ids", "attention_mask")}
        return {"prediction": self.predict(model_inputs)}

    @torch.inference_mode()
    def inference(self, texts: list) -> list:
        """Return the class-0 probability for each raw text, one text at a time."""
        predictions = []
        for text in tqdm(texts):
            # inference_mode already disables autograd, so no extra
            # no_grad context is needed here.
            encoded = self.tokenizer(
                [text],
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}
            logits = self.model(**encoded).logits
            class_zero = F.log_softmax(logits, -1)[:, 0].exp().tolist()
            predictions.append(class_zero[0])
        return predictions

In [None]:
def f():
    """Score the RADAR detector on all datasets, then free GPU memory."""
    detector = Radar(device="cuda:3")
    results = run_detector(detector, datasets)
    # Drop the only reference to the detector before collecting garbage so
    # that its weights can be released.
    del detector
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return results


scores_radar = f()
scores_radar

### Binoculars

In [None]:
# Source: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/binoculars/utils/metrics.py

import numpy as np
import torch
import transformers

# Per-token losses are needed, so no reduction is applied.
ce_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
softmax_fn = torch.nn.Softmax(dim=-1)


def perplexity(
    encoding: transformers.BatchEncoding,
    logits: torch.Tensor,
    median: bool = False,
    temperature: float = 1.0,
):
    """Aggregate the next-token cross-entropy of every sequence in a batch.

    Despite the name, the value returned is the (mean or median) per-token
    cross-entropy; it is not exponentiated.

    Args:
        encoding: Provides ``input_ids`` and ``attention_mask``.
        logits: Model output for the same batch, vocabulary on the last axis.
        median: Aggregate with the median over unmasked tokens instead of
            the mask-weighted mean.
        temperature: Divisor applied to the logits before the loss.

    Returns:
        NumPy array with one value per sequence.
    """
    # Position t predicts token t + 1: drop the last logit and the first token.
    next_token_logits = logits[..., :-1, :].contiguous() / temperature
    targets = encoding.input_ids[..., 1:].contiguous()
    target_mask = encoding.attention_mask[..., 1:].contiguous()

    # CrossEntropyLoss expects the class axis in second position.
    token_ce = ce_loss_fn(next_token_logits.transpose(1, 2), targets)

    if median:
        # Masked positions become NaN so that nanmedian ignores them.
        visible_ce = token_ce.masked_fill(~target_mask.bool(), float("nan"))
        return np.nanmedian(visible_ce.cpu().float().numpy(), 1)

    mean_ce = (token_ce * target_mask).sum(1) / target_mask.sum(1)
    return mean_ce.to("cpu").float().numpy()


def entropy(
    p_logits: torch.Tensor,
    q_logits: torch.Tensor,
    encoding: transformers.BatchEncoding,
    pad_token_id: int,
    median: bool = False,
    sample_p: bool = False,
    temperature: float = 1.0,
):
    """Aggregate the per-token cross-entropy between two models' predictions.

    The softmax of ``p_logits`` is the target distribution and ``q_logits``
    are the predictions, so this is a cross-entropy between the two models
    rather than the entropy of a single one.

    Args:
        p_logits: Logits of the first model, vocabulary on the last axis.
        q_logits: Logits of the second model, same shape as ``p_logits``.
        encoding: Provides ``input_ids``, used to locate padding.
        pad_token_id: Token id marking padded positions, which are excluded.
        median: Aggregate with the median over non-pad tokens instead of the
            mean.
        sample_p: Draw one token id per position from the ``p`` distribution
            and use it as a hard target instead of the full distribution.
        temperature: Divisor applied to both sets of logits.

    Returns:
        NumPy array with one value per sequence.
    """
    vocab_size = p_logits.shape[-1]
    seq_len = q_logits.shape[-2]

    # Flatten batch and sequence axes so every row is one token position.
    p_targets = softmax_fn(p_logits / temperature).view(-1, vocab_size)
    if sample_p:
        p_targets = torch.multinomial(
            p_targets, replacement=True, num_samples=1
        ).view(-1)

    q_flat = (q_logits / temperature).view(-1, vocab_size)
    token_ce = ce_loss_fn(input=q_flat, target=p_targets).view(-1, seq_len)
    keep = (encoding.input_ids != pad_token_id).type(torch.uint8)

    if median:
        # Padded positions become NaN so that nanmedian ignores them.
        visible_ce = token_ce.masked_fill(~keep.bool(), float("nan"))
        return np.nanmedian(visible_ce.cpu().float().numpy(), 1)

    mean_ce = (token_ce * keep).sum(1) / keep.sum(1)
    return mean_ce.to("cpu").float().numpy()

In [None]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/binoculars/binoculars.py

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Inference only: turn gradient tracking off for the torch operations that follow.
torch.set_grad_enabled(False)

# Reference decision threshold for Binoculars scores. The `Binoculars` class
# below returns raw scores and does not apply it.
GLOBAL_BINOCULARS_THRESHOLD = (
    0.9015310749276843  # selected using Falcon-7B and Falcon-7B-Instruct at bfloat16
)
# Devices for the observer (DEVICE_1) and performer (DEVICE_2) models.
# NOTE(review): the indices are hard-coded, so CUDA devices 2 and 3 must exist;
# the commented-out lines are a fallback that degrades to fewer GPUs or to CPU.
# DEVICE_1 = "cuda:0" if torch.cuda.is_available() else "cpu"
# DEVICE_2 = "cuda:1" if torch.cuda.device_count() > 1 else DEVICE_1
DEVICE_1 = "cuda:2"
DEVICE_2 = "cuda:3"


class Binoculars:
    """Two-model detector that scores text as ``perplexity / cross-entropy``.

    An "observer" and a "performer" causal language model are run on the same
    token batch. The score of a sequence is the performer's perplexity term
    (``perplexity``) divided by the observer/performer cross term
    (``entropy``). Raw scores are returned; no threshold is applied here.

    The observer is placed on ``DEVICE_1`` and the performer on ``DEVICE_2``.
    Only the observer's tokenizer is loaded.
    NOTE(review): both models are assumed to share that tokenizer; this is not
    verified here - confirm when changing the checkpoints.
    """

    def __init__(
        self,
        observer_name_or_path: str = "tiiuae/falcon-7b",
        performer_name_or_path: str = "tiiuae/falcon-7b-instruct",
        use_bfloat16: bool = True,
        max_token_observed: int = 512,
    ) -> None:
        """Load both models and the observer's tokenizer.

        Args:
            observer_name_or_path: Checkpoint for the observer model; also the
                source of the tokenizer.
            performer_name_or_path: Checkpoint for the performer model.
            use_bfloat16: Load weights as bfloat16 when true, float32 otherwise.
            max_token_observed: Truncation length used by ``tokenize``.
        """
        dtype = torch.bfloat16 if use_bfloat16 else torch.float32

        self.observer_model = AutoModelForCausalLM.from_pretrained(
            observer_name_or_path,
            device_map={"": DEVICE_1},
            trust_remote_code=True,
            torch_dtype=dtype,
        )
        self.performer_model = AutoModelForCausalLM.from_pretrained(
            performer_name_or_path,
            device_map={"": DEVICE_2},
            trust_remote_code=True,
            torch_dtype=dtype,
        )

        self.observer_model.eval()
        self.performer_model.eval()

        self.tokenizer = AutoTokenizer.from_pretrained(observer_name_or_path)
        if not self.tokenizer.pad_token:
            # No pad token configured: reuse EOS so batches can be padded.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.max_token_observed = max_token_observed

    def tokenize(self, texts: list[str]) -> transformers.BatchEncoding:
        """Tokenize ``texts`` without padding, truncated to ``max_token_observed``.

        Padding is deferred to ``predict``, which pads each batch on its own.
        """
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=self.max_token_observed,
            return_length=True,
            return_token_type_ids=False,
        )

    @torch.inference_mode()
    def _get_logits(
        self, encodings: transformers.BatchEncoding
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Run both models on ``encodings`` and return ``(observer, performer)`` logits.

        Each logits tensor stays on the device of the model that produced it.
        """
        observer_logits = self.observer_model(
            **encodings.to(self.observer_model.device)
        ).logits
        performer_logits = self.performer_model(
            **encodings.to(self.performer_model.device)
        ).logits
        # Synchronize the devices the models actually run on; a bare
        # torch.cuda.synchronize() only waits on the *current* CUDA device.
        for model in (self.observer_model, self.performer_model):
            if model.device.type == "cuda":
                torch.cuda.synchronize(model.device)
        return observer_logits, performer_logits

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        """Pad ``inputs`` into one batch and return one Binoculars score per sequence.

        Args:
            inputs: Un-padded features accepted by ``tokenizer.pad``
                (``process`` passes ``input_ids`` and ``attention_mask``).
        """
        encodings = self.tokenizer.pad(inputs, return_tensors="pt")
        observer_logits, performer_logits = self._get_logits(encodings)

        # Place the encodings next to the performer logits explicitly instead of
        # relying on the device `_get_logits` happened to leave them on.
        ppl = perplexity(encodings.to(performer_logits.device), performer_logits)

        # The cross term is computed with every operand on the observer's device.
        observer_device = self.observer_model.device
        x_ppl = entropy(
            observer_logits.to(observer_device),
            performer_logits.to(observer_device),
            encodings.to(observer_device),
            self.tokenizer.pad_token_id,
        )
        binoculars_scores = ppl / x_ppl
        return binoculars_scores.tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        """Score a batch, forwarding only ``input_ids`` and ``attention_mask``.

        Returns:
            ``{"prediction": scores}`` with one score per sequence in the batch.
        """
        return {
            "prediction": self.predict(
                {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
            )
        }

In [None]:
from datasets import disable_caching

# Switch off the `datasets` library's caching before the detector runs below.
disable_caching()

In [None]:
def f():
    """Run the Binoculars detector over ``datasets`` and release GPU memory.

    The detector is created inline, so this function holds no reference to it
    once ``run_detector`` returns.

    Returns:
        Whatever ``run_detector`` returns for the Binoculars run.
    """
    scores = run_detector(Binoculars(), datasets, batch_size=16)
    # Wait for pending CUDA work, then hand cached blocks back to the driver.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    # Collect unreachable Python objects and clear the cache once more.
    gc.collect()
    torch.cuda.empty_cache()
    return scores


# Run the Binoculars evaluation; the bare name on the last line makes the
# notebook display the result as the cell output.
scores_binoculars = f()
scores_binoculars

### E5-Small LoRA


In [None]:
# Modified from: RAID, Dugan et al. 2024
# > https://github.com/liamdugan/raid/blob/main/detectors/models/radar/radar.py

import torch
import torch.nn.functional as F
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers.tokenization_utils_base import BatchEncoding


class E5Lora:
    """Sequence-classification detector backed by the
    ``MayZhou/e5-small-lora-ai-generated-detector`` checkpoint.

    ``tokenize``/``process`` work on batches (padding is deferred to
    ``predict``), while ``inference`` scores raw strings one at a time.
    Every prediction is the softmax probability of class index 0.

    NOTE(review): presumably index 0 is the "AI-generated" class - confirm
    against the checkpoint's label mapping.
    """

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load model and tokenizer and move the model to ``device``."""
        checkpoint = "MayZhou/e5-small-lora-ai-generated-detector"
        self.device = torch.device(device)
        self.model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(self.device)

    def tokenize(self, texts: list[str]) -> BatchEncoding:
        """Tokenize ``texts`` without padding, truncated to 512 tokens."""
        return self.tokenizer(
            texts,
            padding=False,
            truncation=True,
            max_length=512,
            return_length=True,
        )

    @torch.inference_mode()
    def predict(self, inputs: dict) -> list[float]:
        """Pad ``inputs`` into one batch and return one class-0 probability per row."""
        batch = self.tokenizer.pad(inputs, return_tensors="pt").to(self.device)
        logits = self.model(**batch).logits
        # exp(log_softmax) yields the softmax probability; column 0 is kept.
        return F.log_softmax(logits, -1)[:, 0].exp().tolist()

    def process(self, inputs: dict) -> dict[str, list[float]]:
        """Score a batch, forwarding only ``input_ids`` and ``attention_mask``.

        Returns:
            ``{"prediction": probabilities}`` with one value per sequence.
        """
        model_inputs = {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
        }
        return {"prediction": self.predict(model_inputs)}

    @torch.inference_mode()
    def inference(self, texts: list) -> list:
        """Score raw ``texts`` one by one and return their class-0 probabilities."""
        predictions = []
        for text in tqdm(texts):
            with torch.no_grad():
                encoded = self.tokenizer(
                    [text],
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors="pt",
                )
                on_device = {
                    name: tensor.to(self.device) for name, tensor in encoded.items()
                }
                logits = self.model(**on_device).logits
                class0_probs = F.log_softmax(logits, -1)[:, 0].exp().tolist()
            # Single-text batch: keep its only entry.
            predictions.append(class0_probs[0])
        return predictions

In [None]:
import json


def f():
    """Run the E5-Small LoRA detector (on ``cuda:0``) over ``datasets``.

    The detector is created inline, so this function holds no reference to it
    once ``run_detector`` returns; cached GPU memory is released before the
    scores are handed back.

    Returns:
        Whatever ``run_detector`` returns for the E5-Small LoRA run.
    """
    scores = run_detector(E5Lora(device="cuda:0"), datasets)
    # Collect unreachable Python objects, wait for pending CUDA work, then
    # return cached blocks to the driver.
    gc.collect()
    torch.cuda.synchronize()
    torch.cuda.empty_cache()
    return scores


# Run the E5-Small LoRA evaluation.
scores_e5 = f()

# Persist the scores as JSON; the aggregation cell further down reads every
# *.json file in ../logs/ and uses the file stem as the model name.
with open("../logs/e5-small-lora.json", "w") as fp:
    json.dump(scores_e5, fp, indent=4)

# Echo the same JSON in the cell output.
print(json.dumps(scores_e5, indent=4))

In [None]:
# Dataset config name -> display label used in the result tables
# (the $_{..}$ suffixes are LaTeX subscripts marking the language).
name_map = {
    "blog_authorship_corpus": "Web Blogs",
    "student_essays": "Essays",
    "cnn_news": "CNN",
    "euro_court_cases": "ECHR",
    "house_of_commons": "HoC",
    "arxiv_papers": "arXiv",
    "gutenberg_en": "Gutenberg$_{en}$",
    "bundestag": "Bundestag$_{de}$",
    "spiegel_articles": "Spiegel$_{de}$",
    "gutenberg_de": "Gutenberg$_{de}$",
    "en": "All$_{en}$",
    "de": "All$_{de}$",
}

# Display labels in table-row order; this is exactly the insertion order of
# `name_map`, so the list is derived from it instead of being spelled out twice.
domains = list(name_map.values())

In [None]:
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd

# Merge the metrics of every logged model into one table.
# `results` maps a display label (a value of `name_map`) to a flat dict of
# "<model>_<metric>" columns.
results = defaultdict(dict)
for path in Path("../logs/").iterdir():
    # Only entries with a .json suffix are read; anything else in ../logs/
    # (e.g. the chatgpt-detector-roberta/ directory used below) is skipped.
    if path.suffix == ".json":
        with path.open("r") as fp:
            data = json.load(fp)
        # The file stem doubles as the model name in the column labels.
        model_name = path.stem
        for domain, scores in data.items():
            # A domain key that is missing from `name_map` raises KeyError here.
            results[name_map[domain]].update(
                {
                    model_name + "_f1": scores["f1"],
                    model_name + "_accuracy": scores["accuracy"],
                    model_name + "_auroc": scores["auroc"],
                }
            )

# The fine-tuned RoBERTa results are stored as one file per domain, each keyed
# by the domain name.
# NOTE(review): this cell reads the F1 score from "test_f1_" while the next cell
# reads "f1" from the same files - confirm which key the logs actually contain.
# NOTE(review): the loop opens a file for every key of `name_map`, so a log
# must exist for each of them (including "gutenberg_de") - confirm.
model_name = "roberta-ft"
for domain, name in name_map.items():
    path = Path("../logs/chatgpt-detector-roberta/") / (domain + ".json")
    with (path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1": data[domain]["test_f1_"],
            model_name + "_accuracy": data[domain]["accuracy"],
            model_name + "_auroc": data[domain]["auroc"],
        }
    )

# One row per domain, ordered by position in `domains` rather than alphabetically.
metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
# Emit the table as LaTeX with three decimals, then display the DataFrame itself.
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df

In [None]:
# Rebuild the table with only the fine-tuned RoBERTa columns (F1 and AUROC).
# Reuses `json`, `Path`, `defaultdict` and `pd` imported by the previous cell.
results = defaultdict(dict)

# One log file per domain, each keyed by the domain name. The loop opens a file
# for every key of `name_map`.
model_name = "roberta-ft"
for domain, name in name_map.items():
    path = Path("../logs/chatgpt-detector-roberta/") / (domain + ".json")
    with (path).open("r") as fp:
        data = json.load(fp)
    results[name].update(
        {
            model_name + "_f1": data[domain]["f1"],
            # Accuracy is left out of this table.
            # model_name + "_accuracy": data[domain]["accuracy"],
            model_name + "_auroc": data[domain]["auroc"],
        }
    )

# One row per domain, ordered by position in `domains` rather than alphabetically.
metric_df = (
    pd.DataFrame([{"domain": domain} | dd for domain, dd in results.items()])
    .set_index("domain")
    .sort_index(key=lambda x: list(map(domains.index, x)))
)
# Emit the table as LaTeX with three decimals, then display the DataFrame itself.
print(metric_df.to_latex(float_format="%.3f", index=True))
metric_df