# OpenGhostbuster

## Baselines

In [1]:
import gc
import json
from pathlib import Path

import numpy as np
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
import pandas as pd

from luminar.utils.data import batched_dynamic, transpose_batch


In [2]:
# Root directory for prediction output; each baseline cell below creates its
# own sub-directory underneath it and writes one CSV per corpus there.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
preds_path = Path("/storage/projects/stoeckel/GhostWriter/predictions")

In [3]:
# Load only the "test" split of TheItCrOw/GhostWriter; every baseline cell
# below tokenizes and scores this same `dataset` object.
dataset: Dataset = load_dataset("TheItCrOw/GhostWriter", split="test")

### Binoculars

In [4]:
from luminar.baselines.binoculars import GLOBAL_BINOCULARS_THRESHOLD, Binoculars

In [5]:
%%script echo skipping

# Score the test split with the Binoculars baseline (constructed from
# tiiuae/falcon-7b and tiiuae/falcon-7b-instruct) and store the per-sample
# scores and thresholded labels as CSV.
pred_path_model = preds_path / "binoculars_falcon-7b"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = Binoculars("tiiuae/falcon-7b", "tiiuae/falcon-7b-instruct")

tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# Order samples by the "length" column (presumably added by
# `detector.tokenize` — confirm) so each batch of 16 holds similar lengths.
tok_dataset = tok_dataset.sort("length")

# Collect one frame per batch and concatenate once after the loop:
# re-concatenating the growing frame on every iteration copies all earlier
# rows again, which is quadratic in the number of batches.
batch_frames: list[pd.DataFrame] = []
for batch in tqdm(  # type: ignore
    tok_dataset.batch(16),
    desc=f"Processing {corpus}",
    position=1,
):
    batch: dict[str, list]
    preds = detector.process(batch)
    batch["y_score"] = preds["y_scores"]
    # Label 1 when the score falls below the global Binoculars threshold.
    batch["y_pred"] = [
        int(xppl < GLOBAL_BINOCULARS_THRESHOLD) for xppl in preds["y_scores"]
    ]
    batch_frames.append(pd.DataFrame.from_dict(batch))

predictions = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

# Keep only metadata and scores; raw text and token tensors are not persisted.
predictions = predictions.drop(columns=["text", "input_ids", "attention_mask"])
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the models is reported instead of
# being silently swallowed, and the references are released either way.
try:
    detector.observer_model.to("cpu")
    detector.performer_model.to("cpu")
except Exception as exc:
    print(f"Could not move the Binoculars models to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()

skipping


### Fast-DetectGPT

In [6]:
from luminar.baselines.fast_detectgpt import FastDetectGPT, FastDetectGPTwithScoring

In [7]:
%%script echo skipping

# Score the test split with FastDetectGPTwithScoring, constructed from
# EleutherAI/gpt-j-6B and EleutherAI/gpt-neo-2.7B (in that argument order),
# and store the per-sample scores as CSV.
pred_path_model = preds_path / "fast_detectgpt-gpt-j-6B_gpt-neo-2.7B"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = FastDetectGPTwithScoring("EleutherAI/gpt-j-6B", "EleutherAI/gpt-neo-2.7B")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# Order samples by the "length" column (presumably added by
# `detector.tokenize` — confirm) so each batch of 16 holds similar lengths.
tok_dataset = tok_dataset.sort("length")

# Collect one frame per batch and concatenate once after the loop:
# re-concatenating the growing frame on every iteration copies all earlier
# rows again, which is quadratic in the number of batches.
batch_frames: list[pd.DataFrame] = []
for batch in tqdm(  # type: ignore
    tok_dataset.batch(16),
    desc=f"Processing {corpus}",
    position=1,
):
    batch: dict[str, list]
    preds = detector.process(batch)
    batch["y_score"] = preds["y_scores"]
    batch_frames.append(pd.DataFrame.from_dict(batch))

predictions = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

# Keep only metadata and scores; raw text and token tensors are not persisted.
predictions = predictions.drop(columns=["text", "input_ids", "attention_mask"])
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the detector is reported instead
# of being silently swallowed, and the reference is released either way.
try:
    detector.to("cpu")
except Exception as exc:
    print(f"Could not move the Fast-DetectGPT detector to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()

skipping


In [8]:
%%script echo skipping

# Score the test split with FastDetectGPTwithScoring, constructed from
# EleutherAI/gpt-neo-2.7B and EleutherAI/gpt-j-6B (the reverse argument order
# of the previous cell), and store the per-sample scores as CSV.
pred_path_model = preds_path / "fast_detectgpt-gpt-neo-2.7B_gpt-j-6B"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = FastDetectGPTwithScoring("EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# Order samples by the "length" column (presumably added by
# `detector.tokenize` — confirm) so each batch of 16 holds similar lengths.
tok_dataset = tok_dataset.sort("length")

# Collect one frame per batch and concatenate once after the loop:
# re-concatenating the growing frame on every iteration copies all earlier
# rows again, which is quadratic in the number of batches.
batch_frames: list[pd.DataFrame] = []
for batch in tqdm(  # type: ignore
    tok_dataset.batch(16),
    desc=f"Processing {corpus}",
    position=1,
):
    batch: dict[str, list]
    preds = detector.process(batch)
    batch["y_score"] = preds["y_scores"]
    batch_frames.append(pd.DataFrame.from_dict(batch))

predictions = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

# Keep only metadata and scores; raw text and token tensors are not persisted.
predictions = predictions.drop(columns=["text", "input_ids", "attention_mask"])
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the detector is reported instead
# of being silently swallowed, and the reference is released either way.
try:
    detector.to("cpu")
except Exception as exc:
    print(f"Could not move the Fast-DetectGPT detector to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()

skipping


In [9]:
%%script echo skipping

# Score the test split with FastDetectGPT constructed from tiiuae/falcon-7b
# and store the per-sample scores as CSV.
pred_path_model = preds_path / "fast_detectgpt-falcon-7b"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = FastDetectGPT("tiiuae/falcon-7b")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# Order samples by the "length" column (presumably added by
# `detector.tokenize` — confirm) so each batch of 16 holds similar lengths.
tok_dataset = tok_dataset.sort("length")

# Collect one frame per batch and concatenate once after the loop:
# re-concatenating the growing frame on every iteration copies all earlier
# rows again, which is quadratic in the number of batches.
batch_frames: list[pd.DataFrame] = []
for batch in tqdm(  # type: ignore
    tok_dataset.batch(16),
    desc=f"Processing {corpus}",
    position=1,
):
    batch: dict[str, list]
    preds = detector.process(batch)
    batch["y_score"] = preds["y_scores"]
    batch_frames.append(pd.DataFrame.from_dict(batch))

predictions = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

# Keep only metadata and scores; raw text and token tensors are not persisted.
predictions = predictions.drop(columns=["text", "input_ids", "attention_mask"])
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the detector is reported instead
# of being silently swallowed, and the reference is released either way.
try:
    detector.to("cpu")
except Exception as exc:
    print(f"Could not move the Fast-DetectGPT detector to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()

skipping


### DetectLLM-LRR

In [10]:
from luminar.baselines.detect_llm import DetectLLM_LRR

In [11]:
# %%script echo skipping

# Set-up for the DetectLLM run: output directory, detector, and the tokenized,
# length-sorted dataset. The names bound here (`corpus`, `pred_path_model`,
# `detector`, `tok_dataset`) are consumed by the processing cell that follows.
corpus = "test"
pred_path_model = preds_path / "detectllm_llr-falcon-7b"
pred_path_model.mkdir(exist_ok=True, parents=True)

detector = DetectLLM_LRR("tiiuae/falcon-7b", device="cuda")

# Tokenize the "text" column in chunks of 1024 rows, then order the result by
# its "length" column in one chained expression.
tok_dataset = dataset.map(
    detector.tokenize,
    batched=True,
    batch_size=1024,
    input_columns=["text"],
    desc="Tokenizing",
).sort("length")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Resume an interrupted DetectLLM run: score the rows of the length-sorted
# dataset after the resume offset, merge them with the partial results already
# on disk, and write the complete CSV.

# Number of leading rows of `tok_dataset` that are skipped here.
# NOTE(review): presumably the rows already stored in the `.csv.part` file —
# a mismatch is reported below; confirm before trusting the merged output.
RESUME_OFFSET = 70616

# Load the partial results *before* the expensive inference loop so that a
# missing or unreadable file fails immediately rather than after processing.
old_predictions = pd.read_csv(str(pred_path_model / f"{corpus}.csv.part"))
# A CSV written with pandas' default `to_csv` carries its index as an unnamed
# first column, which `read_csv` surfaces as "Unnamed: 0"; drop it if present
# so it does not end up as a half-empty column in the merged output.
old_predictions = old_predictions.drop(columns=["Unnamed: 0"], errors="ignore")
if len(old_predictions) != RESUME_OFFSET:
    print(
        f"Warning: partial file holds {len(old_predictions)} rows, "
        f"but {RESUME_OFFSET} rows are being skipped"
    )

# Collect one frame per batch and concatenate once after the loop instead of
# re-copying the growing frame on every iteration.
batch_frames: list[pd.DataFrame] = []
with tqdm(  # type: ignore
    tok_dataset.skip(RESUME_OFFSET),
    desc=f"Processing {corpus}",
    position=1,
) as tq:
    # 8192 is the budget handed to `batched_dynamic` together with the
    # "length" key — presumably a cap on the summed/padded length per batch;
    # confirm against luminar.utils.data.
    for batch in batched_dynamic(tq, 8192, key="length"):  # type: ignore
        batch = transpose_batch(batch)
        preds = detector.process(batch)
        batch["y_score"] = preds["y_scores"]
        batch_frames.append(pd.DataFrame.from_dict(batch))

        tq.set_postfix(
            {
                "batch_size": len(batch["input_ids"]),
                "effective_length": max(batch["length"]) * len(batch["input_ids"]),
            }
        )

# `ignore_index=True` gives the merged frame one continuous index instead of
# repeating the labels of the old and new parts. `errors="ignore"` keeps the
# merge working when no new batches were produced and the raw columns are
# therefore absent.
predictions = pd.concat([old_predictions, *batch_frames], ignore_index=True).drop(
    columns=["text", "input_ids", "attention_mask"], errors="ignore"
)
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the detector is reported instead
# of being silently swallowed, and the reference is released either way.
try:
    detector.to("cpu")
except Exception as exc:
    print(f"Could not move the DetectLLM detector to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()

Processing test:   0%|          | 0/38680 [00:00<?, ?it/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### RADAR

In [None]:
from luminar.baselines.radar import Radar

In [None]:
%%script echo skipping

# Score the test split with the RADAR baseline and store the per-sample scores
# as CSV.
pred_path_model = preds_path / "radar"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = Radar(device="cuda")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# Order samples by the "length" column (presumably added by
# `detector.tokenize` — confirm) so each batch of 16 holds similar lengths.
tok_dataset = tok_dataset.sort("length")

# Collect one frame per batch and concatenate once after the loop:
# re-concatenating the growing frame on every iteration copies all earlier
# rows again, which is quadratic in the number of batches.
batch_frames: list[pd.DataFrame] = []
for batch in tqdm(  # type: ignore
    tok_dataset.batch(16),
    desc=f"Processing {corpus}",
    position=1,
):
    batch: dict[str, list]
    preds = detector.process(batch)
    batch["y_score"] = preds["y_scores"]
    batch_frames.append(pd.DataFrame.from_dict(batch))

predictions = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

# Keep only metadata and scores; raw text and token tensors are not persisted.
predictions = predictions.drop(columns=["text", "input_ids", "attention_mask"])
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort GPU cleanup: a failure to move the detector is reported instead
# of being silently swallowed, and the reference is released either way.
try:
    detector.to("cpu")
except Exception as exc:
    print(f"Could not move the RADAR detector to CPU: {exc!r}")
finally:
    del detector
    gc.collect()
    torch.cuda.empty_cache()