# OpenGhostbuster

## Baselines

In [None]:
import gc
import json
from pathlib import Path

import numpy as np
import torch
from datasets import Dataset, load_dataset
from tqdm.auto import tqdm
import pandas as pd

from luminar.utils.data import batched_dynamic, transpose_batch

In [None]:
# Root directory for stored predictions; each baseline cell below creates its
# own subdirectory ("radar", "e5_lora") underneath it.
preds_path = Path("/storage/projects/stoeckel/GhostWriter/predictions")

In [None]:
# Test split of the "TheItCrOw/GhostWriter" dataset; the baseline cells below
# tokenize its "text" column.
dataset: Dataset = load_dataset("TheItCrOw/GhostWriter", split="test")

### RADAR

In [None]:
from luminar.baselines.radar import Radar

In [None]:
# %%script echo skipping

# Score the test split with the RADAR baseline and write the per-sample
# scores to <preds_path>/radar/<corpus>.csv.
import warnings

pred_path_model = preds_path / "radar"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = Radar(device="cuda:1")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# NOTE(review): "length" is presumably added by `detector.tokenize` - confirm.
# Sorting by it places samples of similar length next to each other before
# they are grouped into batches.
tok_dataset = tok_dataset.sort("length")

# Bulky per-sample columns that are not part of the stored predictions.
# Dropping them per batch keeps token ids and raw text from piling up in
# memory for the whole run.
dropped_columns = ["text", "input_ids", "attention_mask"]

# Collect one frame per batch and concatenate once after the loop: calling
# pd.concat on a growing DataFrame inside the loop re-copies all previously
# accumulated rows on every iteration.
frames = []
with tqdm(  # type: ignore
    tok_dataset,
    desc=f"Processing {corpus}",
    position=1,
) as tq:
    # NOTE(review): 512 * 128 looks like a per-batch budget measured via the
    # "length" key - confirm against `batched_dynamic`.
    for batch in batched_dynamic(tq, 512 * 128, key="length"):  # type: ignore
        batch = transpose_batch(batch)
        preds = detector.process(batch)
        batch["y_score"] = preds["y_scores"]
        frames.append(pd.DataFrame.from_dict(batch).drop(columns=dropped_columns))

        tq.set_postfix(
            {
                "batch_size": len(batch["input_ids"]),
                "effective_length": max(batch["length"]) * len(batch["input_ids"]),
            }
        )

# pd.concat raises on an empty list, so fall back to an empty frame when no
# batch was produced.
predictions = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort release of the detector and cached CUDA memory. The predictions
# are already on disk, so failures are reported as warnings instead of
# aborting the cell, and a failing `.to("cpu")` no longer prevents the
# reference from being dropped.
try:
    detector.to("cpu")
except Exception as exc:
    warnings.warn(f"Could not move the RADAR detector to the CPU: {exc!r}")
del detector
gc.collect()
try:
    torch.cuda.empty_cache()
except Exception as exc:
    warnings.warn(f"Could not empty the CUDA cache: {exc!r}")

In [None]:
from luminar.baselines.trainable.e5_lora import E5Lora

In [None]:
%%script echo skipping

# Score the test split with the E5-LoRA baseline and write the per-sample
# scores to <preds_path>/e5_lora/<corpus>.csv.
import warnings

pred_path_model = preds_path / "e5_lora"
pred_path_model.mkdir(parents=True, exist_ok=True)
corpus = "test"

detector = E5Lora(device="cuda:1")
tok_dataset = dataset.map(
    detector.tokenize,
    input_columns=["text"],
    batched=True,
    batch_size=1024,
    desc="Tokenizing",
)
# NOTE(review): "length" is presumably added by `detector.tokenize` - confirm.
# Sorting by it places samples of similar length next to each other before
# they are grouped into batches.
tok_dataset = tok_dataset.sort("length")

# Bulky per-sample columns that are not part of the stored predictions.
# Dropping them per batch keeps token ids and raw text from piling up in
# memory for the whole run.
dropped_columns = ["text", "input_ids", "attention_mask"]

# Collect one frame per batch and concatenate once after the loop: calling
# pd.concat on a growing DataFrame inside the loop re-copies all previously
# accumulated rows on every iteration.
frames = []
with tqdm(  # type: ignore
    tok_dataset,
    desc=f"Processing {corpus}",
    position=1,
) as tq:
    # NOTE(review): 1024 * 32 looks like a per-batch budget measured via the
    # "length" key - confirm against `batched_dynamic`.
    for batch in batched_dynamic(tq, 1024 * 32, key="length"):  # type: ignore
        batch = transpose_batch(batch)
        preds = detector.process(batch)
        batch["y_score"] = preds["y_scores"]
        frames.append(pd.DataFrame.from_dict(batch).drop(columns=dropped_columns))

        tq.set_postfix(
            {
                "batch_size": len(batch["input_ids"]),
                "effective_length": max(batch["length"]) * len(batch["input_ids"]),
            }
        )

# pd.concat raises on an empty list, so fall back to an empty frame when no
# batch was produced.
predictions = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
predictions.to_csv(str(pred_path_model / f"{corpus}.csv"))

# Best-effort release of the detector and cached CUDA memory. The predictions
# are already on disk, so failures are reported as warnings instead of
# aborting the cell, and a failing `.to("cpu")` no longer prevents the
# reference from being dropped.
try:
    detector.to("cpu")
except Exception as exc:
    warnings.warn(f"Could not move the E5-LoRA detector to the CPU: {exc!r}")
del detector
gc.collect()
try:
    torch.cuda.empty_cache()
except Exception as exc:
    warnings.warn(f"Could not empty the CUDA cache: {exc!r}")