# Luminar

## Minimal Code

In [1]:
from typing import Iterable, NamedTuple

import torch
from datasets import Dataset
from torch import nn
from torch.utils.data import DataLoader, Subset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

### Luminar Encoder

1. Pre-Process Inputs: tokenize and pass through LLM, recording hidden states
2. Calculate _Intermediate Likelihoods_: pass each hidden state through the model's LM head

In [None]:
class LuminarEncoder:
    """Encode texts as "intermediate likelihood" features of a causal LM.

    Each text is tokenized and passed through the language model while all
    hidden states are recorded. The model's LM head is then applied to every
    recorded hidden state, and the probability assigned to the actual next
    token is kept. The result is one ``(feature_dim, num_hidden_states)``
    tensor per text.

    Args:
        feature_dim: Number of token positions per feature tensor; also used
            as the tokenizer's ``max_length``.
        model_name_or_path: Identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Device the model and the intermediate computations run on.

    Raises:
        ValueError: If no ``lm_head`` can be found on the loaded model.
    """

    def __init__(
        self,
        feature_dim: int = 256,
        model_name_or_path: str = "gpt2",
        device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        self.feature_dim = feature_dim
        self.device = torch.device(device)

        self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path
        )
        # Fall back to the EOS token when the tokenizer defines no pad token.
        if getattr(self.tokenizer, "pad_token", None) is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.pad_token_id = self.tokenizer.pad_token_id

        self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
            model_name_or_path
        )
        self.model = self.model.to(self.device)

        self.model_lm_head: nn.Linear = self._find_lm_head(self.model)

    @staticmethod
    def _find_lm_head(model: nn.Module) -> nn.Module:
        """Return ``model.lm_head`` or ``model.model.lm_head``, whichever exists.

        Uses ``getattr`` with defaults so that a model without a ``.model``
        attribute ends in the ``ValueError`` below, not an ``AttributeError``.
        """
        for candidate in (model, getattr(model, "model", None)):
            head = getattr(candidate, "lm_head", None)
            if head is not None:
                return head
        raise ValueError("Could not find lm_head in model")

    def __call__(self, batch: dict[str, list[str]]) -> dict[str, list[torch.Tensor]]:
        """Encode ``batch["text"]`` and return the result under ``"features"``."""
        return {"features": self.process(batch["text"])}

    def process(self, batch: list[str]) -> list[torch.Tensor]:
        """Tokenize a batch of texts and compute one feature tensor per text.

        Texts are padded to a common length and truncated to ``feature_dim``
        tokens. Because the first token has no prediction target, at most
        ``feature_dim - 1`` positions carry likelihoods; the rest are zeros.
        """
        encoding = self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.feature_dim,
            return_tensors="pt",
        )
        batch_hidden_states = self.forward(encoding.input_ids, encoding.attention_mask)

        return [
            self.compute_intermediate_likelihoods(input_ids, hidden_states)
            for input_ids, hidden_states in zip(
                encoding.input_ids, batch_hidden_states
            )
        ]

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Iterable[tuple[torch.Tensor, ...]]:
        """Run the model and regroup its hidden states per input sequence.

        Returns:
            An iterator yielding, for every sequence in the batch, a tuple
            with that sequence's slice of each entry in
            ``outputs.hidden_states``.
        """
        outputs = self.model(
            input_ids=input_ids.to(self.device),
            attention_mask=attention_mask.to(self.device),
            output_hidden_states=True,
        )

        # unpack hidden states to get one list of tensors per input sequence,
        # instead of one hidden state per layer in the model
        return zip(*outputs.hidden_states)  # type: ignore

    @torch.no_grad()
    def compute_intermediate_likelihoods(
        self,
        input_ids: torch.Tensor,
        hidden_states: tuple[torch.Tensor],
    ) -> torch.Tensor:
        """Compute next-token likelihoods from every hidden state of one text.

        Args:
            input_ids: 1D tensor of token ids for a single sequence.
            hidden_states: One tensor per recorded hidden state, each indexed
                by token position along its first dimension.

        Returns:
            Tensor of shape ``(feature_dim, len(hidden_states))``. Row ``i``
            holds the probability of token ``i + 1`` given position ``i``;
            rows past the sequence end are zero.
        """
        # The hidden state at position i predicts the token at position i + 1.
        labels = input_ids[1:].view(-1, 1)

        # Number of non-pad targets, as a plain int so it can be used as a
        # slice bound and tensor size.
        seq_length = min(int(labels.ne(self.pad_token_id).sum()), self.feature_dim)
        labels = labels[:seq_length].to(self.device)

        # zeros used to pad sequences shorter than the required feature_dim
        padding = torch.zeros(self.feature_dim - seq_length)

        intermediate_likelihoods = []
        for hs in hidden_states:
            il = (
                # get layer logits
                self.model_lm_head(hs[:seq_length].to(self.device))
                # calculate likelihoods
                .softmax(-1)
                # get likelihoods of input tokens
                .gather(-1, labels)
                .squeeze(-1)
                .cpu()
            )

            if seq_length < self.feature_dim:
                il = torch.cat([il, padding])

            intermediate_likelihoods.append(il)

        # stack intermediate likelihoods to get tensor of shape (feature_dim, num_layers)
        return torch.stack(intermediate_likelihoods, dim=1)

### Luminar Classifier

CNN-based classifier using _Intermediate Likelihoods_ as input features.
Here, we feed these inherently 2D values (`seq_len × num_layers`) to 1D convolutions that run along the sequence dimension, treating the second dimension (the layers) as input channels.

In [3]:
class ConvolutionalLayerSpec(NamedTuple):
    channels: int
    kernel_size: int | tuple[int, int]
    stride: int = 1

    @property
    def kernel_size_1d(self):
        if isinstance(self.kernel_size, int):
            return self.kernel_size
        return self.kernel_size[0]

    @property
    def kernel_size_2d(self):
        if isinstance(self.kernel_size, int):
            return (self.kernel_size, self.kernel_size)
        return self.kernel_size

    @property
    def padding(self) -> int:
        return (self.kernel_size_1d - 1) // 2

    def __repr__(self):
        return repr(tuple(self))


# Default convolution stack used by LuminarClassifier: one
# (channels, kernel_size) pair per layer. Each pair is unpacked into a
# ConvolutionalLayerSpec, so the omitted stride takes that class's default of 1.
DEFAULT_CONV_LAYER_SHAPES = ((64, 5), (128, 3), (128, 3), (128, 3), (64, 3))


class LuminarClassifier(nn.Module):
    """1D-CNN binary classifier over intermediate-likelihood features.

    The network is a stack of ``LazyConv1d`` + ``LeakyReLU`` layers followed
    by a flatten, an optional linear projection, and a single-output linear
    layer. Input sizes are inferred lazily on the first forward pass.

    Args:
        conv_layer_shapes: One spec per convolutional layer. Plain tuples are
            accepted and unpacked into ``ConvolutionalLayerSpec``.
        projection_dim: If truthy, size of a linear + ``LeakyReLU`` projection
            inserted before the output layer; otherwise no projection is used.
    """

    def __init__(
        self,
        conv_layer_shapes: Iterable[ConvolutionalLayerSpec] = DEFAULT_CONV_LAYER_SHAPES,
        projection_dim: int | None = None,
    ):
        super().__init__()
        self.conv_layers = nn.Sequential()
        for conv in conv_layer_shapes:
            conv = ConvolutionalLayerSpec(*conv)
            self.conv_layers.append(
                nn.LazyConv1d(
                    conv.channels,
                    # Use the scalar kernel size: this is a 1D convolution and
                    # the padding is derived from the same scalar, so a spec
                    # given as a (k, k') pair stays consistent.
                    conv.kernel_size_1d,
                    conv.stride,
                    conv.padding,
                ),
            )
            self.conv_layers.append(nn.LeakyReLU())
        self.conv_layers.append(nn.Flatten())

        if projection_dim:
            self.projection = nn.Sequential(
                nn.LazyLinear(projection_dim), nn.LeakyReLU()
            )
        else:
            self.projection = nn.Identity()

        self.classifier = nn.LazyLinear(1)

    def forward(self, features: torch.Tensor):
        """Return one raw (pre-sigmoid) score per sample, shape ``(batch, 1)``.

        Args:
            features: 3D tensor whose last dimension is treated as the
                convolution's input channels after transposition.
        """
        # We are using 2D features (so `features` is a 3D tensor)
        # but we want to treat the second feature dimension as channels.
        # Thus, we need to transpose the tensor here
        features = features.transpose(1, 2)

        # `conv_layers` ends with nn.Flatten(), so its output is already 2D.
        features = self.conv_layers(features)

        return self.classifier(self.projection(features))


## Example

### Prepare Data

In [4]:
import bz2
import json

# Parse the bz2-compressed JSON-lines file into a Dataset, one record per line.
# The context manager closes the file handle once every line has been read.
with bz2.open(
    "../data/prismai-cnn_news-fulltext-gpt_4o_mini.jsonl.bz2", "rt"
) as raw_file:
    raw_dataset = Dataset.from_list([json.loads(line) for line in raw_file])
raw_dataset

Dataset({
    features: ['_id', 'samples'],
    num_rows: 11814
})

In [5]:
from collections import defaultdict


def flatten_samples(batch: dict):
    """Turn a batch of nested sample lists into flat per-key columns.

    ``batch["samples"]`` holds one list of sample dicts per row. Every sample
    becomes its own entry: for each key, its value is appended to that key's
    column in iteration order.
    """
    columns = defaultdict(list)
    all_samples = (sample for group in batch["samples"] for sample in group)
    for sample in all_samples:
        for column, cell in sample.items():
            columns[column].append(cell)
    return columns


# Split the raw rows 80/20 into train/test first, then expand every row's
# `samples` list into individual rows via `flatten_samples`. The original
# columns are removed so only the flattened sample fields remain.
# NOTE(review): no seed is passed to `train_test_split`, so the split
# presumably differs between runs — confirm whether that is intended.
dataset = raw_dataset.train_test_split(test_size=0.2).map(
    flatten_samples,
    batched=True,
    remove_columns=raw_dataset.column_names,
)
dataset

Map:   0%|          | 0/9451 [00:00<?, ? examples/s]

Map:   0%|          | 0/2363 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['agent', 'label', 'label_str', 'text'],
        num_rows: 18902
    })
    test: Dataset({
        features: ['agent', 'label', 'label_str', 'text'],
        num_rows: 4726
    })
})

### Encode Samples

In [None]:
# When re-running this cell, uncomment the next two lines first; presumably
# they are meant to drop the previous encoder and release its cached GPU memory.
# del encoder
# torch.cuda.empty_cache()
# Encoder with feature_dim=128 (token positions per text) built on GPT-2.
encoder = LuminarEncoder(128, model_name_or_path="gpt2")

In [7]:
# Run the encoder over both splits in batches of 64 texts; calling the encoder
# on a batch returns a "features" entry, which `map` adds to the dataset.
dataset = dataset.map(encoder, batched=True, batch_size=64)

Map:   0%|          | 0/18902 [00:00<?, ? examples/s]

Map:   0%|          | 0/4726 [00:00<?, ? examples/s]

### Run Training

In [8]:
# Classifier with the default convolution stack and no projection layer.
model = LuminarClassifier()
# The classifier outputs raw scores, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
# Adam with its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [None]:
from tqdm import tqdm


# Single pass over the training split in batches of 32.
# NOTE(review): the DataLoader is created without shuffling, so batches follow
# the dataset order — confirm this is intended.
train_dataset = dataset["train"].with_format("torch", ["features", "label"])
for batch in tqdm(DataLoader(train_dataset, 32)):
    # Clear gradients accumulated by the previous step.
    optimizer.zero_grad()
    features = batch["features"]
    # Float targets with a trailing dim to match the classifier's output shape.
    labels = batch["label"].float().unsqueeze(-1)

    preds = model(features)

    loss = criterion(preds, labels)

    loss.backward()
    optimizer.step()

100%|██████████| 591/591 [00:34<00:00, 17.09it/s]


In [10]:
import numpy as np
from sklearn.metrics import f1_score


# Evaluate on the test split: collect thresholded predictions, ground-truth
# labels and the per-batch loss, then report the mean loss and the F1 score.
y_pred, y_truth, losses = [], [], []
test_dataset = dataset["test"].with_format("torch", ["features", "label"])
# Put the classifier in inference mode for evaluation.
model.eval()
for batch in tqdm(DataLoader(test_dataset, 32)):
    with torch.no_grad():
        features = batch["features"]
        labels = batch["label"].float().unsqueeze(-1)
        preds = model(features)

        # Drop only the trailing dimension: a bare `squeeze()` would turn a
        # final batch of size 1 into a scalar, and `extend` would then fail
        # because `tolist()` returns a float instead of a list.
        y_pred.extend(preds.sigmoid().round().squeeze(-1).tolist())
        y_truth.extend(labels.squeeze(-1).tolist())

        loss = criterion(preds, labels)
        losses.append(loss.item())

print(f"loss={np.mean(losses)}")
print(f"f1={f1_score(y_truth, y_pred)}")

100%|██████████| 148/148 [00:07<00:00, 20.05it/s]

loss=0.05647700630140969
f1=0.9840255591054313





In [11]:
from raid.utils import load_data

# Load the RAID "train", "test" and "extra" splits with adversarial samples
# excluded. As the log output below shows, the CSV files are downloaded into a
# local cache directory.
train_noadv_df = load_data(split="train", include_adversarial=False)
test_noadv_df = load_data(split="test", include_adversarial=False)
extra_noadv_df = load_data(split="extra", include_adversarial=False)

Downloading https://dataset.raid-bench.xyz/train_none.csv (801662741B) to /home/mastoeck/.cache/raid/train_none.csv


100%|██████████| 802M/802M [00:23<00:00, 34.5MB/s] 


Downloading https://dataset.raid-bench.xyz/test_none.csv (80996324B) to /home/mastoeck/.cache/raid/test_none.csv


100%|██████████| 81.0M/81.0M [00:00<00:00, 107MB/s]


Downloading https://dataset.raid-bench.xyz/extra_none.csv (256763541B) to /home/mastoeck/.cache/raid/extra_none.csv


100%|██████████| 257M/257M [00:02<00:00, 114MB/s] 


In [29]:
from raid import run_detection, run_evaluation
from itertools import batched


@torch.no_grad()
def detector(texts: list[str]) -> list[float]:
    """Score texts with the trained classifier, one probability per text.

    Texts are encoded and classified in batches of 32. The returned values are
    the sigmoid of the classifier's output and are not thresholded.

    Args:
        texts: The texts to score.

    Returns:
        One float per input text, in input order.
    """
    all_preds = []
    for batch in batched(tqdm(texts), 32):
        # `batched` yields tuples; hand the encoder a list as its signature states.
        features = encoder.process(list(batch))
        preds = model(torch.stack(features).cpu())
        # Drop only the trailing dimension so a final batch holding a single
        # text still yields a one-element list rather than a bare float, which
        # `extend` would reject.
        all_preds.extend(preds.sigmoid().squeeze(-1).tolist())
    return all_preds

# Score a random subset of 10240 rows from the non-adversarial RAID train split.
# NOTE(review): no random state is passed to `sample`, so the subset
# presumably changes between runs — confirm whether that is intended.
eval_df = train_noadv_df.sample(10240)
raid_preds = run_detection(detector, eval_df)

  0%|          | 0/10240 [00:00<?, ?it/s]

100%|██████████| 10240/10240 [08:04<00:00, 21.14it/s]


In [30]:
# Evaluate the detector's predictions against the sampled rows, with
# per-domain tuning switched off; the result is displayed as the cell output.
raid_eval = run_evaluation(raid_preds, eval_df, per_domain_tuning=False)
raid_eval

{'scores': [{'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'sampling',
   'repetition_penalty': 'yes',
   'attack': 'none',
   'tp': 22,
   'fn': 4,
   'accuracy': 0.8461538461538461},
  {'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'sampling',
   'repetition_penalty': 'no',
   'attack': 'none',
   'tp': 36,
   'fn': 2,
   'accuracy': 0.9473684210526315},
  {'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'sampling',
   'repetition_penalty': 'all',
   'attack': 'none',
   'tp': 58,
   'fn': 6,
   'accuracy': 0.90625},
  {'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'greedy',
   'repetition_penalty': 'yes',
   'attack': 'none',
   'tp': 38,
   'fn': 0,
   'accuracy': 1.0},
  {'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'greedy',
   'repetition_penalty': 'no',
   'attack': 'none',
   'tp': 35,
   'fn': 2,
   'accuracy': 0.9459459459459459},
  {'domain': 'wiki',
   'model': 'mistral-chat',
   'decoding': 'greedy'