# Luminar

## Minimal Code

In [2]:
from typing import Iterable, NamedTuple

import torch
from datasets import Dataset
from torch import nn
from torch.utils.data import DataLoader, Subset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

### Luminar Encoder

1. Pre-Process Inputs: tokenize and pass through LLM, recording hidden states
2. Calculate _Intermediate Likelihoods_: pass each hidden state through the model's LM head and record the probability assigned to the actual next token

In [3]:
class LuminarEncoder:
    """Encode texts as per-layer *intermediate likelihood* features.

    Each text is tokenized and run through a causal language model with
    ``output_hidden_states=True``. Every returned hidden state is projected
    through the model's LM head, and the softmax probability of the token that
    actually follows each position is recorded.

    Args:
        feature_dim: Tokenizer ``max_length`` and the length of the first axis
            of every returned feature tensor (shorter sequences are padded
            with zeros).
        model_name_or_path: Identifier passed to ``from_pretrained`` for both
            the tokenizer and the causal LM.
        device: Device the model and the intermediate computations run on.

    Raises:
        ValueError: If no ``lm_head`` is found on the model or on its
            ``.model`` attribute.
    """

    def __init__(
        self,
        feature_dim: int = 256,
        model_name_or_path: str = "gpt2",
        device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
    ):
        self.feature_dim = feature_dim
        self.device = torch.device(device)

        self.tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path
        )
        # Fall back to the EOS token when the tokenizer defines no pad token.
        if getattr(self.tokenizer, "pad_token", None) is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.pad_token_id = self.tokenizer.pad_token_id

        self.model: PreTrainedModel = AutoModelForCausalLM.from_pretrained(
            model_name_or_path
        )
        self.model = self.model.to(self.device)

        # Look for the LM head on the model itself, then one level down on
        # `model.model`. `getattr` with a default keeps a missing `.model`
        # attribute from surfacing as AttributeError instead of the
        # ValueError below.
        lm_head = getattr(self.model, "lm_head", None)
        if lm_head is None:
            lm_head = getattr(getattr(self.model, "model", None), "lm_head", None)
        if lm_head is None:
            raise ValueError("Could not find lm_head in model")
        self.model_lm_head: nn.Linear = lm_head

    def __call__(self, batch: dict[str, list[str]]) -> dict[str, list[torch.Tensor]]:
        """Encode ``batch["text"]`` and return the result under ``"features"``."""
        return {"features": self.process(batch["text"])}

    def process(self, batch: list[str]) -> list[torch.Tensor]:
        """Tokenize ``batch`` and compute one feature tensor per text.

        Texts are padded to a common length and truncated to ``feature_dim``
        tokens before being passed through the model.
        """
        encoding = self.tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=self.feature_dim,
            return_tensors="pt",
        )
        batch_hidden_states = self.forward(encoding.input_ids, encoding.attention_mask)

        return [
            self.compute_intermediate_likelihoods(input_ids, hidden_states)
            for input_ids, hidden_states in zip(encoding.input_ids, batch_hidden_states)
        ]

    @torch.no_grad()
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
    ) -> Iterable[tuple[torch.Tensor, ...]]:
        """Run the model and regroup its hidden states per input sequence.

        Returns:
            An iterator yielding, for each sequence in the batch, a tuple with
            one tensor per entry of ``outputs.hidden_states``.
        """
        outputs = self.model(
            input_ids=input_ids.to(self.device),
            attention_mask=attention_mask.to(self.device),
            output_hidden_states=True,
        )

        # unpack hidden states to get one list of tensors per input sequence,
        # instead of one hidden state per layer in the model
        return zip(*outputs.hidden_states)  # type: ignore

    @torch.no_grad()
    def compute_intermediate_likelihoods(
        self,
        input_ids: torch.Tensor,
        hidden_states: tuple[torch.Tensor],
    ) -> torch.Tensor:
        """Compute next-token likelihoods from every hidden state of one sequence.

        Args:
            input_ids: 1D tensor of token ids for a single sequence.
            hidden_states: One tensor per hidden state for that sequence; each
                is sliced along its first axis and fed to the LM head
                (assumed shape ``(seq_len, hidden_dim)``).

        Returns:
            Tensor of shape ``(feature_dim, len(hidden_states))``; rows beyond
            the number of non-pad labels are zero.
        """
        # The hidden state at position i scores token i + 1, so the labels are
        # the input ids shifted left by one.
        labels = input_ids[1:].view(-1, 1)

        # Number of labels that are not the pad token, as a plain int so that
        # slicing, comparison and `torch.zeros` below work on Python ints.
        # NOTE(review): when the pad token falls back to EOS, EOS tokens inside
        # the text are counted as padding here as well.
        num_real_labels = int(labels.ne(self.pad_token_id).sum())
        seq_length = min(num_real_labels, self.feature_dim)
        labels = labels[:seq_length].to(self.device)
        num_padding = self.feature_dim - seq_length

        intermediate_likelihoods = []
        for hs in hidden_states:
            il = (
                # get layer logits
                self.model_lm_head(hs[:seq_length].to(self.device))
                # calculate likelihoods
                .softmax(-1)
                # get likelihoods of input tokens
                .gather(-1, labels)
                .squeeze(-1)
                .cpu()
            )

            # pad with zeros if sequence is shorter than required feature_dim
            if num_padding > 0:
                il = torch.cat([il, torch.zeros(num_padding)])

            intermediate_likelihoods.append(il)

        # stack intermediate likelihoods to get tensor of shape (feature_dim, num_layers)
        return torch.stack(intermediate_likelihoods, dim=1)

### Luminar Classifier

CNN-based classifier using _Intermediate Likelihoods_ as input features.
Here, we feed these inherently 2D values (`seq_len × num_layers`) into 1D convolutions: the sequence dimension is the convolution axis, and the layer dimension is treated as input channels.

In [4]:
class ConvolutionalLayerSpec(NamedTuple):
    channels: int
    kernel_size: int | tuple[int, int]
    stride: int = 1

    @property
    def kernel_size_1d(self):
        if isinstance(self.kernel_size, int):
            return self.kernel_size
        return self.kernel_size[0]

    @property
    def kernel_size_2d(self):
        if isinstance(self.kernel_size, int):
            return (self.kernel_size, self.kernel_size)
        return self.kernel_size

    @property
    def padding(self) -> int:
        return (self.kernel_size_1d - 1) // 2

    def __repr__(self):
        return repr(tuple(self))


# Default stack of convolutional layers as (channels, kernel_size) pairs.
DEFAULT_CONV_LAYER_SHAPES = ((64, 5), (128, 3), (128, 3), (128, 3), (64, 3))


class LuminarClassifier(nn.Module):
    """1D-CNN binary classifier over intermediate-likelihood features.

    Args:
        conv_layer_shapes: One entry per convolutional layer. Each entry is
            unpacked into a ``ConvolutionalLayerSpec``, so plain
            ``(channels, kernel_size[, stride])`` tuples are accepted.
        projection_dim: If truthy, a linear projection of this size followed
            by a LeakyReLU is inserted before the final classifier layer;
            otherwise the flattened convolution output is classified directly.
    """

    def __init__(
        self,
        conv_layer_shapes: Iterable[ConvolutionalLayerSpec] = DEFAULT_CONV_LAYER_SHAPES,
        projection_dim: int | None = None,
    ):
        super().__init__()
        # Module names and order are kept stable so previously saved state
        # dicts keep loading: conv, activation, conv, activation, ..., flatten.
        self.conv_layers = nn.Sequential()
        for shape in conv_layer_shapes:
            spec = ConvolutionalLayerSpec(*shape)
            self.conv_layers.append(
                nn.LazyConv1d(
                    spec.channels,
                    # Use the 1D kernel size so a (k1, k2) tuple spec still
                    # yields a valid Conv1d, consistent with `spec.padding`,
                    # which is derived from the same value.
                    spec.kernel_size_1d,
                    spec.stride,
                    spec.padding,
                ),
            )
            self.conv_layers.append(nn.LeakyReLU())
        self.conv_layers.append(nn.Flatten())

        if projection_dim:
            self.projection = nn.Sequential(
                nn.LazyLinear(projection_dim), nn.LeakyReLU()
            )
        else:
            self.projection = nn.Identity()

        self.classifier = nn.LazyLinear(1)

    def forward(self, features: torch.Tensor):
        """Return one raw logit per sample (no sigmoid is applied here).

        Args:
            features: 3D tensor; its second and third axes are swapped so that
                the last axis of the input becomes the channel axis of the
                1D convolutions.

        Returns:
            Tensor with a trailing dimension of size 1 produced by the final
            linear layer.
        """
        # We are using 2D features (so `features` is a 3D tensor)
        # but we want to treat the second feature dimension as channels.
        # Thus, we need to transpose the tensor here
        features = features.transpose(1, 2)

        # `conv_layers` ends with nn.Flatten, so its output is already 2D.
        features = self.conv_layers(features)

        return self.classifier(self.projection(features))


## Example

### Prepare Data

In [5]:
from datasets import load_from_disk

# Load the dataset stored on disk under 'PrismAI_full_encoded.hf'.
dataset = load_from_disk('PrismAI_full_encoded.hf')
# Bare expression: in a notebook this displays the dataset summary.
dataset


Dataset({
    features: ['agent', 'label', 'label_str', 'text', 'features'],
    num_rows: 186838
})

In [6]:
# Hold out 20% of the rows as a test set. A fixed seed makes the split, and
# therefore every metric computed on it, reproducible across runs.
# NOTE(review): if the classifier checkpoint loaded later was trained on a
# different split of this data, test rows may overlap its training rows —
# confirm the split used for training.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Bare expression: in a notebook this displays the resulting DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['agent', 'label', 'label_str', 'text', 'features'],
        num_rows: 149470
    })
    test: Dataset({
        features: ['agent', 'label', 'label_str', 'text', 'features'],
        num_rows: 37368
    })
})

In [7]:
# Display the first row of the training split as a sanity check.
dataset['train'][0]

{'agent': 'gpt-4o-mini',
 'label': 1,
 'label_str': 'ai',
 'text': 'Dear Diary,\n\nOMG, today was such a blast! 🎉 So, I met up with my squad after school, and we decided to binge-watch that new movie everyone’s been talking about. Like, how did I not know about this sooner? We all piled into my bestie Mia’s living room, and it was total chaos! 😂 You know how it goes—snacks everywhere, pillows flying, and us trying to figure out who’s hogging the remote. \n\nSo, we started with popcorn, but then someone (cough, Jake) brought those weird sour gummy worms, and honestly, they were kinda gross? But we ate them anyway because, duh, free snacks! 🍬 Then, we just talked about all the drama at school. Seriously, can you believe Mrs. Thompson is cracking down on our dress code? Like, what’s wrong with a little flair? 🙄 I mean, I get it, but can’t we express ourselves a bit? \n\nAfter the movie, we ended up talking about our summer plans. I’m super excited about the beach trip, but also kinda nerv

In [11]:
# Rebuild the classifier with its default architecture and restore the
# trained weights from disk.
model = LuminarClassifier()
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine as well.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True where the installed torch
# version supports it).
model.load_state_dict(torch.load("luminar_classifier.pt", map_location="cpu"))
# The original `model.to()` had no arguments and therefore moved nothing.
# State the device explicitly: the evaluation below feeds CPU tensors.
model = model.to("cpu")
model.eval()

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())

In [12]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# Source: Ghostbuster, Verma et al. (2024)
def get_scores(labels, probabilities, calibrated=False, precision=6):
    """Compute accuracy, F1 and AUROC for binary predictions.

    Args:
        labels: Ground-truth labels; assumed to be 0/1 values (ints, floats
            or bools). Any sequence or array is accepted.
        probabilities: Predicted scores, one per label.
        calibrated: If true, the decision threshold is chosen so that the
            number of scores above it matches the number of positive labels
            (up to ties); otherwise 0.5 is used.
        precision: Number of decimals the returned metrics are rounded to.

    Returns:
        Tuple ``(accuracy, f1, auroc)``. ``auroc`` is ``-1`` when the labels
        contain only one class.

    Raises:
        AssertionError: If ``labels`` and ``probabilities`` differ in length.
    """
    # Local import keeps the function usable regardless of which notebook
    # cells have already been executed.
    import numpy as np

    assert len(labels) == len(probabilities)

    # Convert once so that plain lists work with the `>` comparison below.
    labels = np.asarray(labels)
    probabilities = np.asarray(probabilities, dtype=float)

    # Integer counts: float labels would otherwise produce a float index.
    num_positive = int(labels.sum())
    num_negative = len(labels) - num_positive

    if not calibrated:
        threshold = 0.5
    elif num_negative == 0:
        # No negatives: every sample must be predicted positive. Indexing
        # with -1 would instead pick the maximum score as threshold.
        threshold = float("-inf")
    else:
        # The `num_negative` lowest scores fall at or below the threshold.
        threshold = np.sort(probabilities)[num_negative - 1]

    predictions = probabilities > threshold
    acc = round(float(accuracy_score(labels, predictions)), precision)
    f1 = round(float(f1_score(labels, predictions)), precision)

    # AUROC is undefined when only one class is present.
    if num_positive == 0 or num_negative == 0:
        auroc = -1
    else:
        auroc = round(float(roc_auc_score(labels, probabilities)), precision)

    return acc, f1, auroc

In [15]:
import numpy as np
import tqdm
from sklearn.metrics import f1_score


# Evaluate the classifier on the held-out split, collecting per-sample
# probabilities, hard predictions, ground truth and per-batch losses.
y_probs, y_pred, y_truth, losses = [], [], [], []
test_dataset = dataset["test"].with_format("torch", columns=["features", "label"])
with torch.no_grad():
    for batch in tqdm.tqdm(DataLoader(test_dataset, batch_size=32)):
        features = batch["features"]
        # Add a trailing dimension so the labels line up with the model output.
        labels = batch["label"].float().unsqueeze(-1)

        preds = model(features)
        probs = preds.sigmoid()

        # squeeze(-1) rather than squeeze(): a final batch holding a single
        # sample would otherwise collapse to a scalar, and `tolist()` would
        # then return a float that `extend` cannot iterate over.
        y_probs.extend(probs.squeeze(-1).tolist())
        y_pred.extend(probs.round().squeeze(-1).tolist())
        y_truth.extend(labels.squeeze(-1).tolist())

        loss = criterion(preds, labels)
        losses.append(loss.item())

# NOTE: this is the mean of per-batch mean losses.
print(f"Loss: {np.mean(losses):.4f}")
print(f"F1 Score (threshold 0.5): {f1_score(y_truth, y_pred):.4f}")

# Ghostbuster scores
acc, f1, auroc = get_scores(np.array(y_truth), np.array(y_probs))
print(f"Accuracy: {acc}")
print(f"F1 (Ghostbuster): {f1}")
print(f"AUROC: {auroc}")

  0%|                                                                                                                                | 0/1168 [00:00<?, ?it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1168/1168 [00:36<00:00, 32.14it/s]

Loss: 0.1343
F1 Score (threshold 0.5): 0.9453
Accuracy: 0.945542
F1 (Ghostbuster): 0.945324
AUROC: 0.989345



