# About

Evaluating LuminarSeq as a document-level classifier on different datasets.

In [2]:
import os
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import torch
import gc
import json

from IPython.display import display, HTML
from luminar.detector import LuminarSequenceDetector
from luminar.utils.cuda import get_best_device
from data_hub.hub import DataHub
from luminar.sequence_classifier import LuminarSequence
from luminar.utils import LuminarSequenceTrainingConfig, ConvolutionalLayerSpec
from luminar.utils.visualization import visualize_detection
from pathlib import Path
from data_hub.sequential_data_processor import SequentialDataProcessor
from luminar.encoder import LuminarEncoder
from tqdm import tqdm
import numpy as np
from luminar.utils import calculate_metrics

# Start from a clean slate: drop cached CUDA blocks and unreachable Python
# objects that earlier work in this kernel may have left behind.
torch.cuda.empty_cache()
gc.collect()
if torch.cuda.is_available():
    _active_gpu = torch.cuda.current_device()
    # Repeat the cleanup explicitly on the currently selected GPU and also
    # collect memory held by CUDA IPC handles.
    with torch.cuda.device(_active_gpu):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [4]:
# Evaluation configuration, keyed by dataset name. The key is also used to
# build the hub dataset id (f"TheItCrOw/{key}") in the evaluation loop.
#
# Per-dataset fields:
#   in_domain_model: path handed to LuminarSequenceDetector as `model_path`;
#                    the model evaluated "in domain" on this dataset's test split.
#   out_of_domain:   name of ANOTHER entry in this dict; that entry's
#                    `in_domain_model` is evaluated on this dataset's test split.
#                    It must be a key of `datasets`, otherwise the lookup in the
#                    evaluation loop raises KeyError.
#   feature_agent:   handed to LuminarSequenceDetector as `feature_agent`.
#   evaluate:        only entries set to True are processed by the loop.
datasets = {
    "Ghostbuster": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/Ghostbuster-encoded-gpt2/qnt6a8k3",
        # Was "PrismAI", which is not an entry of this dict and would raise
        # KeyError as soon as this dataset is enabled. All other datasets use
        # "PrismAI_v2" as their out-of-domain counterpart.
        "out_of_domain": "PrismAI_v2",
        "feature_agent": "gpt2",
        "evaluate": False
    },
    "SeqXGPT": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/SeqXGPT-encoded-gpt2/zgfsvd82",
        "out_of_domain": "PrismAI_v2",
        "feature_agent": "gpt2",
        "evaluate": False
    },
    "M4": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/M4-encoded-gpt2/oi2ld9pf",
        "out_of_domain": "PrismAI_v2",
        "feature_agent": "gpt2",
        "evaluate": False
    },
    "MAGE": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/MAGE-encoded-gpt2/pj3t04t0",
        "out_of_domain": "PrismAI_v2",
        "feature_agent": "gpt2",
        "evaluate": False
    },
    "PrismAI_v2": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/PrismAI_v2-encoded-gpt2/e1s2k2du",
        "out_of_domain": "M4",
        "feature_agent": "gpt2",
        "evaluate": False
    },
    "RAID_none": {
        "in_domain_model": "/storage/projects/boenisch/PrismAI/models/luminar_sequence/RAID_none-encoded-gpt2/prkvbp96",
        "out_of_domain": "PrismAI_v2",
        "feature_agent": "gpt2",
        "evaluate": True
    },
}

In [5]:
# The hub token is read from ~/.hf_token (surrounding whitespace stripped) and
# passed straight to DataHub so it is never bound to a notebook variable.
_hf_token_path = Path.home() / ".hf_token"
hub = DataHub(_hf_token_path.read_text().strip())

## Evaluation

In [6]:
def evaluate(detector, test_dataset, max_samples=None, threshold=0.5):
    """Score the rows of ``test_dataset`` with ``detector`` and compute metrics.

    Each row is expected to provide ``row["text"]`` and ``row["label"]``.
    Only rows labelled 0 or 1 are scored; any other label is ignored.
    A row is also dropped when ``detector.detect`` raises ``ValueError``
    (text too short for LuminarSeq) or when the returned ``"avg"`` score is
    NaN or infinite.

    Args:
        detector: object exposing ``detect(text)`` that returns a mapping with
            an ``"avg"`` entry.
        test_dataset: iterable of rows as described above.
        max_samples: optional cap on the number of scored samples. ``None``
            (default) evaluates the whole dataset.
        threshold: decision threshold forwarded to ``calculate_metrics``.

    Returns:
        Whatever ``calculate_metrics`` returns for the collected labels and
        scores.
    """
    y_true = []
    y_scores = []
    n_too_short = 0
    n_non_finite = 0

    for row in tqdm(test_dataset, desc="Evaluating", unit="sample"):
        # ">=" so that exactly `max_samples` samples are scored, not one more.
        if max_samples is not None and len(y_scores) >= max_samples:
            break
        if row["label"] not in (0, 1):
            continue

        try:
            result = detector.detect(row["text"])
        except ValueError:
            # Text was too short for LuminarSeq, skipping it.
            n_too_short += 1
            continue

        # Skip NaN or infinite values
        if not np.isfinite(result["avg"]):
            n_non_finite += 1
            continue

        # "avg" is divided by 100 before thresholding — presumably it is a
        # percentage in [0, 100]; TODO confirm against the detector.
        y_scores.append(result["avg"] / 100)
        y_true.append(row["label"])

    # Make dropped samples visible: the metrics below only cover the rest.
    if n_too_short or n_non_finite:
        print(f"Skipped {n_too_short} too-short and {n_non_finite} non-finite samples; "
              f"scored {len(y_scores)}.")

    return calculate_metrics(
        y_true=np.array(y_true),
        y_scores=np.array(y_scores),
        threshold=threshold
    )

In [None]:
# Evaluate every enabled dataset twice on its own test split: once with the
# model trained on that dataset (in-domain) and once with the model of the
# configured out-of-domain dataset. Results are written as JSON per dataset.
device = get_best_device()
metrics_dir = "sequence_metrics"
os.makedirs(metrics_dir, exist_ok=True)

for key, dataset in datasets.items():
    if not dataset["evaluate"]:
        print(f"Not evaluating {key}.")
        continue
    print("Doing: ", key)

    # Resolve the out-of-domain model up front so a misconfigured name fails
    # immediately instead of after the long in-domain evaluation.
    ood_key = dataset["out_of_domain"]
    if ood_key not in datasets:
        raise KeyError(
            f"out_of_domain '{ood_key}' configured for '{key}' is not an entry of `datasets`"
        )
    ood_dataset = datasets[ood_key]

    # Load data
    splits = hub.get_splits(f"TheItCrOw/{key}")
    test_dataset = splits["test"]
    print("Test Length: ", len(test_dataset))

    # In-Domain
    detector = LuminarSequenceDetector(model_path=dataset["in_domain_model"],
                                       feature_agent=dataset["feature_agent"],
                                       device=device)
    in_domain_metrics = evaluate(detector, test_dataset)
    print("In Domain: ", in_domain_metrics)

    # Release the in-domain model before loading the next one so both models
    # are not held on the device at the same time.
    del detector
    gc.collect()
    torch.cuda.empty_cache()

    # Out of Domain
    detector = LuminarSequenceDetector(model_path=ood_dataset["in_domain_model"],
                                       feature_agent=ood_dataset["feature_agent"],
                                       device=device)
    ood_metrics = evaluate(detector, test_dataset)
    print("Out of Domain: ", ood_metrics)

    with open(os.path.join(metrics_dir, f"results_{key}.json"), "w") as fp:
        json.dump({"in_domain": in_domain_metrics, "ood": ood_metrics}, fp)

Not evaluating Ghostbuster.
Not evaluating SeqXGPT.
Not evaluating M4.
Not evaluating MAGE.
Not evaluating PrismAI_v2.
Doing:  RAID_none


README.md:   0%|          | 0.00/550 [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/264M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/654910 [00:00<?, ? examples/s]

Filter (num_proc=32):   0%|          | 0/654910 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/654910 [00:00<?, ? examples/s]

Label ID mapping:
0 → human
1 → ai
2 → fusion
train distribution:
  ai: 444035 (96.9%)
  human: 14402 (3.1%)
eval distribution:
  ai: 63434 (96.9%)
  human: 2057 (3.1%)
test distribution:
  ai: 126867 (96.9%)
  human: 4115 (3.1%)
Test Length:  130982
Loading LuminarSequenceDetector from /storage/projects/boenisch/PrismAI/models/luminar_sequence/RAID_none-encoded-gpt2/prkvbp96 to device cuda:0
LuminarSequenceTrainingConfig(feature_len=512, num_intermediate_likelihoods=13, apply_delta_augmentation=True, apply_product_augmentation=False, conv_layer_shapes=[[32, 5, 1], [64, 5, 1], [32, 3, 1]], projection_dim=64, lstm_hidden_dim=256, lstm_layers=1, stack_spans=5, hf_dataset='TheItCrOw/RAID_none-encoded-gpt2', dataset_root_path='/storage/projects/stoeckel/prismai/encoded/fulltext/', models_root_path='/storage/projects/boenisch/PrismAI/models/luminar_sequence/', domain=None, agent='gpt_4o_mini_gemma2_9b', feature_agent='gpt2', max_epochs=100, batch_size=128, early_stopping_patience=8, rescale

[nltk_data] Downloading package punkt to
[nltk_data]     /home/staff_homes/kboenisc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loaded.


Evaluating:   0%|                                                                                               | 0/130982 [00:00<?, ?sample/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Evaluating:   0%|                                                                                    | 74/130982 [00:07<1:00:17, 36.19sample/s]