In [34]:
# Enable IPython's autoreload extension so that edits to imported modules are
# picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
import sys
import torch
import wandb
import gc
from pathlib import Path

# Put the parent of the current working directory at the front of the import
# path. NOTE(review): presumably this is what makes the `luminar` and
# `data_hub` packages below importable when the kernel starts in the notebook
# folder - confirm the expected working directory.
sys.path.insert(0, str(Path().resolve().parent))

from luminar.utils import get_best_device
from luminar.utils import LuminarSequenceTrainingConfig, ConvolutionalLayerSpec
from luminar.sequence_trainer import LuminarSequenceTrainer
from luminar.encoder import LuminarEncoder
from data_hub.sequential_data_processor import SequentialDataProcessor
from data_hub.hub import DataHub

# Select the compute device once; it is handed to the encoder in a later cell.
device = get_best_device()
print(device)

cuda:0


In [36]:
# The hub client is built from the stripped contents of ~/.hf_token
# (presumably a Hugging Face access token - confirm). Only the path is kept in
# a variable, so the token itself never lands in the notebook namespace.
token_file = Path.home() / ".hf_token"
data_hub = DataHub(token_file.read_text().strip())

# Fetch the splits of the pre-encoded MAGE dataset and show their structure.
dataset = data_hub.get_splits("liberi-luminaris/MAGE-encoded-gpt2")
print(dataset)

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Label ID mapping:
0 → human
1 → ai
2 → fusion
train distribution:
  ai: 197276 (65.1%)
  human: 105600 (34.9%)
eval distribution:
  ai: 28183 (65.1%)
  human: 15086 (34.9%)
test distribution:
  human: 30172 (34.9%)
  ai: 56365 (65.1%)
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length'],
        num_rows: 302876
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length'],
        num_rows: 43269
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length'],
        num_rows: 86537
    })
})


In [37]:
# Build the sequential data processor from the loaded splits and a "gpt2"
# LuminarEncoder placed on `device`, then run the sequential preprocessing.
# In the recorded run the result gained the columns `tokenized_text`,
# `sentence_token_spans` and `span_labels`.
# NOTE: `dataset` is rebound to the processed result, so re-running this cell
# without re-running the loading cell first feeds already-processed data into
# SequentialDataProcessor.
data_processor = SequentialDataProcessor(dataset, LuminarEncoder("gpt2", device=device))
dataset = data_processor.process_for_sequential()
print(dataset)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/staff_homes/kboenisc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenizing text and extracting offsets:   0%|          | 0/302876 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1041 > 1024). Running this sequence through the model will result in indexing errors


Tokenizing text and extracting offsets:   0%|          | 0/43269 [00:00<?, ? examples/s]

Tokenizing text and extracting offsets:   0%|          | 0/86537 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/302876 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/43269 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/86537 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/302876 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/43269 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/86537 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 302876
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 43269
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 86537
    })
})


**Sanity Checks**

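Are there any fusion labels (2) among the span labels? The check below flags every span label other than 0 (human) or 1 (ai), and also verifies that each example has exactly one label per sentence span. For now, any fusion labels that turn up need to be handled before training.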

In [38]:
# Sanity-check the processed splits before training.
#
# For every example in every available split this verifies that
#   * all entries of `span_labels` are 0 or 1, and
#   * `span_labels` and `sentence_token_spans` have the same length.
# Offending examples are printed individually; per-split and overall counts
# are reported at the end. The counts are per example, not per span.
splits_to_check = ["train", "eval", "test"]

total_invalid_label_count = 0
total_length_mismatch_count = 0

for split in splits_to_check:
    # A missing split is reported and skipped; it contributes nothing to the
    # totals below.
    if split not in dataset:
        print(f"Split '{split}' not found in dataset. Skipping.")
        continue

    print(f"\nChecking split: {split}")
    invalid_label_count = 0
    length_mismatch_count = 0

    for i, example in enumerate(dataset[split]):
        labels = example["span_labels"]
        spans = example["sentence_token_spans"]

        # Check for invalid labels (anything other than 0 or 1)
        if any(label not in (0, 1) for label in labels):
            print(f"Invalid labels at {split}[{i}]: {labels}")
            invalid_label_count += 1

        # Check for length mismatch between labels and spans
        if len(labels) != len(spans):
            print(f"Length mismatch at {split}[{i}]: {len(labels)} labels vs {len(spans)} spans")
            length_mismatch_count += 1

    # Show the success marker only when the split is actually clean, so a
    # failing split is not presented as a pass.
    split_is_clean = invalid_label_count == 0 and length_mismatch_count == 0
    status = "✅" if split_is_clean else "❌"
    print(f"{status} {split} summary: {invalid_label_count} invalid label entries, {length_mismatch_count} length mismatches")

    total_invalid_label_count += invalid_label_count
    total_length_mismatch_count += length_mismatch_count

# Overall summary
print("\nOverall Summary:")
print(f"Total entries with invalid labels: {total_invalid_label_count}")
print(f"Total entries with length mismatch: {total_length_mismatch_count}")



Checking split: train
✅ train summary: 0 invalid label entries, 0 length mismatches

Checking split: eval
✅ eval summary: 0 invalid label entries, 0 length mismatches

Checking split: test
✅ test summary: 0 invalid label entries, 0 length mismatches

Overall Summary:
Total entries with invalid labels: 0
Total entries with length mismatch: 0


In [39]:
# Convert the processed splits into the objects the trainer consumes: the call
# yields a training dataset, a test dataset and a test loader, in that order.
(
    train_dataset,
    test_dataset,
    test_loader,
) = data_processor.dataset_to_luminar_sequence_dataset(dataset)
print(train_dataset)

<luminar.utils.training.LuminarSequenceDataset object at 0x7ce53f2f1610>


In [40]:
# Training configuration named for Falcon features. It is defined here for
# reference; the training cell in this notebook passes `gpt2_config` instead.
# Compared with `gpt2_config` it uses 33 intermediate likelihoods, wider
# projection / LSTM layers and a lower learning rate.
falcon_config = LuminarSequenceTrainingConfig(
    feature_len=512,
    num_intermediate_likelihoods=33,
    conv_layer_shapes=(
        ConvolutionalLayerSpec(128, 5),
        ConvolutionalLayerSpec(256, 5),
    ),
    projection_dim=128,
    lstm_hidden_dim=256,
    lstm_layers=1,
    apply_delta_augmentation=False,
    apply_product_augmentation=True,
    max_epochs=115,
    kfold=3,
    early_stopping_patience=8,
    learning_rate=1.03e-3,
    batch_size=128,
    seed=42,
    rescale_features=False,
    stack_spans=3,
)

# Training configuration named for GPT-2 features; this is the configuration
# handed to the trainer in the training cell. It uses 13 intermediate
# likelihoods, a 64-dim projection and a 128-dim LSTM hidden state.
gpt2_config = LuminarSequenceTrainingConfig(
    feature_len=512,
    num_intermediate_likelihoods=13,
    conv_layer_shapes=(
        ConvolutionalLayerSpec(128, 5),
        ConvolutionalLayerSpec(256, 5),
    ),
    projection_dim=64,
    lstm_hidden_dim=128,
    lstm_layers=1,
    apply_delta_augmentation=False,
    apply_product_augmentation=True,
    max_epochs=115,
    kfold=3,
    early_stopping_patience=8,
    learning_rate=2.05e-3,
    batch_size=128,
    seed=42,
    rescale_features=False,
    stack_spans=3,
)

In [41]:
# Switch for experiment tracking; forwarded to the trainer in the next cell.
log_to_wandb = False

# Free as much memory as possible before training: release cached CUDA blocks,
# run Python's garbage collector, then - if a GPU is present - release cached
# blocks again and collect CUDA IPC memory on the current device.
torch.cuda.empty_cache()
gc.collect()
if torch.cuda.is_available():
    active_device = torch.cuda.current_device()
    with torch.cuda.device(active_device):
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

In [42]:
# Assemble the sequence trainer from the prepared training dataset, the test
# loader, the processor's collate function and the GPT-2 configuration.
# The device is queried again via get_best_device() rather than reusing the
# `device` variable from the setup cell.
trainer = LuminarSequenceTrainer(
    train_dataset=train_dataset,
    test_data_loader=test_loader,
    collate_fn=data_processor.collate_fn,
    log_to_wandb=log_to_wandb,
    config=gpt2_config,
    device=get_best_device(),
    use_experimental_attention=False,
)

# train() hands back a metrics mapping together with the best model.
metrics, best_model = trainer.train()

# Falls back to 0.0 when the metrics carry no "f1_score" entry.
avg_f1 = metrics.get("f1_score", 0.0)


LuminarSequenceTrainingConfig(feature_len=512, num_intermediate_likelihoods=13, apply_delta_augmentation=False, apply_product_augmentation=True, conv_layer_shapes=((128, 5, 1), (256, 5, 1)), projection_dim=64, lstm_hidden_dim=128, lstm_layers=1, stack_spans=3, dataset_root_path='/storage/projects/stoeckel/prismai/encoded/fulltext/', models_root_path='/storage/projects/boenisch/PrismAI/models/luminar_sequence/', domain='student_essays', agent='gpt_4o_mini_gemma2_9b', feature_agent='gpt2_512', max_epochs=115, batch_size=128, early_stopping_patience=8, rescale_features=False, kfold=3, learning_rate=0.00205, seed=42)

Epoch 1/115
Train Loss: 0.4740 | Eval Loss: 0.4010

Epoch 2/115
Train Loss: 0.3743 | Eval Loss: 0.3491

Epoch 3/115
Train Loss: 0.3482 | Eval Loss: 0.3301

Epoch 4/115
Train Loss: 0.3320 | Eval Loss: 0.3102

Epoch 5/115
Train Loss: 0.3178 | Eval Loss: 0.3131

Epoch 6/115
Train Loss: 0.3059 | Eval Loss: 0.3017

Epoch 7/115
Train Loss: 0.2964 | Eval Loss: 0.3287

Epoch 8/115
T

KeyboardInterrupt: 