In [1]:
%load_ext autoreload
%autoreload

In [2]:
import sys
import torch
import wandb
import gc
from pathlib import Path

# Put the parent of the current working directory at the front of the import
# path so the project packages imported below can be resolved
# (presumably this is the repository root — confirm against the repo layout).
sys.path.insert(0, str(Path().resolve().parent))

from luminar.utils import get_best_device
from luminar.utils import LuminarSequenceTrainingConfig, ConvolutionalLayerSpec
from luminar.sequence_trainer import LuminarSequenceTrainer
from luminar.encoder import LuminarEncoder
from data_hub.sequential_data_processor import SequentialDataProcessor
from data_hub.hub import DataHub

# Select the compute device once; the encoder further down is built on it.
device = get_best_device()
print(device)

cuda:0


In [3]:
# The Hugging Face access token is read from a file in the user's home
# directory so that it never appears in the notebook itself.
hf_token_path = Path.home() / ".hf_token"
data_hub = DataHub(hf_token_path.read_text().strip())

# Fetch the splits of the pre-encoded RAID dataset and show their layout.
dataset = data_hub.get_splits("TheItCrOw/RAID_none-encoded-gpt2")
print(dataset)

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/57 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/55 [00:00<?, ?it/s]

Label ID mapping:
0 → human
1 → ai
2 → fusion
train distribution:
  ai: 444035 (96.9%)
  human: 14402 (3.1%)
eval distribution:
  ai: 63434 (96.9%)
  human: 2057 (3.1%)
test distribution:
  ai: 126867 (96.9%)
  human: 4115 (3.1%)
DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'offset_mapping', 'length', 'features', 'feature_length'],
        num_rows: 458437
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'offset_mapping', 'length', 'features', 'feature_length'],
        num_rows: 65491
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'offset_mapping', 'length', 'features', 'feature_length'],
        num_rows: 130982
    })
})


In [4]:
# Build the GPT-2 based encoder on the selected device and wrap it in the
# sequential data processor.
encoder = LuminarEncoder("gpt2", device=device)
data_processor = SequentialDataProcessor(encoder)

# NOTE(review): this rebinds `dataset` to its processed form, so re-running
# this cell without re-running the loading cell feeds processed data back in.
dataset = data_processor.process_for_training(dataset)
print(dataset)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/staff_homes/kboenisc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenizing text and extracting offsets:   0%|          | 0/458437 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 1024). Running this sequence through the model will result in indexing errors


Tokenizing text and extracting offsets:   0%|          | 0/65491 [00:00<?, ? examples/s]

Tokenizing text and extracting offsets:   0%|          | 0/130982 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/458437 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/65491 [00:00<?, ? examples/s]

Assigning sentence spans to tokenized text:   0%|          | 0/130982 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/458437 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/65491 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/130982 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 458437
    })
    eval: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 65491
    })
    test: Dataset({
        features: ['id', 'text', 'domain', 'date', 'source', 'lang', 'label', 'agent', 'type', 'length', 'features', 'feature_length', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
        num_rows: 130982
    })
})


In [5]:
# Convert the processed splits into the objects consumed by the trainer:
# a training dataset, a test dataset and a loader over the test data.
(
    train_dataset,
    test_dataset,
    test_loader,
) = data_processor.dataset_to_luminar_sequence_dataset(dataset)
train_dataset

<luminar.utils.training.LuminarSequenceDataset at 0x7b5de5dd50a0>

In [6]:
# Training configuration kept for the Falcon feature model.
# It is not passed to the trainer in this notebook; `gpt2_config` is.
falcon_config = LuminarSequenceTrainingConfig(
    # --- model architecture ---
    feature_len=512,
    num_intermediate_likelihoods=33,
    conv_layer_shapes=(
        ConvolutionalLayerSpec(128, 5),
        ConvolutionalLayerSpec(256, 5),
    ),
    projection_dim=128,
    lstm_hidden_dim=256,
    lstm_layers=1,
    # --- feature augmentation ---
    apply_delta_augmentation=False,
    apply_product_augmentation=True,
    # --- optimisation / training loop ---
    max_epochs=115,
    kfold=3,
    early_stopping_patience=8,
    learning_rate=1.03e-3,
    batch_size=64,
    seed=42,
    # --- input handling ---
    rescale_features=False,
    stack_spans=3,
)

# Training configuration for the GPT-2 feature model; this is the config
# handed to the trainer further down.
gpt2_config = LuminarSequenceTrainingConfig(
    # --- model architecture ---
    feature_len=512,
    num_intermediate_likelihoods=13,
    conv_layer_shapes=(
        ConvolutionalLayerSpec(32, 5),
        ConvolutionalLayerSpec(64, 5),
        ConvolutionalLayerSpec(32, 3),
    ),
    projection_dim=64,
    lstm_hidden_dim=256,
    lstm_layers=1,
    # --- feature augmentation ---
    apply_delta_augmentation=True,
    apply_product_augmentation=False,
    # --- sampling: the label distribution printed at load time is heavily
    # skewed towards "ai", and weighted sampling is switched on here ---
    weighted_sampling=True,
    # --- optimisation / training loop ---
    max_epochs=100,
    kfold=3,
    early_stopping_patience=8,
    learning_rate=0.004,
    batch_size=64,
    seed=42,
    # --- input handling ---
    rescale_features=False,
    stack_spans=5,
)

In [7]:
# Toggle Weights & Biases logging for the training run below.
log_to_wandb = False

# Free memory left over from earlier cells before training starts.
# Run the Python garbage collector first so that tensors which are no longer
# referenced are actually destroyed; only then can the CUDA caching allocator
# hand their blocks back to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Also reclaim memory held through CUDA IPC handles.
    torch.cuda.ipc_collect()

In [8]:
# Train the sequence model with the GPT-2 configuration.
# The trainer is placed on the same `device` the encoder was built on rather
# than querying get_best_device() a second time, so both are guaranteed to
# agree even if the helper's answer were to change between calls.
trainer = LuminarSequenceTrainer(
    train_dataset=train_dataset,
    test_data_loader=test_loader,
    collate_fn=data_processor.collate_fn,
    log_to_wandb=log_to_wandb,
    config=gpt2_config,
    device=device,
    use_experimental_attention=False,
)

# train() yields the collected metrics together with the best model.
metrics, best_model = trainer.train()

# Falls back to 0.0 when the metrics carry no "f1_score" entry.
avg_f1 = metrics.get("f1_score", 0.0)
avg_f1


Applying weighted_sampling to this fold.
[Fold 1] Weighted sampling ON | samples pos/neg = 338183/11102 | spans pos/neg = 3700431/144711
LuminarSequenceTrainingConfig(feature_len=512, num_intermediate_likelihoods=13, apply_delta_augmentation=True, apply_product_augmentation=False, weighted_sampling=True, conv_layer_shapes=((32, 5, 1), (64, 5, 1), (32, 3, 1)), projection_dim=64, lstm_hidden_dim=256, lstm_layers=1, stack_spans=5, hf_dataset='liberi-luminaris/Ghostbuster-encoded-gpt2', dataset_root_path='/storage/projects/stoeckel/prismai/encoded/fulltext/', models_root_path='/storage/projects/boenisch/PrismAI/models/luminar_sequence/', domain=None, agent='gpt_4o_mini_gemma2_9b', feature_agent='gpt2', max_epochs=100, batch_size=64, early_stopping_patience=8, rescale_features=False, kfold=3, learning_rate=0.004, seed=42)

Epoch 1/100
Train Loss: 0.4511 | Eval Loss: 0.3548

Epoch 2/100
Train Loss: 0.3631 | Eval Loss: 0.3322

Epoch 3/100
Train Loss: 0.3461 | Eval Loss: 0.2973

Epoch 4/100
T

KeyboardInterrupt: 