# About

Training of LuminarSequenceClassifier on the PrismAI dataset.

In [1]:
# Reload imported modules before each cell execution so that edits to the
# local `luminar` package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import torch
import wandb

from typing import Final, Callable
from pathlib import Path

sys.path.insert(0, str(Path().resolve().parent.parent))

from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, ConcatDataset
from IPython.display import display, Markdown
from datasets import load_dataset
from numpy._typing import NDArray
from torch.utils.data import Dataset, Subset
from datasets import DatasetDict
from luminar.utils.sequential_data import SequentialDataService
from luminar.utils.data import get_pad_to_fixed_length_fn, get_matched_datasets
from luminar.utils.cuda import get_best_device
from luminar.utils.training import ConvolutionalLayerSpec, LuminarSequenceTrainingConfig
from luminar.encoder import LuminarEncoder

import numpy as np
import glob

In [3]:
class Config:
    """Notebook-wide constants: credentials, dataset location and feature sizes."""

    # Optional Hugging Face token file. The token is not referenced anywhere
    # else in this notebook (datasets are read from local disk), so a missing
    # file must not prevent the notebook from running.
    _HF_TOKEN_FILE: Final[Path] = Path.home() / ".hf_token"
    HF_TOKEN: Final[str] = _HF_TOKEN_FILE.read_text().strip() if _HF_TOKEN_FILE.is_file() else ""
    #DATASET_PATH: Final[str] = "liberi-luminaris/PrismAI-encoded-gpt2"
    DATASET_ROOT_PATH: Final[str] = "/storage/projects/stoeckel/prismai/encoded/fulltext/"
    #DATASET_ROOT_PATH: Final[str] = "/mnt/c/home/projects/prismAI/data/encoded/fulltext/"
    NUM_INTERMEDIATE_LIKELIHOODS: Final[int] = 13 # gpt2=13, falcon=33
    FEATURE_LEN: Final[int] = 512
    BATCH_SIZE: Final[int] = 512
    SEED: Final[int] = 42


## Loading & Preprocessing the datasets

Load the pre-encoded datasets used for training, one per combination of domain, agent and feature agent.

In [4]:
# Domains, agents and feature agents to load; each combination maps to one
# dataset directory below Config.DATASET_ROOT_PATH.
domains = ["student_essays"]
agents = ["gpt_4o_mini_gemma2_9b"]
feature_agents = ["gpt2_512"]  # alternative: "falcon_7b_512"

# Encoder and data service shared by the following cells.
luminar_encoder = LuminarEncoder(max_len=Config.FEATURE_LEN)
data_service = SequentialDataService(
    luminar_encoder,
    Config.BATCH_SIZE,
    Config.FEATURE_LEN,
)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/staff_homes/kboenisc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Nested lookup table: datasets[domain][agent][feature_agent] -> loaded dataset.
datasets = {}

# Enumerate every (domain, agent, feature agent) combination up front so the
# loading logic itself is a single flat loop.
combinations = [
    (domain, agent, feature_agent)
    for domain in domains
    for agent in agents
    for feature_agent in feature_agents
]

for domain, agent, feature_agent in combinations:
    # On-disk layout: <root>/<agent>/<feature_agent>/<domain>
    dataset_path = Path(Config.DATASET_ROOT_PATH).joinpath(agent, feature_agent, domain)
    dataset_dict = data_service.load_dataset(dataset_path)

    by_agent = datasets.setdefault(domain, {})
    by_feature_agent = by_agent.setdefault(agent, {})
    by_feature_agent[feature_agent] = dataset_dict

    print(f"Loaded dataset for domain '{domain}' with agent '{agent}' and feature agent '{feature_agent}'")

datasets

Loading dataset from: /storage/projects/stoeckel/prismai/encoded/fulltext/gpt_4o_mini_gemma2_9b/gpt2_512/student_essays


Tokenizing, padding, and aligning sentences:   0%|          | 0/50734 [00:00<?, ? examples/s]

Tokenizing, padding, and aligning sentences:   0%|          | 0/14496 [00:00<?, ? examples/s]

Tokenizing, padding, and aligning sentences:   0%|          | 0/7248 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/50734 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/14496 [00:00<?, ? examples/s]

Assigning labels to sentence spans:   0%|          | 0/7248 [00:00<?, ? examples/s]

Loaded dataset for domain 'student_essays' with agent 'gpt_4o_mini_gemma2_9b' and feature agent 'gpt2_512'


{'student_essays': {'gpt_4o_mini_gemma2_9b': {'gpt2_512': DatasetDict({
       train: Dataset({
           features: ['agent', 'id_sample', 'id_source', 'labels', 'text', 'features', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
           num_rows: 50734
       })
       test: Dataset({
           features: ['agent', 'id_sample', 'id_source', 'labels', 'text', 'features', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
           num_rows: 14496
       })
       eval: Dataset({
           features: ['agent', 'id_sample', 'id_source', 'labels', 'text', 'features', 'tokenized_text', 'sentence_token_spans', 'span_labels'],
           num_rows: 7248
       })
   })}}}

In [7]:
# Sanity check: render split sizes and one training example for every loaded
# dataset so the features, tokens, sentence spans and labels can be inspected.
idx = 0
for domain, agents_dict in datasets.items():
    for agent, feature_agents_dict in agents_dict.items():
        for feature_agent, dataset in feature_agents_dict.items():
            # Fetch the example once instead of indexing the dataset again
            # for every field shown in the report.
            sample = dataset['train'][idx]
            token_ids = np.asarray(sample['tokenized_text'])

            # Decode the tokens covered by the first sentence span.
            first_span = sample['sentence_token_spans'][0]
            first_sentence = luminar_encoder.tokenizer.decode(
                token_ids[first_span[0]:first_span[1]].flatten().tolist()
            )

            md = f"""
**Domain:** `{domain}`
**Agent:** `{agent}`
**Feature Agent:** `{feature_agent}`

**Train:** {len(dataset['train'])}
**Test:** {len(dataset['test'])}
**Eval:** {len(dataset['eval'])}

**Example-Features:**
`{sample['features'][:2]}...`

**Feature-Shape:**
`{np.asarray(sample['features']).shape}`

**Example text:**
`{sample['text']}`

**Example-Tokenized Text:**
`{sample['tokenized_text'][:10]}...`

**Tokenized Text Shape:**
`{token_ids.shape}`

**Sentence-Token-Spans:**
`{sample['sentence_token_spans']}`

**Example Sentence-Token Span decoded:**
`{first_sentence}`

**Example span labels:**
`{sample['span_labels']}`

**Example label:**
`{sample['labels']}`

---
"""
            display(Markdown(md))


**Domain:** `student_essays`
**Agent:** `gpt_4o_mini_gemma2_9b`
**Feature Agent:** `gpt2_512`

**Train:** 50734
**Test:** 14496
**Eval:** 7248

**Example-Features:**
`[[4.4132913899375126e-05, 4.203895392974451e-45, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.00015184479707386345], [9.98157247522613e-06, 2.268475629563227e-09, 2.4706817147723825e-10, 2.598772308459729e-10, 4.608947598572222e-11, 2.177385494128714e-10, 7.711160811448015e-13, 6.366168985201537e-13, 2.281021511211531e-17, 9.533206545892253e-25, 7.987888483894126e-31, 0.0, 0.029878340661525726]]...`

**Feature-Shape:**
`(512, 13)`

**Example text:**
`[Your Name] [Your Address] [City, State, Zip Code] [Email Address] [Date] The Honorable [Senator's Name] [Senator's Office Address] [City, State, Zip Code] Dear Senator [Senator's Last Name], I hope this letter finds you well. I’m writing to you today as a concerned citizen who’s really worried about the way our presidential elections work. Honestly, I can’t wrap my head around the Electoral College anymore. It feels like an outdated system that just doesn’t represent us, the people. I mean, how is it fair that a candidate can win the presidency without winning the popular vote? It’s like saying my vote doesn’t count just because I live in a state that leans one way or another. That doesn’t seem right, does it? We should all have an equal say in who leads our country, and the popular vote is the only way to truly reflect the will of the people. The Electoral College creates a situation where some votes are more valuable than others, and that’s just plain wrong. It leads to candidates ignoring vast swathes of the country because they know they won’t win those states. Isn’t it time we changed that? We need a system that encourages candidates to engage with all of us, not just the folks in swing states. I urge you to consider advocating for a popular vote system. It’s time for our elections to truly represent the voice of the people. Thank you for your time, and I hope you’ll take my concerns to heart.`

**Example-Tokenized Text:**
`[[58], [7120], [6530], [60], [685], [7120], [17917], [60], [685], [14941]]...`

**Tokenized Text Shape:**
`(512, 1)`

**Sentence-Token-Spans:**
`[[0, 61], [61, 87], [87, 103], [103, 121], [121, 141], [141, 168], [168, 179], [179, 209], [209, 233], [233, 256], [256, 266], [266, 288], [288, 300], [300, 317], [317, 336]]`

**Example Sentence-Token Span decoded:**
`[Your Name] [Your Address] [City, State, Zip Code] [Email Address] [Date] The Honorable [Senator's Name] [Senator's Office Address] [City, State, Zip Code] Dear Senator [Senator's Last Name], I hope this letter finds you well.`

**Example span labels:**
`[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]`

**Example label:**
`1`

---


# Data Loaders

Build the training datasets and the test data loaders, both per domain and combined across all domains.

In [8]:
# Entries are (domain, feature_agent, dataset-or-loader) tuples; the training
# cell reads element [0] as the domain and element [1] as the feature agent.
train_datasets = []
test_loaders = []

# Raw parts collected for the combined "all" entry appended after the loop.
all_train_parts = []
all_test_parts = []

for domain in domains:
    # Iterate the configured agents instead of hard-coding a single agent key,
    # so this cell stays in sync with the cell that filled `datasets`.
    for agent in agents:
        for feature_agent in feature_agents:
            print(f"Creating datasets for domain: {domain}")

            train_dataset, test_dataset, test_loader = data_service.dataset_to_luminar_sequence_dataset(
                datasets[domain][agent][feature_agent]
            )

            print(f"Train Dataset: {len(train_dataset)}")
            train_datasets.append((domain, feature_agent, train_dataset))
            all_train_parts.append(train_dataset)

            print(f"Test Dataset: {len(test_dataset)}")
            test_loaders.append((domain, feature_agent, test_loader))
            all_test_parts.append(test_dataset)

# Now concat all
all_domains_train_datasets = ConcatDataset(all_train_parts)
all_domains_test_dataset = ConcatDataset(all_test_parts)
all_domains_test_loader = DataLoader(
    all_domains_test_dataset,
    batch_size=data_service.batch_size,
    shuffle=False,
    collate_fn=data_service._collate_fn,
)

print(f"All Train: {len(all_domains_train_datasets)}")
train_datasets.append(("all", "all", all_domains_train_datasets))

print(f"All Test: {len(all_domains_test_dataset)}")
test_loaders.append(("all", "all", all_domains_test_loader))

Creating datasets for domain: student_essays
Train Dataset: 57982
Test Dataset: 14496
All Train: 57982
All Test: 14496


# Training

Train the model using K-Fold Cross Validation on the training dataset.

In [None]:
from luminar.sequence_trainer import LuminarSequenceTrainer

# Device handed to the trainer in the training cell.
device = get_best_device()

# Hyperparameters for the sequence classifier; sizes, batch size, seed and the
# dataset root come from the notebook-wide Config.
config = LuminarSequenceTrainingConfig(
    # Input features
    feature_len=Config.FEATURE_LEN,
    num_intermediate_likelihoods=Config.NUM_INTERMEDIATE_LIKELIHOODS,
    # Model architecture
    conv_layer_shapes=(
        ConvolutionalLayerSpec(64, 5),
        ConvolutionalLayerSpec(128, 5),
        ConvolutionalLayerSpec(64, 3),
    ),
    projection_dim=64,
    lstm_hidden_dim=64,
    lstm_layers=1,
    # Feature augmentation
    apply_delta_augmentation=False,
    apply_product_augmentation=False,
    # Optimisation schedule
    max_epochs=120,
    kfold=3,
    early_stopping_patience=17,
    learning_rate=7e-4,
    batch_size=Config.BATCH_SIZE,
    seed=Config.SEED,
    # Data handling
    rescale_features=False,
    stack_spans=0,
    dataset_root_path=Config.DATASET_ROOT_PATH,
)
print(config)

LuminarSequenceTrainingConfig(feature_len=512, num_intermediate_likelihoods=13, apply_delta_augmentation=False, apply_product_augmentation=True, conv_layer_shapes=((64, 5, 1), (128, 5, 1), (64, 3, 1)), projection_dim=64, lstm_hidden_dim=64, lstm_layers=1, stack_spans=0, dataset_root_path='/storage/projects/stoeckel/prismai/encoded/fulltext/', models_root_path='/storage/projects/boenisch/PrismAI/models/luminar_sequence/', domain='student_essays', agent='gpt_4o_mini_gemma2_9b', feature_agent='gpt2_512', max_epochs=120, batch_size=512, early_stopping_patience=17, rescale_features=False, kfold=3, learning_rate=0.0007, seed=42)


In [13]:
# Toggle for Weights & Biases tracking: when True, the training cell starts a
# wandb run, logs the objective F1 and uploads the saved model as an artifact.
log_to_wandb = False

In [14]:
from datetime import datetime  # used only to label runs when wandb is disabled

# Train one model per (train dataset, test loader) pair. Both lists hold
# (domain, feature_agent, dataset-or-loader) tuples in matching order.
for train_entry, test_entry in zip(train_datasets, test_loaders):
    run_domain, run_feature_agent, train_dataset = train_entry
    test_loader = test_entry[2]

    config.domain = run_domain
    print(f"Training on domain: {config.domain}")
    config.feature_agent = run_feature_agent

    # Reset the random seed for reproducibility
    torch.manual_seed(Config.SEED)
    np.random.seed(Config.SEED)

    print(f"Train Dataset Size: {len(train_dataset)}")
    print(f"Test Dataset Size: {len(test_loader.dataset)}")

    # Init wandb; `run` stays None when tracking is disabled.
    run = None
    if log_to_wandb:
        run = wandb.init(
            project="Luminar",
            config=config.__dict__,
            reinit=True,
            group="notebook_runs",
            # Previously the feature agent was interpolated twice and the
            # agent was missing from the run name.
            name=f"notebook_{config.domain}_{config.agent}_{config.feature_agent}",
        )

    trainer = LuminarSequenceTrainer(
        train_dataset=train_dataset,
        test_data_loader=test_loader,
        collate_fn=data_service._collate_fn,
        log_to_wandb=log_to_wandb,
        config=config,
        device=device,
        use_experimental_attention=True,
    )

    metrics, best_model = trainer.train()
    avg_f1 = metrics.get("f1_score", 0.0)
    if log_to_wandb:
        wandb.log({"objective_f1": avg_f1})

    # Save model locally. Without an active wandb run there is no run id
    # (`wandb.run` is None), so fall back to a timestamped local id instead of
    # failing with AttributeError after training has finished.
    run_id = run.id if run is not None else f"local-{datetime.now():%Y%m%d-%H%M%S}"
    model_store_path = Path(config.models_root_path) / config.domain / run_id
    best_model.save(model_store_path)

    # Upload to wandb as an artifact
    if log_to_wandb:
        artifact = wandb.Artifact(name=f"luminar-sequence-{run_id}", type="model")
        artifact.add_dir(str(model_store_path))
        wandb.log_artifact(artifact)
        wandb.finish()

Training on domain: student_essays
Train Dataset Size: 57982
Test Dataset Size: 14496


Epoch 1/120
Train Loss: 0.6896 | Eval Loss: 0.6913

Epoch 2/120
Train Loss: 0.6898 | Eval Loss: 0.6889

Epoch 3/120
Train Loss: 0.6895 | Eval Loss: 0.6889

Epoch 4/120


KeyboardInterrupt: 