In [1]:
import importlib
import os
from itertools import product

from dotenv import load_dotenv
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch import loggers as pl_loggers
from lightning.pytorch.tuner import Tuner

# Notebook-wide configuration: dotenv file location and the global random seed.
# NOTE(review): the path is "../env" (no leading dot) — confirm this is the intended
# file name and not a typo for "../.env".
ENV_FILE = "../env"
SEED = 42

# Load variables from the dotenv file into the process environment
# (presumably supplies MONGO_DB_CONNECTION, which is read further down — verify).
load_dotenv(ENV_FILE)
# Seed the random number generators; kept as the last expression so the cell
# still displays the seed value that seed_everything returns.
seed_everything(SEED)

Seed set to 42


42

In [2]:
import luminar.mongo

# Re-execute the module so that edits made to luminar/mongo on disk are picked up
# without restarting the kernel.
importlib.reload(luminar.mongo)
# Import the name only after the reload so it is bound to the reloaded class object.
from luminar.mongo import MongoDBAdapter

In [3]:
import luminar.document.data
import luminar.features

# Re-execute both modules so on-disk edits take effect without a kernel restart.
importlib.reload(luminar.document.data)
importlib.reload(luminar.features)

# These from-imports must follow the reloads so the names refer to the reloaded objects.
from luminar.document.data import DocumentClassificationDataModule
from luminar.features import OneDimFeatures, ThreeDimFeatures, TwoDimFeatures

In [4]:
import luminar.document.model

# Re-execute the module so on-disk edits take effect without a kernel restart.
importlib.reload(luminar.document.model)

# Import after the reload so the names refer to the reloaded objects.
from luminar.document.model import ConvolutionalLayerSpec, DocumentClassficationModel

In [5]:
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

In [11]:
# Feature specification shared by the data module and the model constructed below.
# NOTE(review): 256 is presumably the feature length — confirm against luminar.features.
feature_size = OneDimFeatures(256)
# Candidate values for `domain`. Kept as a named tuple so the selection below is
# made by name and does not depend on the position of an entry in the sequence.
DOMAINS = (
    "blog_authorship_corpus",
    "student_essays",
    "cnn_news",
    "bundestag",
    "spiegel_articles",
    "euro_court_cases",
    "house_of_commons",
    "arxiv_papers",
    "gutenberg",
)

# Domain used for this run. It is passed to the database adapter and used as the
# TensorBoard run name in later cells.
domain = "cnn_news"
if domain not in DOMAINS:
    # Fail early on a misspelled domain instead of querying with an unknown name.
    raise ValueError(f"Unknown domain {domain!r}; expected one of {DOMAINS}")

# Connection string is taken from the environment; os.getenv yields None when the
# variable is unset.
# NOTE(review): a missing MONGO_DB_CONNECTION is passed through as None — confirm
# MongoDBAdapter handles that as intended (e.g. when the dataset is already cached).
mongo_uri = os.getenv("MONGO_DB_CONNECTION")

# Adapter for the selected domain.
# NOTE(review): the positional strings look like a database name followed by
# collection names — confirm against MongoDBAdapter's signature.
db = MongoDBAdapter(
    mongo_uri,
    "prismai",
    "collected_items",
    "synthesized_texts",
    "log_likelihoods",
    domain=domain,
    source_collection_limit=1500,
)

# Data module built from the adapter and the feature specification, then prepared
# up front so the data is ready before training starts.
dm = DocumentClassificationDataModule(db, feature_size)
dm.prepare_data()

Caching Enabled - Dataset Already Cached


In [12]:
# Run the data module's setup for the "fit" stage explicitly, ahead of training.
dm.setup("fit")

Caching Enabled - Loading Dataset in setup(fit)


In [13]:
# Hyperparameters for this run: learning rate and projection dimension.
lr, pdim = 0.0005, 64

# The model is built first so that its construction happens before any other
# object in this cell is created.
model = DocumentClassficationModel(
    feature_size, projection_dim=pdim, learning_rate=lr, warmup_steps=100
)

# TensorBoard logs go to logs/<domain>/.
tb_logger = pl_loggers.TensorBoardLogger(save_dir="logs/", name=domain)

# Stop training once "roc_auc" has failed to improve (higher is better) for 3 checks.
# NOTE(review): assumes the model logs a metric named "roc_auc" during validation — confirm.
early_stopping = EarlyStopping(monitor="roc_auc", mode="max", patience=3)

trainer = Trainer(
    max_epochs=50,
    logger=tb_logger,
    gradient_clip_val=0.5,
    deterministic=True,
    callbacks=[early_stopping],
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [14]:
# Learning-rate tuning is currently disabled; re-enable by uncommenting:
# tuner = Tuner(trainer)

# Train with the data module, then run one validation pass on its validation loader.
trainer.fit(model, dm)
val_loader = dm.val_dataloader()
# Exactly one dataloader is passed, so exactly one result entry is expected; the
# single-element unpacking raises if that is not the case.
[metrics] = trainer.validate(model, dataloaders=[val_loader])

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/mastoeck/Projects/PrismAI/PrismAI/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.

  | Name        | Type              | Params | Mode  | In sizes   | Out sizes
-----------------------------------------------------------------------------------
0 | conv_layers | ModuleList        | 148 K  | train | ?          | ?        
1 | classifier  | Sequential        | 1.0 M  | train | [1, 16384] | [1, 1]   
2 | criterion   | BCEWithLogitsLoss | 0      | train | ?          | ?        
-----------------------------------------------------------------------------------
1.2 M     Trainable params
0         Non-trainable params
1.2 M     Total

Caching Enabled - Dataset Already Cached


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/mastoeck/Projects/PrismAI/PrismAI/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Caching Enabled - Dataset Already Cached


/home/mastoeck/Projects/PrismAI/PrismAI/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

In [15]:
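# Disabled: learning-rate finder. Running it requires a `Tuner(trainer)` instance
# named `tuner`, which is not created anywhere in the active code.
# tuner.lr_find(model, datamodule=dm)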