### Import Libraries

In [1]:
import warnings

warnings.filterwarnings("ignore")

In [2]:
import hydra
from hydra import initialize, compose
import torch
import matplotlib.pyplot as plt
import lightning as L
from lightning.pytorch.tuner import Tuner
from lightning.pytorch.callbacks import ModelSummary, ModelCheckpoint


In [3]:
from src.models.transformer import Transformer
from src.data.bert_data import BertDataModule

### Load Config

In [4]:
# Hydra keeps a process-wide singleton, and calling `initialize` a second time
# in the same kernel raises ValueError. Clearing the singleton up front makes
# this cell safe to re-run without an exception-driven retry (which would also
# fire on any unrelated ValueError). `clear()` is a no-op on a fresh kernel.
hydra.core.global_hydra.GlobalHydra.instance().clear()
initialize(version_base="1.3", config_path=".", job_name="all")

# Compose the config named "config", looked up relative to `config_path` above.
cfg = compose(config_name="config")

### Load the Data

In [5]:
# Both the training corpus and the vocabulary file live under the configured
# data directory.
data_dir = cfg.bert.data

datamodule = BertDataModule(
    trainpth=f"{data_dir}/training.txt",
    vocabpth=f"{data_dir}/vocab.txt",
    seq_len=cfg.bert.seq_len,
    n_vocab=cfg.bert.n_vocab,
    batch_size=cfg.bert.batch_size,
)

# Run setup explicitly here, before the datamodule is handed to the trainer.
datamodule.setup()

### Initialize the model

In [6]:
# Every hyperparameter comes from the `bert` section of the composed config.
bert_cfg = cfg.bert

bert = Transformer(
    arch="bert",
    # Model dimensions
    d_model=bert_cfg.embed_size,
    d_ff=bert_cfg.inner_ff_size,
    max_len=bert_cfg.seq_len,
    num_heads=bert_cfg.n_heads,
    num_layers=bert_cfg.n_layers,
    num_classes=bert_cfg.n_vocab,
    dropout=bert_cfg.dropout,
    # Optimisation settings
    lr=bert_cfg.lr,
    weight_decay=bert_cfg.weight_decay,
    betas=bert_cfg.betas,
)

In [7]:
# Callbacks handed to the Trainer below.
callbacks = [
    # Print the module tree up to three levels deep when training starts.
    ModelSummary(max_depth=3),
    # Write checkpoints under ./model_checkpoints/bert, ranking them by the
    # lowest "train_loss" and additionally keeping the most recent one.
    # NOTE(review): assumes the model logs a metric named "train_loss";
    # confirm in src.models.transformer.
    ModelCheckpoint(
        dirpath="./model_checkpoints/bert",  # plain string: no placeholders, so no f-prefix
        filename="bert_{epoch}",
        monitor="train_loss",
        mode="min",
        save_last=True,
    ),
]

### Initialize trainer

In [8]:
# Trainer settings gathered in one place so they are easy to scan and tweak.
trainer_options = dict(
    accelerator="gpu",
    devices=[1],  # a list selects GPUs by index, so this run is pinned to GPU 1
    num_nodes=1,
    max_epochs=10,
    limit_val_batches=0,  # 0 means no validation batches are run
    callbacks=callbacks,
)

trainer = L.Trainer(**trainer_options)

Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Begin Training

In [9]:
# Start training; the datamodule supplies the dataloaders.
trainer.fit(bert, datamodule=datamodule)

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /home/ravi.naik/learning/era/s17/s17lit/lightning_logs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

   | Name                             | Type                    | Params
------------------------------------------------------------------------------
0  | embedding                        | Embeddings              | 5.1 M 
1  | embedding.lut                    | Embedding               | 5.1 M 
2  | pe                               | PositionalEncoding      | 0     
3  | embedding_dropout                | Dropout                 | 0     
4  | transformer_layers               | Sequential              | 1.6 M 
5  | transformer_layers.0             | TransformerLayer        | 198 K 
6  | transformer_layers.0.mha         | MultiheadAttention      | 66.0 K
7  | transformer_layers.0.ln1         | LayerNorm               | 256   
8  | transformer_layers.0.ln2         | LayerNorm               | 256   
9  | transformer_layers.0.feedforward | PositionwiseFeedForward | 131 K 
10 | transformer_layers.1             | TransformerLayer        | 198 K 


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
