### Import Libraries

In [1]:
import warnings

warnings.filterwarnings("ignore")


In [2]:
import hydra
from hydra import initialize, compose
import torch
import matplotlib.pyplot as plt
import lightning as L
from lightning.pytorch.tuner import Tuner
from lightning.pytorch.callbacks import ModelSummary, ModelCheckpoint

In [3]:
from src.models.transformer import Transformer
from src.data.gpt_data import GPTDataModule


### Load Config

In [4]:
# Hydra keeps a single process-wide GlobalHydra instance. Re-running this cell
# in the same kernel would try to initialize it a second time, so reset it
# explicitly when it is already set up. Checking the state (instead of catching
# ValueError) means an unrelated ValueError raised by `initialize` is not
# mistaken for the "already initialized" case.
global_hydra = hydra.core.global_hydra.GlobalHydra.instance()
if global_hydra.is_initialized():
    global_hydra.clear()

# config_path="." points Hydra at the directory of this notebook's config files.
initialize(version_base="1.3", config_path=".", job_name="all")

# Compose the top-level config named "config"; later cells read `cfg.gpt.*`.
cfg = compose(config_name="config")


### Load the Data

In [5]:
# Location of the English text file inside the configured data directory.
english_corpus_path = f"{cfg.gpt.data}/english.txt"

# NOTE(review): `path_do_data` looks like a typo of `path_to_data`; it is kept
# as-is because it has to match the keyword GPTDataModule accepts -- confirm
# against src/data/gpt_data.py.
datamodule = GPTDataModule(
    batch_size=cfg.gpt.batch_size,
    seq_len=cfg.gpt.seq_len,
    path_do_data=english_corpus_path,
)

# Call setup() up front, before the model and trainer are created.
datamodule.setup()


Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


### Initialize the model

In [6]:
# Seed the Python, NumPy and PyTorch RNGs before the model is built so that
# weight initialisation starts from the same state on every
# Restart Kernel -> Run All. `workers=True` also derives dataloader worker
# seeds from this value.
SEED = 42
L.seed_everything(SEED, workers=True)

# Build the Transformer in its "gpt" configuration. Every architecture and
# optimiser hyperparameter is read from the `gpt` section of the Hydra config;
# the maximum length matches the datamodule's `seq_len` and the number of
# output classes is the vocabulary size.
gpt = Transformer(
    arch="gpt",
    d_model=cfg.gpt.embed_size,
    max_len=cfg.gpt.seq_len,
    num_heads=cfg.gpt.n_heads,
    num_layers=cfg.gpt.n_layers,
    num_classes=cfg.gpt.n_vocab,
    d_ff=cfg.gpt.inner_ff_size,
    dropout=cfg.gpt.dropout,
    lr=cfg.gpt.lr,
    weight_decay=cfg.gpt.weight_decay,
    betas=cfg.gpt.betas,
)


In [7]:
# Callbacks handed to the Trainer:
#   * ModelSummary prints the module tree down to three levels of nesting
#     before training starts.
#   * ModelCheckpoint writes checkpoints under ./model_checkpoints/gpt, ranks
#     them by the lowest "train_loss", and additionally keeps the most recent
#     state via `save_last=True`.
# NOTE(review): assumes the model logs a metric named "train_loss" -- confirm
# in the Transformer LightningModule.
callbacks = [
    ModelSummary(max_depth=3),
    ModelCheckpoint(
        # Plain string: there is nothing to interpolate, so no f-prefix.
        dirpath="./model_checkpoints/gpt",
        # "{epoch}" is a Lightning filename template, filled in at save time.
        filename="gpt_{epoch}",
        monitor="train_loss",
        mode="min",
        save_last=True,
    ),
]


### Initialize trainer

In [8]:
# Single-node GPU run for a fixed number of epochs.
trainer = L.Trainer(
    # Schedule
    max_epochs=10,
    limit_val_batches=0,  # zero validation batches: the run is training-only
    # Hardware
    accelerator="gpu",
    devices=[1],  # list form selects the GPU with index 1, not "one GPU"
    num_nodes=1,
    # Hooks
    callbacks=callbacks,
)


Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


### Begin Training

In [9]:
# Train `gpt` on the batches provided by `datamodule`.
trainer.fit(gpt, datamodule=datamodule)


You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

   | Name                             | Type                    | Params
------------------------------------------------------------------------------
0  | embedding                        | Embeddings              | 23.4 M
1  | embedding.lut                    | Embedding               | 23.4 M
2  | pe                               | PositionalEncoding      | 0     
3  | embedding_dropout                | Dropout                 | 0     
4  | transformer_layers               | Sequential              | 42.5 M
5  | transformer_layers.0             | TransformerLayer        | 7.1 M 
6  | transformer_layers.0.mha         | MultiheadAttention      | 2.4 M 
7  | transformer_layers.0.ln1         | LayerNorm               | 1.5 K 
8  | transformer_layers.0.ln2         | LayerNorm               | 1.5 K 
9  | transformer_layers.0.feedforward | PositionwiseFeedForward | 4.7 M 
10 | transformer_layers.1             | TransformerLayer        | 7.1 M 


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
