In [5]:
from pytorch_lightning.demos import Transformer,WikiText2
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader,random_split

In [7]:
from typing import Any


class LanguageModel(pl.LightningModule):
    """LightningModule that trains the demo ``Transformer`` as a language model.

    The training, validation and test steps all compute the same negative
    log-likelihood loss via ``forward`` and log it under a stage-specific
    metric name (``train_loss`` / ``val_loss`` / ``test_loss``).
    """

    def __init__(self, vocab_size):
        """Create the wrapped model.

        Args:
            vocab_size: forwarded to ``Transformer(vocab_size=...)``.
        """
        super().__init__()
        self.model = Transformer(vocab_size=vocab_size)

    def forward(self, batch, batch_idx=None):
        """Compute the loss for one batch.

        Args:
            batch: an ``(inputs, target)`` pair as yielded by the dataloader.
            batch_idx: unused; accepted (and optional) so the step methods can
                pass their arguments straight through.

        Returns:
            The scalar loss from ``F.nll_loss``.
        """
        # `inputs` rather than `input` so the Python builtin is not shadowed.
        inputs, target = batch
        output = self.model(inputs, target)
        # The target is flattened to 1-D to line up with the model output.
        # NOTE(review): assumes the demo Transformer returns log-probabilities
        # already flattened to one row per target token -- confirm.
        return F.nll_loss(output, target.view(-1))

    def training_step(self, batch, batch_idx):
        """Run one training batch, log ``train_loss`` and return the loss."""
        loss = self(batch, batch_idx)
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Run one validation batch, log ``val_loss`` and return the loss."""
        loss = self(batch, batch_idx)
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        """Run one test batch, log ``test_loss`` and return the loss."""
        loss = self(batch, batch_idx)
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Return a plain SGD optimizer over all parameters (lr=0.1)."""
        return torch.optim.SGD(self.parameters(), lr=0.1)


In [8]:
# Fix the global seed before splitting so the random split is meant to be
# repeatable across runs.
pl.seed_everything(42)

dataset = WikiText2()
n = len(dataset)

# Hold out the same number of samples for validation and for test;
# everything else goes to training.
HOLDOUT_SIZE = 2000
split_sizes = [n - 2 * HOLDOUT_SIZE, HOLDOUT_SIZE, HOLDOUT_SIZE]
train_set, val_set, test_set = random_split(dataset, split_sizes)

Seed set to 42


In [11]:
# All three loaders share one batch size; only the training loader shuffles.
BATCH_SIZE = 128
eval_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

train_loader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
val_loader = DataLoader(val_set, **eval_loader_kwargs)
test_loader = DataLoader(test_set, **eval_loader_kwargs)

In [12]:
# Size the model's vocabulary from the dataset it will be trained on.
model = LanguageModel(vocab_size=dataset.vocab_size)

# Train for a fixed number of epochs with gradient clipping enabled.
trainer = pl.Trainer(
    max_epochs=5,
    gradient_clip_val=0.25,
)
trainer.fit(
    model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name  | Type        | Params
--------------------------------------
0 | model | Transformer | 14.6 M
--------------------------------------
14.6 M    T

Epoch 4: 100%|██████████| 435/435 [00:17<00:00, 24.46it/s, v_num=9, tran_loss=4.890, val_loss=4.710]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 435/435 [00:18<00:00, 24.15it/s, v_num=9, tran_loss=4.890, val_loss=4.710]


In [13]:
# Evaluate the trained model on the held-out test split; the returned list of
# metric dicts is the cell's displayed output.
trainer.test(model, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing DataLoader 0: 100%|██████████| 16/16 [00:00<00:00, 138.54it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           4.6881327629089355
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 4.6881327629089355}]