In [1]:
!pip install datasets==2.16.0 transformers lightning torchmetrics -q

In [1]:
from datasets import load_dataset

dataset = load_dataset("stanfordnlp/imdb")

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
import torch
import torch.nn as nn
import lightning as L
from torch.utils.data import DataLoader,Dataset
import torchmetrics
from lightning.pytorch.loggers import CSVLogger
import os
import time
from torch.utils.data.dataset import random_split

In [5]:
def tokenize_ds(example):
    return tokenizer(example['text'],truncation=True,padding=True,max_length=tokenizer.model_max_length,return_tensors='pt')
ds = dataset.map(tokenize_ds,batched=True,batch_size=2000)

In [6]:
ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [7]:
del dataset

In [8]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
class IMDBDataset(Dataset):
    def __init__(self,partition,dataset=ds):
        super().__init__()
        self.partition = dataset[partition]
    def __len__(self):
        return len(self.partition)
    def __getitem__(self,idx):
        return self.partition[idx]

In [10]:
class IMDB_Lightning(L.LightningDataModule):
    def __init__(self,batch_size=64):
        super().__init__()
        self.batch_size = batch_size
    def setup(self,stage: str):
        self.train_ds = IMDBDataset('train')
        self.val_ds,self.test_ds = random_split(IMDBDataset('test'),lengths=[20000,5000])
    def prepare_data(self):
        return
    def train_dataloader(self):
        return DataLoader(self.train_ds,self.batch_size,shuffle=True,drop_last=True,num_workers=10)
    def test_dataloader(self):
        return DataLoader(self.test_ds,self.batch_size,shuffle=False,num_workers=10)
    def val_dataloader(self):
        return DataLoader(self.val_ds,self.batch_size,shuffle=False)

In [12]:
class LoRALayer(nn.Module):
    def __init__(self,inp_dim,out_dim,rank,alpha):
        super().__init__()
        sd = 1 / torch.tensor(rank,dtype=torch.float32) ** - 1/2
        self.A = torch.nn.Parameter(torch.randn(inp_dim,rank) * sd,requires_grad=True)
        self.B = torch.nn.Parameter(torch.zeros(rank,out_dim),requires_grad=True)
        self.alpha = alpha
    def forward(self,x):
        x = self.alpha * (x @ (self.A @ self.B))
        return x
    
class LoRALinearLayer(nn.Module):
    def __init__(self,linear,rank,alpha):
        super().__init__()
        self.linear = linear
        self.loralayer = LoRALayer(linear.in_features,linear.out_features,rank,alpha)
    def forward(self,x):
        x = self.linear(x) + self.loralayer(x)
        return x

In [11]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [12]:
# for p in model.parameters():
#     p.requires_grad = False
    
# for param in model.pre_classifier.parameters():
#     param.requires_grad = True

# for param in model.classifier.parameters():
#     param.requires_grad = True

# for p in model.distilbert.transformer.layer:
#     p.attention.q_lin = LoRALinearLayer(linear=p.attention.q_lin,rank=4,alpha=1)
#     p.attention.v_lin = LoRALinearLayer(linear=p.attention.v_lin,rank=4,alpha=1)

# model.classifier = LoRALinearLayer(linear=model.classifier,rank=4,alpha=1)

In [13]:
print("Finetuning the whole model")
print("Total params :",sum([p.numel() for p in model.parameters()]))
print("Trainable params :",sum([p.numel() for p in model.parameters() if p.requires_grad==True]))

Finetuning the whole model
Total params : 66955010
Trainable params : 66955010


In [14]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
class LightningBERT(L.LightningModule):
    def __init__(self,model,lr):
        super().__init__()
        self.lr = lr
        self.model = model
        self.training_acc = torchmetrics.Accuracy(task='multiclass',num_classes=2)
        self.val_acc = torchmetrics.Accuracy(task='multiclass',num_classes=2)
        self.test_acc = torchmetrics.Accuracy(task='multiclass',num_classes=2)
    def forward(self,input_ids,attention_mask,labels):
        return self.model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
    def training_step(self,batch,batch_idx):
        outputs = self(input_ids=batch['input_ids'],attention_mask=batch['attention_mask'],labels=batch['label'])
        loss = outputs['loss']
        logits = outputs['logits']
        preds = logits.argmax(dim=1)
        self.training_acc(preds,batch['label'])
        self.log('training_loss',loss,prog_bar=True,on_step=False,on_epoch=True,sync_dist=True)
        self.log('training_acc',self.training_acc,prog_bar=True,on_step=False,on_epoch=True,sync_dist=True)
        return loss
    def validation_step(self,batch,batch_idx):
        outputs = self(input_ids=batch['input_ids'],attention_mask=batch['attention_mask'],labels=batch['label'])
        loss = outputs['loss']
        logits = outputs['logits']
        preds = logits.argmax(dim=1)
        self.val_acc(preds,batch['label'])
        self.log('validation_loss',loss,prog_bar=True,on_step=False,on_epoch=True,sync_dist=True)
        self.log('validation_acc',self.val_acc,prog_bar=True,on_step=False,on_epoch=True,sync_dist=True)
    def test_step(self,batch,batch_idx):
        outputs = self(input_ids=batch['input_ids'],attention_mask=batch['attention_mask'],labels=batch['label'])
        loss = outputs['loss']
        logits = outputs['logits']
        preds = logits.argmax(dim=1)
        self.test_acc(preds,batch['label'])
        self.log('test_acc',self.test_acc)
    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(),lr=self.lr)
        return opt

In [16]:
numepochs = 3
# Instantiate the Lightning DataModule
dm = IMDB_Lightning(batch_size=64)
dm.setup(stage='fit')
total_steps = len(dm.train_dataloader()) * numepochs
logger = CSVLogger(save_dir="logs/", name="my-model")
# instantiate the models and define the trainer and begin training

lightning_bert = LightningBERT(model=model,lr=5e-5)
callback = L.pytorch.callbacks.ModelCheckpoint(save_top_k=1,mode='max',monitor='validation_acc',save_last=True)
trainer = L.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=numepochs,
    callbacks=[callback],
    logger=logger,
    precision='16-mixed'
)
s = time.time()
trainer.fit(
    model=lightning_bert,
    datamodule=dm,
)
e = time.time()
test_acc = trainer.test(dataloaders=dm.test_dataloader())[0]['test_acc']
print("Finetuning the whole model")
print("Total params :",sum([p.numel() for p in model.parameters()]))
print("Trainable params :",sum([p.numel() for p in model.parameters() if p.requires_grad==True]))
print(f"Training Time : {(e - s)/60 :.2f}")

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA A10G') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                                | Params
---------------------------------------------------------------------
0 | model        | DistilBertForSequenceClassification | 67.0 M
1 | training_acc | MulticlassAccuracy                  | 0     
2 | val_acc      | MulticlassAccuracy                  | 0     
3 | test_acc     | MulticlassAccuracy                  | 0     
---------------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.
Restoring states from the checkpoint path at logs/my-model/version_28/checkpoints/epoch=1-step=780.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_28/checkpoints/epoch=1-step=780.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Finetuning the whole model
Total params : 66955010
Trainable params : 66955010
Training Time : 10.92


In [21]:
test_acc = trainer.test(dataloaders=dm.test_dataloader())[0]['test_acc']
print("Training the last two layers")
print("Total params :",sum([p.numel() for p in model.parameters()]))
print("Trainable params :",sum([p.numel() for p in model.parameters() if p.requires_grad==True]))
print(f"Training Time : {(e - s)/60 :.2f}")



MisconfigurationException: No `test_step()` method defined to run `Trainer.test`.

In [None]:
test_acc = trainer.test(dataloaders=dm.test_dataloader())[0]['test_acc']

Restoring states from the checkpoint path at logs/my-model/version_7/checkpoints/epoch=2-step=1170.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_7/checkpoints/epoch=2-step=1170.ckpt


Validation: |          | 0/? [00:00<?, ?it/s]

KeyError: 'test_acc'

In [1]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined