<a href="https://colab.research.google.com/github/TheRadDani/Pytorch-Lightning-DataModule/blob/main/PytorchLighningDataModule.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! pip install datasets --quiet

In [4]:
import datasets
import pandas as pd
from datasets import load_dataset

In [5]:
cola_dataset = load_dataset("glue", "cola")

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/251k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/37.6k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/37.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8551 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1043 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1063 [00:00<?, ? examples/s]

In [6]:
cola_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [7]:
train_dataset = cola_dataset["train"]
val_dataset = cola_dataset["validation"]
test_dataset = cola_dataset["test"]

In [9]:
len(train_dataset), len(val_dataset), len(test_dataset)

(8551, 1043, 1063)

In [10]:
train_dataset[0]

{'sentence': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1,
 'idx': 0}

In [11]:
train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['unacceptable', 'acceptable'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [18]:
train_dataset.filter(lambda example: example["label"] == train_dataset.features['label'].str2int('unacceptable'))[:5]

{'sentence': ['They drank the pub.',
  'The professor talked us.',
  'We yelled ourselves.',
  'We yelled Harry hoarse.',
  'Harry coughed himself.'],
 'label': [0, 0, 0, 0, 0],
 'idx': [18, 20, 22, 23, 25]}

## Tokenizing

In [19]:
from transformers import AutoTokenizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [20]:
tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

config.json:   0%|          | 0.00/382 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [21]:
train_dataset = cola_dataset["train"]
val_dataset = cola_dataset["validation"]
test_dataset = cola_dataset["test"]

In [22]:
tokenizer

BertTokenizerFast(name_or_path='google/bert_uncased_L-2_H-128_A-2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [24]:
print(train_dataset[0]["sentence"])
tokenizer(train_dataset[0]["sentence"])

Our friends won't buy this analysis, let alone the next one we propose.


{'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [27]:
tokenizer.decode(tokenizer(train_dataset[0]["sentence"])['input_ids'])

"[CLS] our friends won't buy this analysis, let alone the next one we propose. [SEP]"

In [28]:
def encode(examples):
  return tokenizer(
      examples["sentence"],
      truncation=True,
      padding="max_length",
      max_length=512,
  )

In [33]:
train_dataset

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 8551
})

In [34]:
train_dataset = train_dataset.map(encode, batched=True)
train_dataset

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 8551
})

In [30]:
import torch

In [35]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [36]:
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)

In [37]:
next(iter(dataloader))

{'label': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
         1, 0, 0, 1, 1, 1, 1, 1]),
 'input_ids': tensor([[  101,  2256,  2814,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         [  101,  2028,  2062,  ...,     0,     0,     0],
         ...,
         [  101,  5965, 12808,  ...,     0,     0,     0],
         [  101,  2198, 10948,  ...,     0,     0,     0],
         [  101,  3021, 24471,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [40]:
for batch in dataloader:
  print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)

torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32])
torch.Size([32, 512]) to

In [42]:
! pip install pytorch-lightning --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/815.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m809.0/815.2 kB[0m [31m26.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m815.2/815.2 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/890.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.5/890.5 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [79]:
import torch
import datasets
import pytorch_lightning as pl

from datasets import load_dataset
from transformers import AutoTokenizer

class DataModule(pl.LightningDataModule):
  def __init__(self, model_name:str="google/bert_uncased_L-2_H-128_A-2", batch_size:int = 32):
    super(DataModule, self).__init__()

    self.batch_size = batch_size
    self._model_name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(self._model_name)

  def prepare_data(self) -> None:
    cola_dataset = load_dataset("glue", "cola")
    self.train_dataset = cola_dataset["train"]
    self.val_dataset = cola_dataset["validation"]
    self.test_dataset = cola_dataset["test"]

  def tokenize_data(self, examples):
    return self.tokenizer(
        examples["sentence"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

  def setup(self, stage:str=None) -> None:
    if stage == "fit" or stage is None:

      self.train_dataset = self.train_dataset.map(self.tokenize_data, batched=True)
      self.train_dataset.set_format(
          type="torch", columns=["input_ids", "attention_mask", "label"]
      )

      self.test_dataset = self.test_dataset.map(self.tokenize_data, batched=True)
      self.test_dataset.set_format(
          type="torch", columns=["input_ids", "attention_mask", "label"]
      )

      self.val_dataset = self.val_dataset.map(self.tokenize_data, batched=True)
      self.val_dataset.set_format(
          type="torch", columns=["input_ids", "attention_mask", "label"]
      )

  def train_dataloader(self):

    return torch.utils.data.DataLoader(
        self.train_dataset, batch_size=self.batch_size, shuffle=True
    )

  def test_dataloader(self):

    return torch.utils.data.DataLoader(
        self.test_dataset, batch_size=self.batch_size, shuffle=False
    )

  def val_dataloader(self):

    return torch.utils.data.DataLoader(
        self.val_dataset, batch_size=self.batch_size, shuffle=False
    )

  def __str__(self):
    return f"DataModule(model_name={self._model_name}, batch_size={self.batch_size})"

data_model = DataModule()
print(data_model)
data_model.prepare_data()
data_model.setup()
print(next(iter(data_model.train_dataloader()))["input_ids"].shape)



DataModule(model_name=google/bert_uncased_L-2_H-128_A-2, batch_size=32)


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

torch.Size([32, 512])


In [91]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from transformers import AutoModel
from sklearn.metrics import accuracy_score

class ColaModel(pl.LightningModule):
  def __init__(self, model_name="google/bert_uncased_L-2_H-128_A-2", lr=1e-2):
    super(ColaModel, self).__init__()
    self.save_hyperparameters()

    self.bert = AutoModel.from_pretrained(model_name)
    self.W = nn.Linear(self.bert.config.hidden_size, 2)
    self.num_classes = 2

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    h_cls = outputs.last_hidden_state[:, 0]
    logits = self.W(h_cls)
    return logits

  def training_step(self, batch, batch_idx):
    logits = self.forward(batch["input_ids"], batch["attention_mask"])
    loss = F.cross_entropy(logits, batch["label"])
    self.log("train_loss", loss, prog_bar=True)
    return loss

  def validation_step(self, batch, batch_idx):
    logits = self.forward(batch["input_ids"], batch["attention_mask"])
    loss = F.cross_entropy(logits, batch["label"])
    _, preds = torch.max(logits,dim=1)
    val_acc = accuracy_score(preds.cpu().numpy(), batch["label"].cpu().numpy())
    self.log("val_loss", loss, prog_bar=True)
    self.log("val_acc", val_acc, prog_bar=True)

  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [92]:
model = ColaModel()

trainer = pl.Trainer(max_epochs=3)

trainer.fit(model, data_model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/1063 [00:00<?, ? examples/s]

Map:   0%|          | 0/1043 [00:00<?, ? examples/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name | Type      | Params | Mode 
-------------------------------------------
0 | bert | BertModel | 4.4 M  | eval 
1 | W    | Linear    | 258    | train
-------------------------------------------
4.4 M     Trainable params
0         Non-trainable params
4.4 M     Total params
17.545    Total estimated model params size (MB)
1         Modules in train mode
48        Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.
