In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import datasets

from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (
    ElectraTokenizer,
    ElectraTokenizerFast,
    ElectraForSequenceClassification,
    ElectraConfig,
    ElectraForQuestionAnswering,
)

from helpers import (
    prepare_dataset_nli,
    prepare_train_dataset_qa,
    prepare_validation_dataset_qa,
    QuestionAnsweringTrainer,
    compute_accuracy,
)

# Number of worker processes handed to `Dataset.map(num_proc=...)` when the
# training split is featurized further down in this notebook.
NUM_PREPROCESSING_WORKERS = 2


In [5]:
# Scratch check: list the files in the notebook's working directory.
!ls

datasets    __pycache__  requirements.txt  trained_model
helpers.py  README.md	 run.py		   train.ipynb


In [5]:
# Load SNLI and, in the same expression, keep only the examples whose label
# is not -1. (Presumably -1 marks pairs without a usable gold label --
# confirm against the dataset card.)
dataset = datasets.load_dataset('snli').filter(lambda ex: ex['label'] != -1)


Reusing dataset snli (/home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-4eacbddb81939caf.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-ab24f4c6f0ce7e93.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-a1c734a8e7f96896.arrow


In [112]:
# Select the compute device: the first CUDA GPU when CUDA is usable,
# otherwise the CPU. On a CUDA machine also report the hardware that was found.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

There are 8 GPU(s) available.
Device name: Tesla K80


In [120]:
# Train Params
lr = 2e-5
num_epochs = 3
# 36 examples per GPU. `torch.cuda.device_count()` is 0 on a CPU-only machine,
# so the multiplier is clamped to 1 to avoid a batch size of zero.
batch_size = 36 * max(1, torch.cuda.device_count())
pretrained_model = "google/electra-small-discriminator"

# Load the checkpoint's configuration and adapt it to a 3-class classifier head.
config = ElectraConfig.from_pretrained(pretrained_model)
config.gradient_checkpointing = False
# `use_cache` should be the opposite of whatever gradient checkpointing is set to.
# Seq2Seq models are much cleaner in this regard:
# in those cases, you can simply set GC and use_cache in the final class itself.
config.use_cache = True
config.num_labels = 3

# Build the classifier from the same checkpoint name the config came from,
# so the two can never drift apart.
model = ElectraForSequenceClassification.from_pretrained(
    pretrained_model,
    config=config
)

# Create the Optimizer. It is built from the bare model's parameters; wrapping
# the model and moving it afterwards keeps the same parameter objects.
optimizer = optim.AdamW(model.parameters(), lr=lr)

# Replicate the model over every GPU that is actually visible instead of a
# hard-coded list of eight ids. With no GPU the list is empty, so `None` is
# passed and DataParallel falls back to its own default handling.
visible_gpu_ids = list(range(torch.cuda.device_count()))
model = torch.nn.DataParallel(model, device_ids=visible_gpu_ids or None)

# Move model to device (done after wrapping; the wrapper forwards `.to()` to
# the wrapped module). Being the last expression, this also displays the model.
model.to(device)


Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier

DataParallel(
  (module): ElectraForSequenceClassification(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(30522, 128, padding_idx=0)
        (position_embeddings): Embedding(512, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
      (encoder): ElectraEncoder(
        (layer): ModuleList(
          (0): ElectraLayer(
            (attention): ElectraAttention(
              (self): ElectraSelfAttention(
                (query): Linear(in_features=256, out_features=256, bias=True)
                (key): Linear(in_features=256, out_features=256, bias=True)
                (value): Linear(in_features=256, out_features=256, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
          

In [121]:
tokenizer = ElectraTokenizerFast.from_pretrained(pretrained_model)


def prepare_train_dataset(exs):
    """Featurize one batch of examples with `prepare_dataset_nli`.

    The notebook-level `tokenizer` and the literal 128 are forwarded as the
    second and third arguments (128 is presumably the maximum sequence
    length -- confirm in helpers.py).
    """
    return prepare_dataset_nli(exs, tokenizer, 128)


# Evaluation data goes through exactly the same featurization callable.
prepare_eval_dataset = prepare_train_dataset

# Featurize the training split in batches, replacing the original columns
# with whatever `prepare_dataset_nli` returns.
train_dataset = dataset['train']
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names,
)

# Shuffled loader over the featurized training examples.
train_loader = DataLoader(
    train_dataset_featurized,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)


Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-5ce4ee926f0e714a.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-3799cb122af96d79.arrow


In [107]:
# test_loss = nn.CrossEntropyLoss(reduction='none')
# Scratch inspection: display whatever `loss` tensor is currently in the kernel.
# The recorded output is a 5-element vector with a Gather grad_fn, i.e. the
# loss coming back from the wrapped model is not a scalar.
loss

tensor([1.0753, 1.1108, 1.0901, 1.1041, 1.1014], device='cuda:0',
       grad_fn=<GatherBackward>)

In [122]:
# Top-level training loop: runs `num_epochs` passes over `train_loader`,
# taking one optimizer step per batch and reporting the loss via tqdm/print.
# Every name assigned here (batch, k, labels, loss, ...) stays in the kernel
# namespace after the cell finishes.
model.train()

# Counts optimizer steps across all epochs.
global_step = 0

# You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
loss = None

for epoch in range(num_epochs):
    # Progress bar over the batches of this epoch (the bar is not left on screen).
    pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

    for n, batch in enumerate(pragati):
    
        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Change labels depending on whether you're doing NER/Parsing (token classification)
        # or sentiment classification (sequence classification)
        # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
        # Take the labels out of the batch so the remaining keys can be passed
        # to the model as keyword arguments.
        labels = batch.pop('label')
        labels = labels.to(device)
        # Each remaining value is stacked along dim=1 and moved to the device.
        # Presumably the loader yields, per key, one tensor per token position,
        # so the stacked result is (batch, seq_len) -- TODO confirm.
        for k in batch:
            batch[k] = torch.stack(batch[k], dim=1).to(device)
        model_output = model(**batch, labels=labels)
        loss = model_output.loss
        # `.sum()` reduces the loss to a scalar before backpropagation; the
        # progress messages below report `.mean()` instead.
        loss.sum().backward()
        optimizer.step()
        global_step += 1

        # Refresh the progress-bar text every 20 batches.
        if n % 20 == 0:
            pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.mean():.2f}')

    # End-of-epoch summary using the loss of the last batch.
    print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

# Final summary; relies on `epoch` and `loss` left over from the loop above.
print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

Epoch 0:   0%|          | 0/1908 [00:00<?, ?it/s]

In [29]:
# Scratch check: stack the entries of `batch[k]` along the default dim 0 and
# display the result (`batch` and `k` are leftovers from a training-loop cell).
torch.stack(batch[k])

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [20]:
# Scratch experiment: build a LongTensor directly from `batch[k]`.
# The recorded run of this cell ended in a TypeError.
torch.LongTensor(batch[k])

TypeError: only integer tensors of a single element can be converted to an index

In [49]:
# Scratch check: pull a single batch from `train_loader` and show its 'id' field.
# NOTE(review): the featurized SNLI loader built in this notebook drops the
# original columns, so the recorded output presumably came from a different
# `train_loader` that was in the kernel at the time -- confirm.
next(iter(train_loader))['id']

['5726199989a1e219009ac25c',
 '57299a5eaf94a219006aa54d',
 '5706bbe32eaba6190074accb',
 '57305ba3396df919000960ae',
 '5726884cf1498d1400e8e2f6',
 '5728fce01d04691400778f1d',
 '57265d79708984140094c3ad',
 '56cc60186d243a140015ef5d',
 '572f6d0aa23a5019007fc603',
 '5728d5b54b864d1900164f12',
 '57318193497a881900248f9c',
 '57278c0ddd62a815002e9ffc',
 '57325e6d0fdd8d15006c6a44',
 '56ce5a8faab44d1400b886e4',
 '57097697ed30961900e84192',
 '571cf8175efbb31900334e68',
 '56e6de2cde9d3714000680b4',
 '57327ed206a3a419008aca8b',
 '5705fcd775f01819005e783a',
 '5726d78cdd62a815002e9221',
 '56e11684cd28a01900c67584',
 '57100c2ea58dae1900cd680a',
 '570e16ea0b85d914000d7cbb',
 '5727f8424b864d19001640e1',
 '57361c88012e2f140011a1aa',
 '57279543f1498d1400e8fcc9',
 '570d77d0fed7b91900d4617b',
 '57270130708984140094d84c',
 '570d77cdb3d812140066d9ba',
 '5726925ef1498d1400e8e422',
 '570afbdf6b8089140040f678',
 '5727d82a3acd2414000dedec']

In [None]:
def train_loop(model, optim, dataset, epochs, batch_size):
    """Train `model` on `dataset` for `epochs` passes, printing the running loss.

    Args:
        model: Module called as `model(input_ids=..., labels=...)`. The object
            it returns must expose a `.loss` tensor, either a scalar or a
            vector (e.g. the per-replica losses gathered by `nn.DataParallel`).
        optim: Optimizer; `step()` is called once per batch. Inside this
            function the name shadows the notebook's `torch.optim as optim`
            import; it is kept unchanged for caller compatibility.
        dataset: Dataset that exposes a `create_batch` collate function whose
            batches provide "input_ids" and "labels" tensors.
        epochs: Number of passes over `dataset`.
        batch_size: Number of examples per batch.

    Returns:
        None. Progress is reported through the tqdm bar and `print`.
    """
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.create_batch, pin_memory=True)
    model.train()

    # Send batches to wherever the model's parameters live instead of relying
    # on a `device` global defined in some other notebook cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    global_step = 0

    # You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
    # `loss` stays None until at least one batch has been processed.
    loss = None

    for epoch in range(epochs):
        pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

        for n, batch in enumerate(pragati):

            model.zero_grad()

            # Change labels depending on whether you're doing NER/Parsing (token classification)
            # or sentiment classification (sequence classification)
            # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
            # NOTE(review): only input_ids and labels are forwarded; if
            # `create_batch` pads sequences, an attention mask is presumably
            # needed as well -- confirm against the dataset class.
            raw_loss = model(
                input_ids=batch["input_ids"].to(target_device),
                labels=batch["labels"].unsqueeze(1).to(target_device),
            ).loss
            # A wrapped (DataParallel) model returns one loss per replica and
            # backward() needs a scalar. mean() is a no-op for a scalar loss.
            loss = raw_loss.mean()
            loss.backward()
            optim.step()
            global_step += 1

            if n % 20 == 0:
                pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.item():.2f}')

        # Skip the summary when the loader produced no batch at all.
        if loss is not None:
            print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.item():.2f}')

    # With epochs < 1 or an empty dataset neither `epoch` nor `loss` is usable,
    # so report that instead of crashing in the final summary.
    if loss is None:
        print('No training steps were run (epochs < 1 or empty dataset).')
        return

    print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.item():.2f}')