In [71]:
import torch
import torch.nn as nn
import torch.optim as optim
import datasets

from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (
    ElectraTokenizer,
    ElectraTokenizerFast,
    ElectraForSequenceClassification,
    ElectraConfig,
    ElectraForQuestionAnswering,
)

from helpers import (
    prepare_dataset_nli,
    prepare_train_dataset_qa,
    prepare_validation_dataset_qa,
    QuestionAnsweringTrainer,
    compute_accuracy,
)

NUM_PREPROCESSING_WORKERS = 2


In [5]:
!ls

datasets    __pycache__  requirements.txt  trained_model
helpers.py  README.md	 run.py		   train.ipynb


In [35]:
# dataset = datasets.load_dataset('squad')
dataset = datasets.load_dataset(
    "json",
    data_files={
        "train": "./datasets/squad/train.json",
        "validation": "./datasets/squad/validation.json",
    },
)


Using custom data configuration default-9147b4002b1d77ef


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/ml/.cache/huggingface/datasets/json/default-9147b4002b1d77ef/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264...


0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /home/ml/.cache/huggingface/datasets/json/default-9147b4002b1d77ef/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264. Subsequent calls will reuse this data.


In [37]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 8 GPU(s) available.
Device name: Tesla K80


In [72]:
# Train Params
lr = 2e-5
num_epochs = 3
batch_size = 32
pretrained_model = 'google/electra-small-discriminator'

# Load the model
config = ElectraConfig.from_pretrained(pretrained_model)
config.gradient_checkpointing = False
config.use_cache = True # opposite of whatever you set GC to
# tokenizer = ElectraTokenizer.from_pretrained(pretrained_model)
# Seq2Seq models are much cleaner in this regard
# In those cases, you can simply set GC and use_cache in the final class itself
model = ElectraForQuestionAnswering.from_pretrained(pretrained_model, config=config)

# Move model to device
model.to(device)
    
# Create the Optimizer
optimizer = optim.AdamW(model.parameters(), lr=lr)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['qa_outputs.bias', 'qa_outputs.

In [58]:
tokenizer

PreTrainedTokenizer(name_or_path='google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [73]:
tokenizer = ElectraTokenizerFast.from_pretrained(pretrained_model)
prepare_train_dataset = lambda exs: prepare_train_dataset_qa(exs, tokenizer)
prepare_eval_dataset = lambda exs: prepare_validation_dataset_qa(exs, tokenizer)

train_dataset = dataset['train']
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names
)
train_loader = DataLoader(train_dataset_featurized, batch_size=batch_size, shuffle=True, pin_memory=True)


Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/json/default-9147b4002b1d77ef/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264/cache-2ceb25745604c0e8.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/json/default-9147b4002b1d77ef/0.0.0/45636811569ec4a6630521c18235dfbbab83b7ab572e3393c5ba68ccabe98264/cache-f2ce53a15165dd33.arrow


In [76]:
batch['input_ids']

[tensor([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
         101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
         101, 101, 101, 101]),
 tensor([2129, 2054, 2054, 2054, 2040, 1999, 2054, 2054, 1999, 2040, 2029, 2054,
         2054, 2129, 2043, 2054, 1996, 2054, 2029, 2043, 2054, 2040, 2054, 2006,
         2339, 2054, 1996, 2054, 2043, 2054, 2040, 2043]),
 tensor([ 2116,  2177,  2003,  5455,  2471,  2054,  4127,  2828,  1996,  2001,
         15139,  2003,  2177,  2116,  2097,  2003,  4125,  2001,  2177,  2079,
          2181,  2003,  2001,  2054,  2106,  3703, 10900,  2003,  2020,  3842,
          6430,  2106]),
 tensor([ 8817,  7164,  1037,  8678,  3856,  2095,  1997,  1997,  2051,  1996,
          9329,  1996,  2001, 25177,  2231,  1996,  1997,  1996,  2580,  4639,
          2038,  1996,  1996,  4127,  2027, 15821, 10836,  1996,  1996,  2106,
          1996, 12075]),
 tensor([ 1997,  1996,  2171,  2007,  2039,  2001,  2495,  7876,  

In [69]:
type(model)

transformers.models.electra.modeling_electra.ElectraForSequenceClassification

In [74]:
model.train()

global_step = 0

# You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
loss = None

for epoch in range(num_epochs):
    pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

    for n, batch in enumerate(pragati):
    
        model.zero_grad()

        # Change labels depending on whether you're doing NER/Parsing (token classification)
        # or sentiment classification (sequence classification)
        # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html

        model_output = model(**batch)
        loss = None
        loss.backward()
        optim.step()
        global_step += 1

        if n % 20 == 0:
            pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.mean():.2f}')

    print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

Epoch 0:   0%|          | 0/2742 [00:00<?, ?it/s]

AttributeError: 'list' object has no attribute 'size'

In [48]:
train_loader['id']

TypeError: 'DataLoader' object is not subscriptable

In [49]:
next(iter(train_loader))['id']

['5726199989a1e219009ac25c',
 '57299a5eaf94a219006aa54d',
 '5706bbe32eaba6190074accb',
 '57305ba3396df919000960ae',
 '5726884cf1498d1400e8e2f6',
 '5728fce01d04691400778f1d',
 '57265d79708984140094c3ad',
 '56cc60186d243a140015ef5d',
 '572f6d0aa23a5019007fc603',
 '5728d5b54b864d1900164f12',
 '57318193497a881900248f9c',
 '57278c0ddd62a815002e9ffc',
 '57325e6d0fdd8d15006c6a44',
 '56ce5a8faab44d1400b886e4',
 '57097697ed30961900e84192',
 '571cf8175efbb31900334e68',
 '56e6de2cde9d3714000680b4',
 '57327ed206a3a419008aca8b',
 '5705fcd775f01819005e783a',
 '5726d78cdd62a815002e9221',
 '56e11684cd28a01900c67584',
 '57100c2ea58dae1900cd680a',
 '570e16ea0b85d914000d7cbb',
 '5727f8424b864d19001640e1',
 '57361c88012e2f140011a1aa',
 '57279543f1498d1400e8fcc9',
 '570d77d0fed7b91900d4617b',
 '57270130708984140094d84c',
 '570d77cdb3d812140066d9ba',
 '5726925ef1498d1400e8e422',
 '570afbdf6b8089140040f678',
 '5727d82a3acd2414000dedec']

In [None]:
def train_loop(model, optim, dataset, epochs, batch_size):

    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.create_batch, pin_memory=True)
    model.train()

    global_step = 0

    # You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
    loss = None

    for epoch in range(epochs):
        pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

        for n, batch in enumerate(pragati):
        
            model.zero_grad()

            # Change labels depending on whether you're doing NER/Parsing (token classification)
            # or sentiment classification (sequence classification)
            # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html

            loss = model(input_ids=batch["input_ids"].to(device), labels=batch["labels"].unsqueeze(1).to(device)).loss
            loss.backward()
            optim.step()
            global_step += 1

            if n % 20 == 0:
                pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.mean():.2f}')

        print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')
    
    print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')