## Loading

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import datasets

from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (
    ElectraTokenizer,
    ElectraTokenizerFast,
    ElectraForSequenceClassification,
    ElectraConfig,
    ElectraForQuestionAnswering,
)

from helpers import (
    prepare_dataset_nli,
    prepare_train_dataset_qa,
    prepare_validation_dataset_qa,
    QuestionAnsweringTrainer,
    compute_accuracy,
)

NUM_PREPROCESSING_WORKERS = 2
pretrained_model = "google/electra-small-discriminator"
train_path = './trains/'
eval_path= './evals/'
dataset_name = 'snli_classic/'


## Device check

In [2]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 8 GPU(s) available.
Device name: Tesla K80


## Model

In [None]:
# Train Params
lr = 2e-5
num_epochs = 3
batch_size = 36*torch.cuda.device_count()
# Load the model
config = ElectraConfig.from_pretrained(pretrained_model)
config.gradient_checkpointing = False
config.use_cache = True
# config.problem_type = "multi_label_classification"
config.num_labels = 3

model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator",
    config=config
)

# Move model to device
# Create the Optimizer
optimizer = optim.AdamW(model.parameters(), lr=lr)
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5 ,6 ,7])
model.to(device)


## Dataset loading and preprocessing

In [None]:
dataset = datasets.load_dataset('snli')
dataset = dataset.filter(lambda ex: ex['label'] != -1)

tokenizer = ElectraTokenizerFast.from_pretrained(pretrained_model)
prepare_train_dataset = prepare_eval_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, 128)

train_dataset = dataset['train']
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names
)
train_loader = DataLoader(train_dataset_featurized, batch_size=batch_size, shuffle=True, pin_memory=True)


Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-5ce4ee926f0e714a.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-3799cb122af96d79.arrow


## Train

In [122]:
model.train()

global_step = 0

# You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
loss = None

for epoch in range(num_epochs):
    pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

    for n, batch in enumerate(pragati):
    
        model.zero_grad()

        # Change labels depending on whether you're doing NER/Parsing (token classification)
        # or sentiment classification (sequence classification)
        # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
        labels = batch.pop('label')
        labels = labels.to(device)
        for k in batch:
            batch[k] = torch.stack(batch[k], dim=1).to(device)
        model_output = model(**batch, labels=labels)
        loss = model_output.loss
        loss.sum().backward()
        optimizer.step()
        global_step += 1

        if n % 20 == 0:
            pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.mean():.2f}')

    print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

tokenizer.save_pretrained(train_path + dataset_name)
model.module.save_pretrained(train_path + dataset_name)

Epoch 0:   0%|          | 0/1908 [00:00<?, ?it/s]

Epoch 0 | Global Step: 1908 | Loss : 0.46


Epoch 1:   0%|          | 0/1908 [00:00<?, ?it/s]

Epoch 1 | Global Step: 3816 | Loss : 0.37


Epoch 2:   0%|          | 0/1908 [00:00<?, ?it/s]

Epoch 2 | Global Step: 5724 | Loss : 0.34
Completed Training at Epoch 2 | Global Step: 5724 | Loss : 0.34


## Evaluation

In [3]:
dataset = datasets.load_dataset('snli')
dataset = dataset.filter(lambda ex: ex['label'] != -1)

batch_size = 1 #36*torch.cuda.device_count()
model = ElectraForSequenceClassification.from_pretrained(train_path + dataset_name)
tokenizer = ElectraTokenizerFast.from_pretrained(train_path + dataset_name)

Reusing dataset snli (/home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-4eacbddb81939caf.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-ab24f4c6f0ce7e93.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-a1c734a8e7f96896.arrow


In [11]:
prepare_train_dataset = prepare_eval_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, 128)

eval_dataset = dataset['validation']
eval_dataset_featurized = eval_dataset.map(
    prepare_eval_dataset,
    batched=False,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=['label'] #eval_dataset.column_names
)
# eval_loader = DataLoader(eval_dataset_featurized, batch_size=batch_size, shuffle=True, pin_memory=True)

In [12]:
for k in eval_dataset_featurized:
    pass
    break

In [13]:
k.keys()

dict_keys(['premise', 'hypothesis', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])

TypeError: unhashable type: 'list'

In [20]:
model_inputs['input_ids'].reshape(1,-1).shape

torch.Size([1, 128])

In [22]:
model_inputs

{'input_ids': tensor([[  101,  2048,  2308,  2024, 23581,  2096,  3173,  2000,  2175, 14555,
           1012,   102,  1996,  5208,  2024, 17662,  9119,  2096,  3173,  2000,
           2175, 14555,  2044,  2074,  5983,  6265,  1012,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [24]:
sample

{'premise': 'Two women are embracing while holding to go packages.',
 'hypothesis': 'The sisters are hugging goodbye while holding to go packages after just eating lunch.',
 'input_ids': [101,
  2048,
  2308,
  2024,
  23581,
  2096,
  3173,
  2000,
  2175,
  14555,
  1012,
  102,
  1996,
  5208,
  2024,
  17662,
  9119,
  2096,
  3173,
  2000,
  2175,
  14555,
  2044,
  2074,
  5983,
  6265,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,


In [25]:
# loss = nn.CrossEntropyLoss()
model_keys = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
model.to(device)
for n, sample in enumerate(eval_dataset_featurized):

    # Change labels depending on whether you're doing NER/Parsing (token classification)
    # or sentiment classification (sequence classification)
    # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
    model_inputs = {k:torch.LongTensor(v).to(device).reshape(1,-1) for k,v in sample.items() if k in model_keys}
    model_output = model(**model_inputs)
    
    loss = model_output.loss
    break

In [28]:
loss

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [15]:
labels

1

In [1]:
batch['input_ids']

NameError: name 'batch' is not defined