## Loading

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import datasets
import pandas as pd
from bertviz import model_view


from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import (
    ElectraTokenizerFast,
    ElectraForSequenceClassification,
    ElectraConfig,
)

from helpers import prepare_dataset_nli

NUM_PREPROCESSING_WORKERS = 2
pretrained_model = "google/electra-small-discriminator"
train_path = './trains/'
eval_path= './evals/'
dataset_name = 'snli_classic/'


## Device check

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

There are 8 GPU(s) available.
Device name: Tesla K80


## Model

In [None]:
# Train Params
lr = 2e-5
num_epochs = 3
batch_size = 36*torch.cuda.device_count()
# Load the model
config = ElectraConfig.from_pretrained(pretrained_model)
config.gradient_checkpointing = False
config.use_cache = True
# config.problem_type = "multi_label_classification"
config.num_labels = 3

model = ElectraForSequenceClassification.from_pretrained(
    "google/electra-small-discriminator",
    config=config
)

# Move model to device
# Create the Optimizer
optimizer = optim.AdamW(model.parameters(), lr=lr)
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5 ,6 ,7])
model.to(device)


## Dataset loading and preprocessing

In [None]:
dataset = datasets.load_dataset('snli')
dataset = dataset.filter(lambda ex: ex['label'] != -1)

tokenizer = ElectraTokenizerFast.from_pretrained(pretrained_model)
prepare_train_dataset = prepare_eval_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, 128)

train_dataset = dataset['train']
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names
)
train_loader = DataLoader(train_dataset_featurized, batch_size=batch_size, shuffle=True, pin_memory=True)


## Train

In [None]:
model.train()

global_step = 0

# You can choose to do your own loss calculation if you'd like, but I'm keeping things simple here
loss = None

for epoch in range(num_epochs):
    pragati = tqdm(train_loader, desc=f'Epoch {epoch}', leave=False)

    for n, batch in enumerate(pragati):
    
        model.zero_grad()

        # Change labels depending on whether you're doing NER/Parsing (token classification)
        # or sentiment classification (sequence classification)
        # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
        labels = batch.pop('label')
        labels = labels.to(device)
        for k in batch:
            batch[k] = torch.stack(batch[k], dim=1).to(device)
        model_output = model(**batch, labels=labels)
        loss = model_output.loss
        loss.sum().backward()
        optimizer.step()
        global_step += 1

        if n % 20 == 0:
            pragati.set_description(f'Epoch: {epoch} | Step: {global_step} | Loss: {loss.mean():.2f}')

    print(f'Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

print(f'Completed Training at Epoch {epoch} | Global Step: {global_step} | Loss : {loss.mean():.2f}')

tokenizer.save_pretrained(train_path + dataset_name)
model.module.save_pretrained(train_path + dataset_name)

## Evaluation

In [3]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

dataset = datasets.load_dataset('snli')
dataset = dataset.filter(lambda ex: ex['label'] != -1)

batch_size = 1 #36*torch.cuda.device_count()
# config = ElectraConfig.from_pretrained(pretrained_model)
# config.gradient_checkpointing = False
# config.use_cache = True
# # config.problem_type = "multi_label_classification"
# config.num_labels = 3

model = ElectraForSequenceClassification.from_pretrained(train_path + dataset_name)
tokenizer = ElectraTokenizerFast.from_pretrained(train_path + dataset_name)

Reusing dataset snli (/home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-4eacbddb81939caf.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-ab24f4c6f0ce7e93.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-a1c734a8e7f96896.arrow


In [4]:
prepare_train_dataset = prepare_eval_dataset = lambda exs: prepare_dataset_nli(exs, tokenizer, 128)

eval_dataset = dataset['validation']
eval_dataset_featurized = eval_dataset.map(
    prepare_eval_dataset,
    batched=False,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=['label'] #eval_dataset.column_names
)
# eval_loader = DataLoader(eval_dataset_featurized, batch_size=batch_size, shuffle=True, pin_memory=True)

Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-c85a8663f9e40ec7.arrow
Loading cached processed dataset at /home/ml/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-04172c7c36c6282b.arrow


In [6]:
tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])
attention = model_output.attentions
# model_view(attention, tokens)

NameError: name 'sample' is not defined

In [5]:
# loss = nn.CrossEntropyLoss()
model_keys = ['input_ids', 'token_type_ids', 'attention_mask']
model.to(device)
eval_samples = []
attentions = []
for n, sample in tqdm(enumerate(eval_dataset_featurized), total=len(eval_dataset_featurized)):

    # Change labels depending on whether you're doing NER/Parsing (token classification)
    # or sentiment classification (sequence classification)
    # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
    labels = torch.LongTensor([sample['labels']]).to(device)
    model_inputs = {k:torch.LongTensor(v).to(device).reshape(1,-1) for k,v in sample.items() if k in model_keys}
    model_output = model(**model_inputs, labels=labels, output_attentions = True)
    sample['loss'] = model_output.loss.item()
    sample['predicted_label'] = model_output.logits[0].argmax().item()
    for i, logit in enumerate(model_output.logits[0]):
        sample[f'logit_{i}'] = logit.item()

    # attentions.append([att.detach() for att in  model_output.attentions])
    
    eval_samples.append(sample)


  0%|          | 0/9842 [00:00<?, ?it/s]

In [29]:
model_output.attentions[0]

tensor([[[[0.0290, 0.0259, 0.0139,  ..., 0.0000, 0.0000, 0.0000],
          [0.0027, 0.0872, 0.1164,  ..., 0.0000, 0.0000, 0.0000],
          [0.0022, 0.1771, 0.0306,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0184, 0.0145, 0.0127,  ..., 0.0000, 0.0000, 0.0000],
          [0.0173, 0.0137, 0.0107,  ..., 0.0000, 0.0000, 0.0000],
          [0.0176, 0.0146, 0.0115,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.0355, 0.0217, 0.0146,  ..., 0.0000, 0.0000, 0.0000],
          [0.0182, 0.0125, 0.0115,  ..., 0.0000, 0.0000, 0.0000],
          [0.0162, 0.0046, 0.1342,  ..., 0.0000, 0.0000, 0.0000],
          ...,
          [0.0642, 0.0164, 0.0019,  ..., 0.0000, 0.0000, 0.0000],
          [0.0570, 0.0173, 0.0020,  ..., 0.0000, 0.0000, 0.0000],
          [0.0561, 0.0149, 0.0023,  ..., 0.0000, 0.0000, 0.0000]],

         [[0.0089, 0.0189, 0.0096,  ..., 0.0000, 0.0000, 0.0000],
          [0.0102, 0.0326, 0.0286,  ..., 0.0000, 0.0000, 0.0000],
          [0.0113, 0.0249, 0.0010,  ..., 0

In [14]:
def get_attention(dataset, index, remove_pads=True):
    sample = dataset[index]
    labels = torch.LongTensor([sample['labels']]).to(device)
    model_inputs = {k:torch.LongTensor(v).to(device).reshape(1,-1) for k,v in sample.items() if k in model_keys}
    model_output = model(**model_inputs, labels=labels, output_attentions = True)
    tokens = tokenizer.convert_ids_to_tokens(sample['input_ids'])

    if remove_pads
    return tokens, model_output.attentions

In [15]:
tokens, attentions = get_attention(eval_dataset_featurized, 6208)

In [22]:
 pad_index = next(i for i, v in enumerate(tokens) if v=='[PAD]')
 pad_index


33

In [24]:
tokens[:pad_index]

['[CLS]',
 'a',
 'man',
 'wearing',
 'chuck',
 'taylor',
 'shoes',
 'and',
 'a',
 'tan',
 'shirt',
 'attempts',
 'to',
 'break',
 'free',
 'from',
 'a',
 'strait',
 '##jack',
 '##et',
 'in',
 'front',
 'of',
 'a',
 'crowd',
 '[SEP]',
 'a',
 'man',
 'in',
 'a',
 'straight',
 'jacket',
 '[SEP]']

In [6]:
eval_df = pd.DataFrame(eval_samples)
eval_df

Unnamed: 0,premise,hypothesis,input_ids,token_type_ids,attention_mask,labels,loss,predicted_label,logit_0,logit_1,logit_2
0,Two women are embracing while holding to go pa...,The sisters are hugging goodbye while holding ...,"[101, 2048, 2308, 2024, 23581, 2096, 3173, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,0.008911,1,-1.701562,3.582637,-1.969213
1,Two women are embracing while holding to go pa...,Two woman are holding packages.,"[101, 2048, 2308, 2024, 23581, 2096, 3173, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0.039708,0,2.316156,-1.212342,-2.179459
2,Two women are embracing while holding to go pa...,The men are fighting outside a deli.,"[101, 2048, 2308, 2024, 23581, 2096, 3173, 200...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0.001990,2,-3.403698,-2.081107,4.373627
3,"Two young children in blue jerseys, one with t...",Two kids in numbered jerseys wash their hands.,"[101, 2048, 2402, 2336, 1999, 2630, 28772, 101...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0.028206,0,2.583757,-1.187957,-2.602218
4,"Two young children in blue jerseys, one with t...",Two kids at a ballgame wash their hands.,"[101, 2048, 2402, 2336, 1999, 2630, 28772, 101...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,0.051390,1,-2.916604,2.854484,-0.148932
...,...,...,...,...,...,...,...,...,...,...,...
9837,A small girl wearing a pink jacket is riding o...,The carousel is moving.,"[101, 1037, 2235, 2611, 4147, 1037, 5061, 6598...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0.185306,0,2.093049,0.485987,-3.680001
9838,A small girl wearing a pink jacket is riding o...,The girl is moving at the speed of light.,"[101, 1037, 2235, 2611, 4147, 1037, 5061, 6598...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,4.598225,0,1.495437,0.530098,-2.769965
9839,A young girl with blue and pink ribbons in her...,People in a water fountain,"[101, 1037, 2402, 2611, 2007, 2630, 1998, 5061...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0.025443,0,2.707683,-1.085298,-3.024235
9840,A young girl with blue and pink ribbons in her...,A young girl knits a sweater,"[101, 1037, 2402, 2611, 2007, 2630, 1998, 5061...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0.026342,2,-2.910201,-0.834653,2.906952


In [7]:
errors = eval_df.loc[eval_df.labels != eval_df.predicted_label].sort_values('loss', ascending=False)
errors

Unnamed: 0,premise,hypothesis,input_ids,token_type_ids,attention_mask,labels,loss,predicted_label,logit_0,logit_1,logit_2
6208,A man wearing chuck taylor shoes and a tan shi...,a man in a straight jacket,"[101, 1037, 2158, 4147, 8057, 4202, 6007, 1998...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,7.182219,2,-3.280823,-1.664646,3.896800
5405,Group of guys sitting in a circle.,There are no females around them.,"[101, 2177, 1997, 4364, 3564, 1999, 1037, 4418...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,6.477018,2,-2.652690,-2.383811,4.090489
7113,A little blond girl is running near a little b...,A sister and brother are playing in their yard.,"[101, 1037, 2210, 8855, 2611, 2003, 2770, 2379...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,6.333803,1,-3.346972,2.928794,0.079304
6398,A man is standing in front of a building holdi...,Someone is holding something heavy outside.,"[101, 1037, 2158, 2003, 3061, 1999, 2392, 1997...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,6.101787,0,2.092596,0.628159,-3.798953
7818,A skydiving instructor with a student strapped...,People are jumping from a plane.,"[101, 1037, 3712, 4305, 6455, 9450, 2007, 1037...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,6.082291,2,-3.054612,-0.859570,3.004631
...,...,...,...,...,...,...,...,...,...,...,...
9678,A snowboarder in a red jacket flies through th...,The snowboarder is on a ski lift.,"[101, 1037, 4586, 6277, 2121, 1999, 1037, 2417...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,0.712748,2,-3.100070,1.255407,1.281809
5756,A bridal party is posing for a picture outdoors.,The best man is taking pictures from the bache...,"[101, 1037, 7987, 16975, 2283, 2003, 20540, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,0.710551,1,-2.896057,1.195433,1.177500
3747,Several dancers are doing leg lifts on stage.,Performers are doing a show at an event.,"[101, 2195, 10487, 2024, 2725, 4190, 13695, 20...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,0.702682,1,1.292078,1.300625,-3.257061
3319,Here are a bunch of people enjoying dinner and...,The group is socializing at the company dinner.,"[101, 2182, 2024, 1037, 9129, 1997, 2111, 9107...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,0.702483,0,1.281285,1.274376,-3.163138


In [None]:
errors.iloc[:]

In [None]:
eval_df.sort_values('loss', ascending=False)

In [None]:
accuracy = (eval_df.labels == eval_df.predicted_label).sum() / eval_df.shape[0]
accuracy

In [None]:
type(sample['labels'])

In [None]:
sample['labels']

In [None]:
torch.Tensor(sample['labels'])

In [None]:
# loss = nn.CrossEntropyLoss()
model_keys = ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
# model.to(device)
eval_samples = []
for n, sample in tqdm(enumerate(eval_dataset_featurized), total=len(eval_dataset_featurized)):

    # Change labels depending on whether you're doing NER/Parsing (token classification)
    # or sentiment classification (sequence classification)
    # ELECTRA Page linked here: https://huggingface.co/transformers/model_doc/electra.html
    model_inputs = {k:torch.LongTensor(v).reshape(1,-1) for k,v in sample.items() if k in model_keys}
    model_output = model(**model_inputs)
    sample['loss'] = model_output.loss.item()
    
    eval_samples.append()


In [None]:
sample

In [None]:
model_output.loss.item()

In [None]:
labels

In [None]:
batch['input_ids']