#### Exploring the validation steps with model loss

In [15]:
import torch
from torch.nn.functional import cross_entropy
import numpy as np
from datasets import load_dataset
import pandas as pd

from transformers import AutoModelForSequenceClassification, AutoTokenizer


In [13]:
!pwd

/home/vivek/Documents/Workshop/NLP/Transformers


In [14]:

model = AutoModelForSequenceClassification.from_pretrained('models/distilbert-base-uncased-finetuned-emotion/checkpoint-500/')
tokenizer = AutoTokenizer.from_pretrained('models/distilbert-base-uncased-finetuned-emotion/checkpoint-500/')

In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
def forward_pass_with_label(batch):

    inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
    
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch['label'].to(device),
                                reduction="none")

        return {'loss': loss.cpu().numpy(),
                'predicted_label': pred_label.cpu().numpy()}

                
        
        

In [25]:
emotions = load_dataset('SetFit/emotion')

Using custom data configuration SetFit--emotion-e444b7640ce3116e
Found cached dataset json (/home/vivek/.cache/huggingface/datasets/SetFit___json/SetFit--emotion-e444b7640ce3116e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=False)
    
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
emotions_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [32]:
emotions_encoded['validation'] = emotions_encoded['validation'].map(forward_pass_with_label, batched=True, batch_size=16)

  0%|          | 0/125 [00:00<?, ?ba/s]

In [33]:
label_map = list(set(zip(emotions['train']['label'], emotions['train']['label_text'])))
label_map

[(0, 'sadness'),
 (3, 'anger'),
 (5, 'surprise'),
 (2, 'love'),
 (1, 'joy'),
 (4, 'fear')]

In [34]:
emotions_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['text', 'label', 'label_text', 'input_ids', 'attention_mask', 'loss', 'predicted_label'],
        num_rows: 2000
    })
})

In [None]:
emotions_encoded.set_format('pandas')
cols = ['text', 'label', 'predicted_label', 'loss']
df_test = emotions_encoded['validation'][:][cols]
df_test['label'] = df_test['label'].apply(label_int2str)
df_test['predicted_label'] = df_test['predicted_label'].apply(label_int2str)