In [2]:
%%capture
!pip install transformers==4.12.5 pandas torch

In [1]:
import transformers

print(f"Running on transformers v{transformers.__version__}")

Running on transformers v4.12.5


In [2]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, DistilBertModel, DistilBertForSequenceClassification,
                          TrainingArguments, Trainer)
from transformers.modeling_outputs import SequenceClassifierOutput

In [3]:
!pip install -q datasets

In [4]:
from datasets import load_dataset
emotions = load_dataset("go_emotions", "raw")

Downloading builder script:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading and preparing dataset go_emotions/raw (download: 40.76 MiB, generated: 52.78 MiB, post-processed: Unknown size, total: 93.54 MiB) to C:\Users\aquarshie\.cache\huggingface\datasets\go_emotions\raw\0.0.0\2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/211225 [00:00<?, ? examples/s]

Dataset go_emotions downloaded and prepared to C:\Users\aquarshie\.cache\huggingface\datasets\go_emotions\raw\0.0.0\2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id', 'created_utc', 'rater_id', 'example_very_unclear', 'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral'],
        num_rows: 211225
    })
})

In [6]:
df = emotions['train'].to_pandas()

In [7]:
label_cols = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
len(label_cols)

28

In [8]:
id2label = {str(i):label for i, label in enumerate(label_cols)}
label2id = {label:str(i) for i, label in enumerate(label_cols)}

In [9]:
id2label

{'0': 'admiration',
 '1': 'amusement',
 '2': 'anger',
 '3': 'annoyance',
 '4': 'approval',
 '5': 'caring',
 '6': 'confusion',
 '7': 'curiosity',
 '8': 'desire',
 '9': 'disappointment',
 '10': 'disapproval',
 '11': 'disgust',
 '12': 'embarrassment',
 '13': 'excitement',
 '14': 'fear',
 '15': 'gratitude',
 '16': 'grief',
 '17': 'joy',
 '18': 'love',
 '19': 'nervousness',
 '20': 'optimism',
 '21': 'pride',
 '22': 'realization',
 '23': 'relief',
 '24': 'remorse',
 '25': 'sadness',
 '26': 'surprise',
 '27': 'neutral'}

In [10]:
label2id

{'admiration': '0',
 'amusement': '1',
 'anger': '2',
 'annoyance': '3',
 'approval': '4',
 'caring': '5',
 'confusion': '6',
 'curiosity': '7',
 'desire': '8',
 'disappointment': '9',
 'disapproval': '10',
 'disgust': '11',
 'embarrassment': '12',
 'excitement': '13',
 'fear': '14',
 'gratitude': '15',
 'grief': '16',
 'joy': '17',
 'love': '18',
 'nervousness': '19',
 'optimism': '20',
 'pride': '21',
 'realization': '22',
 'relief': '23',
 'remorse': '24',
 'sadness': '25',
 'surprise': '26',
 'neutral': '27'}

## Pre-process data

In [11]:
df["labels"] = df[label_cols].values.tolist()
df.head()

Unnamed: 0,text,id,author,subreddit,link_id,parent_id,created_utc,rater_id,example_very_unclear,admiration,...,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,labels
0,That game hurt.,eew5j0j,Brdd9,nrl,t3_ajis4z,t1_eew18eq,1548381000.0,1,False,0,...,0,0,0,0,0,0,1,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,>sexuality shouldn’t be a grouping category I...,eemcysk,TheGreen888,unpopularopinion,t3_ai4q37,t3_ai4q37,1548084000.0,37,False,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"You do right, if you don't care then fuck 'em!",ed2mah1,Labalool,confessions,t3_abru74,t1_ed2m7g7,1546428000.0,37,False,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Man I love reddit.,eeibobj,MrsRobertshaw,facepalm,t3_ahulml,t3_ahulml,1547965000.0,18,False,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[NAME] was nowhere near them, he was by the Fa...",eda6yn6,American_Fascist713,starwarsspeculation,t3_ackt2f,t1_eda65q2,1546669000.0,2,False,0,...,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
# take sample for quick prototyping
df_sample = df.sample(n=1000)
df_sample.shape

(1000, 38)

In [13]:
# create train / test splits
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]

(df_train.shape, df_test.shape)

((168693, 38), (42532, 38))

## Tokenize and encode

In [14]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [15]:
train_encodings = tokenizer(df_train["text"].values.tolist(), truncation=True)
test_encodings = tokenizer(df_test["text"].values.tolist(), truncation=True)

In [16]:
train_labels = df_train["labels"].values.tolist()
test_labels = df_test["labels"].values.tolist()

In [17]:
class GoEmotionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [18]:
train_dataset = GoEmotionDataset(train_encodings, train_labels)
test_dataset = GoEmotionDataset(test_encodings, test_labels)

In [19]:
train_dataset[0]

{'input_ids': tensor([  101,  1028, 13798,  5807,  1521,  1056,  2022,  1037, 19765,  4696,
          2009,  3084,  2017,  2367,  2013, 27178, 27065,  4903,  2140,  2061,
         10047,  2080,  2009, 16142,  1996,  6210,  1997,  1000, 19765,  1000,
           102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]),
 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0])}

In [20]:
# sanity check
tokenizer.decode(train_dataset[0]["input_ids"])

'[CLS] > sexuality shouldn ’ t be a grouping category it makes you different from othet ppl so imo it fits the definition of " grouping " [SEP]'

## Fine-tuning

There are two ways we can implement multi-label classification:

- Creating a custom BERT model that overrides the forward method
- Creating a custom Trainer that overrides the compute_loss method
The second method does not work with v4.2.1 of transformers due to some bugs, so we'll work with the first approach instead :)

### Creating a Custom Model

In [21]:
class DistilBertForMultilabelSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
      super().__init__(config)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.distilbert(input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        hidden_state = outputs[0]
        pooled_output = hidden_state[:, 0]  
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = torch.nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), 
                            labels.float().view(-1, self.num_labels))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

In [23]:
num_labels=28
model = DistilBertForMultilabelSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)#.to('cuda')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForMultilabelSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForMultilabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForMultilabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.wei

In [24]:
model.config.id2label = {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",
    "13": "excitement",
    "14": "fear",
    "15": "gratitude",
    "16": "grief",
    "17": "joy",
    "18": "love",
    "19": "nervousness",
    "20": "optimism",
    "21": "pride",
    "22": "realization",
    "23": "relief",
    "24": "remorse",
    "25": "sadness",
    "26": "surprise",
    "27": "neutral"
  },
model.config.label2id ={
    "admiration": 0,
    "amusement": 1,
    "anger": 2,
    "annoyance": 3,
    "approval": 4,
    "caring": 5,
    "confusion": 6,
    "curiosity": 7,
    "desire": 8,
    "disappointment": 9,
    "disapproval": 10,
    "disgust": 11,
    "embarrassment": 12,
    "excitement": 13,
    "fear": 14,
    "gratitude": 15,
    "grief": 16,
    "joy": 17,
    "love": 18,
    "nervousness": 19,
    "neutral": 27,
    "optimism": 20,
    "pride": 21,
    "realization": 22,
    "relief": 23,
    "remorse": 24,
    "sadness": 25,
    "surprise": 26
  }
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": [
    {
      "0": "admiration",
      "1": "amusement",
      "10": "disapproval",
      "11": "disgust",
      "12": "embarrassment",
      "13": "excitement",
      "14": "fear",
      "15": "gratitude",
      "16": "grief",
      "17": "joy",
      "18": "love",
      "19": "nervousness",
      "2": "anger",
      "20": "optimism",
      "21": "pride",
      "22": "realization",
      "23": "relief",
      "24": "remorse",
      "25": "sadness",
      "26": "surprise",
      "27": "neutral",
      "3": "annoyance",
      "4": "approval",
      "5": "caring",
      "6": "confusion",
      "7": "curiosity",
      "8": "desire",
      "9": "disappointment"
    }
  ],
  "initializer_range": 0.02,
  "label2id": {
    "admiration": 0,
    "amusement": 1

In [25]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}
batch_size = 32

In [27]:
batch_size = 32
# configure logging so we see training loss
logging_steps = len(train_dataset) // batch_size

args = TrainingArguments(
    output_dir="emotion",
    evaluation_strategy = "epoch",
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=logging_steps
)

In [28]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [29]:
# sanity check that we can run evaluation
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 42532
  Batch size = 32


{'eval_loss': 0.7321420311927795,
 'eval_accuracy_thresh': 0.42745545506477356,
 'eval_runtime': 1090.194,
 'eval_samples_per_second': 39.013,
 'eval_steps_per_second': 1.22}

In [30]:
trainer.train()

***** Running training *****
  Num examples = 168693
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 15816


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# sanity check that we can run evaluation
trainer.evaluate()