In [62]:
from datasets import load_dataset

from transformers import AutoTokenizer
import numpy as np
from transformers import TrainingArguments, Trainer

from transformers import AutoModelForSequenceClassification

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import json
from transformers import pipeline

In [3]:

dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english", trust_remote_code=True)

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sem_eval_2018_task_1.py:   0%|          | 0.00/6.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.98M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6838 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3259 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/886 [00:00<?, ? examples/s]

In [38]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [5]:
dataset['train'][0:3]

{'ID': ['2017-En-21441', '2017-En-31535', '2017-En-21068'],
 'Tweet': ["“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
  'Whatever you decide to do make sure it makes you #happy.',
  "@Max_Kellerman  it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS"],
 'anger': [False, False, True],
 'anticipation': [True, False, False],
 'disgust': [False, False, True],
 'fear': [False, False, False],
 'joy': [False, True, True],
 'love': [False, True, False],
 'optimism': [True, True, True],
 'pessimism': [False, False, False],
 'sadness': [False, False, False],
 'surprise': [False, False, False],
 'trust': [True, False, False]}

In [72]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
def preprocess_data(examples):

    ## This is supposed to be used with Batch data, 
    text = examples["Tweet"]
    
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
    
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    
    
    labels_matrix = np.zeros((len(text), len(labels)))
    # # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
    
    encoding["labels"] = labels_matrix.tolist()
    
    
    return encoding

In [12]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)


Map:   0%|          | 0/6838 [00:00<?, ? examples/s]

Map:   0%|          | 0/3259 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

In [13]:
print(preprocess_data(dataset['train'][1:3]))

{'input_ids': [[101, 3649, 2017, 5630, 2000, 2079, 2191, 2469, 2009, 3084, 2017, 1001, 3407, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1030, 4098, 1035, 16155, 2386, 2009, 2036, 7126, 2008, 1996, 3484, 1997, 5088, 7748, 2003, 1999, 23606, 1012, 2070, 1997, 3021, 1051, 1005, 9848, 1005, 1055, 2377, 4214, 2001, 10166, 1010, 999, 1001, 2175, 4502, 3215, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
example = encoded_dataset['train'][1]
tokenizer.decode(example['input_ids'])


'[CLS] whatever you decide to do make sure it makes you # happy. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [15]:
encoded_dataset.set_format("torch")


In [16]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",  
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
def multi_label_metrics(eval, threshold=0.5):
    # Apply sigmoid to raw logits
    logits, labels = eval
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(logits))
    
    # Convert probabilities to binary predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    
    # Compute metrics
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(labels, y_pred, average='micro')
    accuracy = accuracy_score(labels, y_pred)
    
    # Return metrics
    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'accuracy': accuracy
    }
    return metrics

# def compute_metrics(p: EvalPrediction):
#     # print(p)
#     # return None
#     logits, label  = p.predictions
#     result = multi_label_metrics(logits=preds, labels=p.label_ids)
#     return result

In [49]:
encoded_dataset['train'][0]['input_ids'].view(1,-1 )

tensor([[  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
          2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
         14354,  1001,  4105,  1001,  4737,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [50]:
input_ids = encoded_dataset['train'][0]['input_ids'].view(1,-1).to(device)
labels = encoded_dataset['train'][0]['labels'].view(1,-1).to(device)
outputs = model(input_ids=input_ids, labels=labels)


In [51]:
outputs

SequenceClassifierOutput(loss=tensor(0.6127, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-2.3106, -1.5618, -1.9044, -2.3219,  1.0570, -1.9216,  0.1789, -2.6584,
         -1.8617, -2.6606, -2.1735]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [52]:
batch_size = 128
metric_name = "f1"

args = TrainingArguments(
    f"bert-finetuned-multi-class-sentiment-analysis",
    evaluation_strategy = "epoch",
    report_to="none",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)



In [53]:

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=multi_label_metrics
)
     

  trainer = Trainer(


In [54]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.333577,0.659115,0.764136,0.251693
2,No log,0.325042,0.671029,0.771704,0.260722
3,No log,0.319011,0.680145,0.778184,0.268623
4,No log,0.318757,0.684467,0.781988,0.265237
5,No log,0.317469,0.686827,0.783608,0.266366


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=135, training_loss=0.291544426812066, metrics={'train_runtime': 431.2584, 'train_samples_per_second': 79.28, 'train_steps_per_second': 0.313, 'total_flos': 2249123476753920.0, 'train_loss': 0.291544426812066, 'epoch': 5.0})

In [55]:
trainer.evaluate(encoded_dataset["test"])


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 0.31458374857902527,
 'eval_f1': 0.67933390264731,
 'eval_roc_auc': 0.7780821195467488,
 'eval_accuracy': 0.2773857011353176,
 'eval_runtime': 16.5175,
 'eval_samples_per_second': 197.306,
 'eval_steps_per_second': 0.787,
 'epoch': 5.0}

In [56]:
model.save_pretrained("multi_label_bert")
tokenizer.save_pretrained("multi_label_bert")

('multi_label_bert/tokenizer_config.json',
 'multi_label_bert/special_tokens_map.json',
 'multi_label_bert/vocab.txt',
 'multi_label_bert/added_tokens.json',
 'multi_label_bert/tokenizer.json')

In [60]:
config = json.load(open("multi_label_bert/config.json"))
config["id2label"] = id2label
config["label2id"] = label2id
json.dump(config, open("multi_label_bert/config.json","w"))

In [81]:
model_fine_tuned = AutoModelForSequenceClassification.from_pretrained("multi_label_bert",  num_labels=len(labels), problem_type="multi_label_classification").to(device)
nlp = pipeline("text-classification", model=model_fine_tuned, tokenizer=tokenizer,return_all_scores=True)

example = "I'm happy I can finally train a model for multi-label classification"
ner_results = nlp(example)

print(ner_results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[[{'label': 'anger', 'score': 0.049309078603982925}, {'label': 'anticipation', 'score': 0.09762247651815414}, {'label': 'disgust', 'score': 0.06556984037160873}, {'label': 'fear', 'score': 0.03928935527801514}, {'label': 'joy', 'score': 0.9239577651023865}, {'label': 'love', 'score': 0.3200475871562958}, {'label': 'optimism', 'score': 0.7217321395874023}, {'label': 'pessimism', 'score': 0.045828986912965775}, {'label': 'sadness', 'score': 0.07204405218362808}, {'label': 'surprise', 'score': 0.063771553337574}, {'label': 'trust', 'score': 0.10227781534194946}]]




label
score
