In [1]:
from datasets import Dataset, load_from_disk

dataset = load_from_disk("emotion_datasets/src/data/CancerEmo")
print(dataset)
print(dataset[0])

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust'],
    num_rows: 11642
})
{'text': 'And it will no doubt make me happy in the morning as well!', 'anger': False, 'anticipation': None, 'disgust': None, 'fear': None, 'joy': None, 'sadness': None, 'surprise': None, 'trust': None}


In [2]:
emotion_labels = [label for label in dataset.features.keys() if label != "text"]

def convert_labels(example):
    example["labels"] = [float(bool(example[label])) for label in emotion_labels]
    return example
dataset = dataset.map(convert_labels)
print(dataset[5])

Map: 100%|██████████| 11642/11642 [00:01<00:00, 8567.54 examples/s]

{'text': 'doctors were surprised and pleased with my range of motion at my first post-surgical follow-up.', 'anger': False, 'anticipation': None, 'disgust': None, 'fear': None, 'joy': True, 'sadness': None, 'surprise': None, 'trust': None, 'labels': [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]}





In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128) # We can play a bit with this value, doesnt have to be 128
tokenized_dataset = dataset.map(tokenize_function, batched=True)

print(tokenized_dataset[0])

Map: 100%|██████████| 11642/11642 [00:00<00:00, 12340.18 examples/s]

{'text': 'And it will no doubt make me happy in the morning as well!', 'anger': False, 'anticipation': None, 'disgust': None, 'fear': None, 'joy': None, 'sadness': None, 'surprise': None, 'trust': None, 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'input_ids': [101, 1998, 2009, 2097, 2053, 4797, 2191, 2033, 3407, 1999, 1996, 2851, 2004, 2092, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 




In [4]:
from transformers import AutoModelForSequenceClassification

model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           problem_type="multi_label_classification"
                                                           , num_labels=len(tokenized_dataset[0]['labels']))

print(model.config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "multi_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  

# Step 3 options

In [5]:
# # Freeze all layers except the classifier
# for param in model.bert.parameters():
#     param.requires_grad = False

# # Keep only the classification head trainable
# for param in model.classifier.parameters():
#     param.requires_grad = True

print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

Trainable parameters: 109488392


# Back to step 2

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",         
    eval_strategy="epoch",     
    save_strategy="epoch",
    learning_rate=5e-5,          
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    num_train_epochs=2,    
    weight_decay=0.01,                
    save_total_limit=2,  
    load_best_model_at_end=True,    
    logging_dir="./logs",
    logging_steps=100, 
    fp16=True
)

print(training_args)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
def compute_metrics(pred):

    logits = pred.predictions
    labels = pred.label_ids
    probs = 1 / (1 + np.exp(-logits))         # sigmoid
    preds = (probs > 0.5).astype(int)         # threshold at 0.5 so below or above for 0-1

    exact_match = (preds == labels).all(axis=1).mean() 

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision_micro": precision_score(labels, preds, average="micro"),
        "recall_micro": recall_score(labels, preds, average="micro"),
        "subset_accuracy": exact_match
    }


In [17]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # This is a example to get batches with padding inside

In [None]:
# Making a train and test split for the dataset being 80-20
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
print(split_dataset)
print(split_dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1164
    })
    test: Dataset({
        features: ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10478
    })
})
{'text': 'On the website look for Cancer in the upper left of the main page.', 'anger': None, 'anticipation': None, 'disgust': None, 'fear': False, 'joy': None, 'sadness': None, 'surprise': None, 'trust': None, 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'input_ids': [101, 2006, 1996, 4037, 2298, 2005, 4456, 1999, 1996, 3356, 2187, 1997, 1996, 2364, 3931, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
trainer = Trainer(
    model=model,                        
    args=training_args,                 
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,        
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro,Precision Micro,Recall Micro,Subset Accuracy
1,No log,0.21678,0.527574,0.232491,0.715985,0.417665,0.492365
2,0.213300,0.215619,0.548739,0.262053,0.671091,0.464122,0.489979


TrainOutput(global_step=146, training_loss=0.20448290158624519, metrics={'train_runtime': 150.4452, 'train_samples_per_second': 15.474, 'train_steps_per_second': 0.97, 'total_flos': 153138883608576.0, 'train_loss': 0.20448290158624519, 'epoch': 2.0})