In [1]:
# Setup notebook
!pip install datasets transformers evaluate sentencepiece accelerate

import evaluate
import numpy as np
import pandas as pd

from copy import deepcopy
from datasets import load_dataset, load_from_disk
from sklearn.metrics import accuracy_score, f1_score, hamming_loss
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

from google.colab import drive
drive.mount("/content/drive")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.

In [2]:
# Setup evaluation
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(results, threshold=0.3, pipeline=False):
    preds, labels = results
    preds = sigmoid(preds)

    # Predict argmax regardless of value
    for pred in preds:
        pred[np.argmax(pred)] = 1

    # Binarization
    preds = (preds > threshold).astype(int)
    refs = labels.astype(int)

    # Compute
    a = hamming_loss(refs, preds)
    b = accuracy_score(refs, preds)
    c = f1_score(refs, preds, average="micro")

    return {"hamming_loss": a, "accuracy": b, "f1": c}

In [3]:
# Load and split dataset
dataset = load_from_disk("/content/drive/MyDrive/semeval_splits")
train_split = dataset["train"]
test_split = dataset["test"]
coref_split = dataset["coref"]

# Load base model
model_path = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_path)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [4]:
# Configure training parameters
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/semeval_models",
    overwrite_output_dir=True,
    learning_rate=0.0001,
    num_train_epochs=5,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    report_to="none"
)

# Variation 1: Classify 22 fine-grain roles

Implementation Source: https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification

In [5]:
# V1: Preprocess and tokenize training data
classes = sorted([x for x in set(train_split["label1"]) if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    labels = [sample["label1"], sample["label2"], sample["label3"]]
    labels = [label for label in labels if label]
    multihot = [0.0 for _ in range(len(classes))]

    for label in labels:
        id = label2id[label]
        multihot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = multihot

    return sample

tokenized_train = train_split.map(preprocess)
tokenized_test = test_split.map(preprocess)
tokenized_coref = coref_split.map(preprocess)

In [6]:
# V1: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.327,0.2068,0.086298,0.086957,0.096552,3.2713,42.185,5.502
2,0.1966,0.199782,0.084321,0.101449,0.117241,3.2288,42.74,5.575
3,0.1941,0.201402,0.084321,0.101449,0.117241,3.2253,42.786,5.581
4,0.1893,0.202338,0.084321,0.101449,0.117241,3.2206,42.85,5.589
5,0.1905,0.202759,0.082345,0.115942,0.137931,3.2913,41.929,5.469


{'eval_loss': 0.2027592808008194, 'eval_hamming_loss': 0.08234519104084322, 'eval_accuracy': 0.11594202898550725, 'eval_f1': 0.13793103448275862, 'eval_runtime': 3.2711, 'eval_samples_per_second': 42.188, 'eval_steps_per_second': 5.503, 'epoch': 5.0}
{'eval_loss': 0.20270425081253052, 'eval_hamming_loss': 0.08234519104084322, 'eval_accuracy': 0.11594202898550725, 'eval_f1': 0.13793103448275862, 'eval_runtime': 3.2181, 'eval_samples_per_second': 42.882, 'eval_steps_per_second': 5.593, 'epoch': 5.0}


# Variation 2: Classify 25 types + fine-grain roles

In [7]:
# V2: Preprocess and tokenize training data
classes = set(train_split["label1"] + train_split["superlabel"])
classes = sorted([x for x in classes if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    labels = [
        sample["superlabel"],
        sample["label1"],
        sample["label2"],
        sample["label3"]
    ]
    labels = [label for label in labels if label]
    multihot = [0.0 for _ in range(len(classes))]

    for label in labels:
        id = label2id[label]
        multihot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = multihot

    return sample

tokenized_train = train_split.map(preprocess)
tokenized_test = test_split.map(preprocess)
tokenized_coref = coref_split.map(preprocess)

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [8]:
# V2: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.2777,0.222952,0.067826,0.0,0.453271,3.2385,42.612,5.558
2,0.2242,0.221978,0.067826,0.0,0.453271,3.2289,42.739,5.575
3,0.2216,0.218579,0.067826,0.0,0.453271,3.2285,42.744,5.575
4,0.2165,0.216569,0.067826,0.0,0.453271,3.234,42.672,5.566
5,0.2126,0.215861,0.067826,0.0,0.453271,3.3119,41.668,5.435


{'eval_loss': 0.21586087346076965, 'eval_hamming_loss': 0.06782608695652174, 'eval_accuracy': 0.0, 'eval_f1': 0.4532710280373832, 'eval_runtime': 3.2661, 'eval_samples_per_second': 42.253, 'eval_steps_per_second': 5.511, 'epoch': 5.0}
{'eval_loss': 0.21585378050804138, 'eval_hamming_loss': 0.06782608695652174, 'eval_accuracy': 0.0, 'eval_f1': 0.4532710280373832, 'eval_runtime': 3.2401, 'eval_samples_per_second': 42.591, 'eval_steps_per_second': 5.555, 'epoch': 5.0}


# Variation 3: Classify 3 types -> Classify 6/12/4 fine-grain roles

In [9]:
# V3: Preprocess and tokenize training data
classes = sorted([x for x in set(train_split["superlabel"]) if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    label = sample["superlabel"]
    onehot = [0.0 for i in range(len(classes))]
    id = label2id[label]
    onehot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = onehot

    return sample

tokenized_train = train_split.map(preprocess)
tokenized_test = test_split.map(preprocess)
tokenized_coref = coref_split.map(preprocess)

In [10]:
# V3: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.5156,0.484701,0.198068,0.702899,0.702899,3.228,42.751,5.576
2,0.492,0.493777,0.198068,0.702899,0.702899,3.2247,42.794,5.582
3,0.4932,0.487896,0.198068,0.702899,0.702899,3.2307,42.715,5.572
4,0.4943,0.481178,0.198068,0.702899,0.702899,3.2328,42.687,5.568
5,0.4918,0.481105,0.198068,0.702899,0.702899,3.2755,42.132,5.495


{'eval_loss': 0.4811045229434967, 'eval_hamming_loss': 0.19806763285024154, 'eval_accuracy': 0.7028985507246377, 'eval_f1': 0.7028985507246377, 'eval_runtime': 3.2522, 'eval_samples_per_second': 42.433, 'eval_steps_per_second': 5.535, 'epoch': 5.0}
{'eval_loss': 0.481092631816864, 'eval_hamming_loss': 0.19806763285024154, 'eval_accuracy': 0.7028985507246377, 'eval_f1': 0.7028985507246377, 'eval_runtime': 3.2107, 'eval_samples_per_second': 42.982, 'eval_steps_per_second': 5.606, 'epoch': 5.0}


# Variation 3a: Classify 6 protagonist roles

In [11]:
# V3-pros: Preprocess and tokenize training data
filtered = train_split.filter(lambda x: x["superlabel"] == "Protagonist")
classes = sorted([x for x in set(filtered["label1"]) if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    labels = [sample["label1"], sample["label2"], sample["label3"]]
    labels = [label for label in labels if label]
    multihot = [0.0 for _ in range(len(classes))]

    for label in labels:
        id = label2id[label]
        multihot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = multihot

    return sample

tokenized_train = filtered.map(preprocess)
tokenized_test = test_split.filter(lambda x: x["superlabel"] == "Protagonist").map(preprocess)
tokenized_coref = coref_split.filter(lambda x: x["superlabel"] == "Protagonist").map(preprocess)

In [12]:
# V3-pros: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.5017,0.460882,0.226667,0.36,0.346154,0.6112,40.904,6.545
2,0.4625,0.459496,0.226667,0.36,0.346154,0.6184,40.426,6.468
3,0.4439,0.456042,0.226667,0.36,0.346154,0.6174,40.492,6.479
4,0.4504,0.455219,0.226667,0.36,0.346154,0.6087,41.071,6.571
5,0.4169,0.453016,0.226667,0.36,0.346154,0.6427,38.896,6.223


{'eval_loss': 0.45301613211631775, 'eval_hamming_loss': 0.22666666666666666, 'eval_accuracy': 0.36, 'eval_f1': 0.34615384615384615, 'eval_runtime': 0.6424, 'eval_samples_per_second': 38.916, 'eval_steps_per_second': 6.227, 'epoch': 5.0}
{'eval_loss': 0.4565366506576538, 'eval_hamming_loss': 0.22666666666666666, 'eval_accuracy': 0.36, 'eval_f1': 0.34615384615384615, 'eval_runtime': 0.6049, 'eval_samples_per_second': 41.33, 'eval_steps_per_second': 6.613, 'epoch': 5.0}


# Variation 3b: Classify 12 antagonist roles

In [13]:
# V3-ants: Preprocess and tokenize training data
filtered = train_split.filter(lambda x: x["superlabel"] == "Antagonist")
classes = sorted([x for x in set(filtered["label1"]) if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    labels = [sample["label1"], sample["label2"], sample["label3"]]
    labels = [label for label in labels if label]
    multihot = [0.0 for _ in range(len(classes))]

    for label in labels:
        id = label2id[label]
        multihot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = multihot

    return sample

tokenized_train = filtered.map(preprocess)
tokenized_test = test_split.filter(lambda x: x["superlabel"] == "Antagonist").map(preprocess)
tokenized_coref = coref_split.filter(lambda x: x["superlabel"] == "Antagonist").map(preprocess)

In [14]:
# V3-ants: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.3381,0.296809,0.147766,0.14433,0.165049,2.2604,42.912,5.751
2,0.2906,0.297339,0.146048,0.154639,0.174757,2.2526,43.062,5.771
3,0.2841,0.288966,0.135739,0.216495,0.23301,2.2516,43.081,5.774
4,0.2767,0.282003,0.137457,0.206186,0.223301,2.256,42.997,5.762
5,0.2696,0.280307,0.140893,0.206186,0.203883,2.2927,42.309,5.67


{'eval_loss': 0.28030669689178467, 'eval_hamming_loss': 0.140893470790378, 'eval_accuracy': 0.20618556701030927, 'eval_f1': 0.20388349514563106, 'eval_runtime': 2.2984, 'eval_samples_per_second': 42.204, 'eval_steps_per_second': 5.656, 'epoch': 5.0}
{'eval_loss': 0.28074267506599426, 'eval_hamming_loss': 0.140893470790378, 'eval_accuracy': 0.1958762886597938, 'eval_f1': 0.20388349514563106, 'eval_runtime': 2.2535, 'eval_samples_per_second': 43.044, 'eval_steps_per_second': 5.769, 'epoch': 5.0}


# Variation 3c: Classify 4 innocent roles

In [15]:
# V3-inno: Preprocess and tokenize training data
filtered = train_split.filter(lambda x: x["superlabel"] == "Innocent")
classes = sorted([x for x in set(filtered["label1"]) if x])
label2id = {x:i for i, x in enumerate(classes)}
id2label = {i:x for x, i in label2id.items()}

def preprocess(sample):
    text = f"""[CLS]{sample["text"]}[SEP]{sample["mention"]}[SEP]"""
    labels = [sample["label1"], sample["label2"], sample["label3"]]
    labels = [label for label in labels if label]
    multihot = [0.0 for _ in range(len(classes))]

    for label in labels:
        id = label2id[label]
        multihot[id] = 1.0

    sample = tokenizer(text, truncation=True)
    sample["labels"] = multihot

    return sample

tokenized_train = filtered.map(preprocess)
tokenized_test = test_split.filter(lambda x: x["superlabel"] == "Innocent").map(preprocess)
tokenized_coref = coref_split.filter(lambda x: x["superlabel"] == "Innocent").map(preprocess)

In [16]:
# V3-ant: Instantiate and train model
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(classes),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

print(trainer.evaluate(eval_dataset=tokenized_test))
print(trainer.evaluate(eval_dataset=tokenized_coref))

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Hamming Loss,Accuracy,F1,Runtime,Samples Per Second,Steps Per Second
1,0.4812,0.349086,0.125,0.75,0.75,0.3622,44.173,5.522
2,0.3646,0.347005,0.125,0.75,0.75,0.3635,44.015,5.502
3,0.3633,0.345989,0.125,0.75,0.75,0.3702,43.218,5.402
4,0.3667,0.339137,0.125,0.75,0.75,0.3664,43.664,5.458
5,0.3509,0.32715,0.125,0.75,0.75,0.3968,40.325,5.041


{'eval_loss': 0.32714980840682983, 'eval_hamming_loss': 0.125, 'eval_accuracy': 0.75, 'eval_f1': 0.75, 'eval_runtime': 0.3967, 'eval_samples_per_second': 40.329, 'eval_steps_per_second': 5.041, 'epoch': 5.0}
{'eval_loss': 0.32740241289138794, 'eval_hamming_loss': 0.125, 'eval_accuracy': 0.75, 'eval_f1': 0.75, 'eval_runtime': 0.3704, 'eval_samples_per_second': 43.201, 'eval_steps_per_second': 5.4, 'epoch': 5.0}
