In [None]:
# Setup notebook
!pip install setfit datasets
!pip install transformers==4.42.2

import evaluate
import pandas as pd
import wandb

from copy import deepcopy
from datasets import load_dataset, load_from_disk
from setfit import SetFitModel, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, hamming_loss

# Mount Google Drive at /content/drive; the dataset loaded later in this
# notebook is read from a path under that mount point.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Silence Weights & Biases: initialise it in "disabled" mode so that training
# in this notebook does not produce experiment-tracking output.
wandb.init(mode="disabled")

  and should_run_async(code)
  return LooseVersion(v) >= LooseVersion(check)


In [None]:
# Load the saved dataset from Drive and pull out its three named splits
dataset = load_from_disk("/content/drive/MyDrive/semeval_splits")
train_split, test_split, coref_split = (
    dataset[split_name] for split_name in ("train", "test", "coref")
)

# Create label lists
# Coarse-grained roles (values compared against sample["superlabel"]).
sups = ['Antagonist', 'Innocent', 'Protagonist']
# All 22 fine-grained roles, in the order used for the multi-hot targets.
subs = [
    'Bigot', 'Conspirator', 'Corrupt', 'Deceiver', 'Exploited',
    'Foreign Adversary', 'Forgotten', 'Guardian', 'Incompetent', 'Instigator',
    'Martyr', 'Peacemaker', 'Rebel', 'Saboteur', 'Scapegoat', 'Spy',
    'Terrorist', 'Traitor', 'Tyrant', 'Underdog', 'Victim', 'Virtuous'
]
# Fine-grained roles grouped by their coarse-grained role.
pros = ["Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous"]
# "Foreign Adversary" is a single two-word role (see `subs`). It has to stay
# one entry: split into "Foreign" and "Adversary" it never matches a sample's
# label and the list grows to 13 entries instead of the intended 12.
ants = [
    "Bigot", "Conspirator", "Corrupt", "Deceiver", "Foreign Adversary",
    "Incompetent", "Instigator", "Saboteur", "Spy", "Terrorist", "Traitor",
    "Tyrant"
]
inno = ["Exploited", "Forgotten", "Scapegoat", "Victim"]

# Sanity check: the three groups together must cover `subs` exactly once each.
assert sorted(pros + ants + inno) == sorted(subs), \
    "pros/ants/inno do not partition subs"

  and should_run_async(code)


In [None]:
# Configure evaluation
def compute_metrics(preds, refs):
    """Score predictions against references with three metrics.

    Args:
        preds: Predicted labels.
        refs: Reference (ground-truth) labels.

    Returns:
        dict with the keys "hamming_loss", "accuracy" and "f1", holding the
        results of hamming_loss, accuracy_score and micro-averaged f1_score,
        each called as metric(refs, preds).
    """
    return {
        "hamming_loss": hamming_loss(refs, preds),
        "accuracy": accuracy_score(refs, preds),
        "f1": f1_score(refs, preds, average="micro"),
    }

In [None]:
# Configure training parameters
# NOTE(review): the tuple values appear to be (embedding fine-tuning phase,
# classifier-head phase) settings — confirm against the installed SetFit version.
args = TrainingArguments(
    batch_size=(8, 2),  # recorded runs log "Batch size = 8" for the first phase
    num_epochs=(1, 5),  # recorded runs log "Num epochs = 1", then a 0/5 epoch bar
    eval_strategy="epoch",
    logging_strategy="epoch",
    sampling_strategy="undersampling",
    save_strategy="no",
    report_to="none"
)

# Variation 1: Classify only the 22 fine-grained sub-roles ("subs")

Implementation Source: https://huggingface.co/docs/setfit/en/index

In [None]:
# V1: Preprocess dataset for training
def preprocess(sample):
    """Build the V1 model input and a multi-hot target over every sub-role.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``subs`` (same
    order), set to 1 when that role equals ``label1``, ``label2`` or
    ``label3``; falsy label slots are skipped. The sample is changed in place
    and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    assigned = [sample[key] for key in ("label1", "label2", "label3") if sample[key]]
    sample["labels"] = [int(role in assigned) for role in subs]

    return sample

# V1: Run preprocess over the train, test and coref splits (in that order)
tokenized_train, tokenized_test, tokenized_coref = (
    split.map(preprocess) for split in (train_split, test_split, coref_split)
)

In [None]:
# V1: Train model
# The head width is derived from `subs` so it always equals the length of the
# multi-hot vector that preprocess stores under "labels".
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(subs)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
***** Running training *****
  Num unique pairs = 24282
  Batch size = 8
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.1788,0.243693


The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 8192.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Applying column mapping to the evaluation dataset
***** Running evaluation *****
Applying column mapping to the evaluation dataset
***** Running evaluation *****


{'hamming_loss': 0.06982872200263504, 'accuracy': 0.2028985507246377, 'f1': 0.29333333333333333}
{'hamming_loss': 0.07081686429512517, 'accuracy': 0.18840579710144928, 'f1': 0.29042904290429045}


# Variation 2: Classify all 25 labels jointly (3 super-roles "sups" + 22 sub-roles "subs")

In [None]:
# V2: Preprocess dataset for training
def preprocess(sample):
    """Build the V2 model input and a multi-hot target over sups followed by subs.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``sups + subs``
    (same order), set to 1 when that label equals ``superlabel``, ``label1``,
    ``label2`` or ``label3``; falsy slots are skipped. The sample is changed
    in place and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    label_keys = ("superlabel", "label1", "label2", "label3")
    assigned = [sample[key] for key in label_keys if sample[key]]
    sample["labels"] = [int(name in assigned) for name in sups + subs]

    return sample

# V2: Run preprocess over the train, test and coref splits (in that order)
tokenized_train, tokenized_test, tokenized_coref = (
    split.map(preprocess) for split in (train_split, test_split, coref_split)
)

  and should_run_async(code)


In [None]:
# V2: Train model
# The head width is derived from `sups + subs`, the same sequence preprocess
# iterates over, so it always equals the length of the "labels" vector.
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(sups + subs)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
***** Running training *****
  Num unique pairs = 140910
  Batch size = 8
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.0314,0.255328


The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 8192.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Applying column mapping to the evaluation dataset
***** Running evaluation *****
Applying column mapping to the evaluation dataset
***** Running evaluation *****


{'hamming_loss': 0.0736231884057971, 'accuracy': 0.0, 'f1': 0.4641350210970464}
{'hamming_loss': 0.07188405797101449, 'accuracy': 0.007246376811594203, 'f1': 0.4767932489451477}


# Variation 3: Two-stage — classify the 3 super-roles, then classify sub-roles within each super-role (6 protagonist / 12 antagonist / 4 innocent)

In [None]:
# V3: Preprocess dataset for training
def preprocess(sample):
    """Build the V3 model input and a multi-hot target over the super-roles.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``sups`` (same
    order), set to 1 for the entry matching ``sample["superlabel"]``. The
    sample is changed in place and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    targets = (sample["superlabel"],)
    sample["labels"] = [int(role in targets) for role in sups]

    return sample

# V3: Run preprocess over the train, test and coref splits (in that order)
tokenized_train, tokenized_test, tokenized_coref = (
    split.map(preprocess) for split in (train_split, test_split, coref_split)
)

  and should_run_async(code)


Map:   0%|          | 0/548 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

Map:   0%|          | 0/138 [00:00<?, ? examples/s]

In [None]:
# V3: Train model
# The head width is derived from `sups` so it always equals the length of the
# multi-hot vector that preprocess stores under "labels".
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(sups)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/548 [00:00<?, ? examples/s]

***** Running training *****
  Num unique pairs = 140910
  Batch size = 8
  Num epochs = 1


Epoch,Training Loss,Validation Loss
1,0.0286,0.242329


The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 8192.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Iteration:   0%|          | 0/274 [00:00<?, ?it/s]

Applying column mapping to the evaluation dataset
***** Running evaluation *****
Applying column mapping to the evaluation dataset
***** Running evaluation *****


{'hamming_loss': 0.16666666666666666, 'accuracy': 0.7318840579710145, 'f1': 0.7544483985765125}
{'hamming_loss': 0.1642512077294686, 'accuracy': 0.7391304347826086, 'f1': 0.7571428571428571}


# Variation 3a: Classify 6 protagonist roles

In [None]:
# V3-pros: Preprocess and tokenize training data
def preprocess(sample):
    """Build the model input and a multi-hot target over the protagonist roles.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``pros`` (same
    order), set to 1 when that role equals ``label1``, ``label2`` or
    ``label3``; falsy label slots are skipped. The sample is changed in place
    and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    assigned = [sample[key] for key in ("label1", "label2", "label3") if sample[key]]
    sample["labels"] = [int(role in assigned) for role in pros]

    return sample

# Keep only the rows whose superlabel is "Protagonist", then run preprocess
tokenized_train, tokenized_test, tokenized_coref = (
    split.filter(lambda row: row["superlabel"] == "Protagonist").map(preprocess)
    for split in (train_split, test_split, coref_split)
)

In [None]:
# V3-pros: Train model
# The head width is derived from `pros` so it always equals the length of the
# multi-hot vector that preprocess stores under "labels".
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(pros)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the filtered test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))

# Variation 3b: Classify 12 antagonist roles

In [None]:
# V3-ants: Preprocess and tokenize training data
def preprocess(sample):
    """Build the model input and a multi-hot target over the antagonist roles.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``ants`` (same
    order), set to 1 when that role equals ``label1``, ``label2`` or
    ``label3``; falsy label slots are skipped. The sample is changed in place
    and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    assigned = [sample[key] for key in ("label1", "label2", "label3") if sample[key]]
    sample["labels"] = [int(role in assigned) for role in ants]

    return sample

# Keep only the rows whose superlabel is "Antagonist", then run preprocess
tokenized_train, tokenized_test, tokenized_coref = (
    split.filter(lambda row: row["superlabel"] == "Antagonist").map(preprocess)
    for split in (train_split, test_split, coref_split)
)

In [None]:
# V3-ants: Train model
# The head width is derived from `ants` rather than hard-coded, so it cannot
# disagree with the length of the multi-hot vector that preprocess builds
# from that same list.
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(ants)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the filtered test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))

# Variation 3c: Classify 4 innocent roles

In [None]:
# V3-inno: Preprocess and tokenize training data
def preprocess(sample):
    """Build the model input and a multi-hot target over the innocent roles.

    ``sample["text"]`` is overwritten with ``[CLS]<text>[SEP]<mention>[SEP]``.
    ``sample["labels"]`` receives one 0/1 flag per entry of ``inno`` (same
    order), set to 1 when that role equals ``label1``, ``label2`` or
    ``label3``; falsy label slots are skipped. The sample is changed in place
    and the same object is returned.
    """
    sample["text"] = "[CLS]{}[SEP]{}[SEP]".format(sample["text"], sample["mention"])

    assigned = [sample[key] for key in ("label1", "label2", "label3") if sample[key]]
    sample["labels"] = [int(role in assigned) for role in inno]

    return sample

# Keep only the rows whose superlabel is "Innocent", then run preprocess
tokenized_train, tokenized_test, tokenized_coref = (
    split.filter(lambda row: row["superlabel"] == "Innocent").map(preprocess)
    for split in (train_split, test_split, coref_split)
)

In [None]:
# V3-inno: Train model
# The head width is derived from `inno` so it always equals the length of the
# multi-hot vector that preprocess stores under "labels".
model = SetFitModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5",
    multi_target_strategy="multi-output",
    use_differentiable_head=True,
    head_learning_rate=0.0001,
    hidden_dropout_prob=0.5,
    attention_probs_dropout_prob=0.5,
    head_params={"out_features": len(inno)},
    trust_remote_code=True
)

# column_mapping exposes the dataset's "labels" column under the name "label".
# NOTE(review): the test split is also used as the in-training eval set.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    metric=compute_metrics,
    column_mapping={"text": "text", "labels": "label"},
)

trainer.train()

# Report the compute_metrics scores on the filtered test and coref splits.
print(trainer.evaluate(tokenized_test))
print(trainer.evaluate(tokenized_coref))