In [1]:
%load_ext autoreload
%autoreload 2

In [70]:

import sys
import os
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), ".."))
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding
)
import torch.nn as nn
from certainty import (
    load_file, load_events, seed_everything, CACHE_DIR, RANDOM_SEED)
import evaluate


In [71]:
seqeval = evaluate.load('seqeval')

Using the latest cached version of the module from /home/peder/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--seqeval/541ae017dc683f85116597d48f621abc7b21b88dc42ec937c71af5415f0af63c (last modified on Thu Mar  7 10:47:46 2024) since it couldn't be found locally at evaluate-metric--seqeval, or remotely on the Hugging Face Hub.


In [74]:
data = load_file('en_train.json')

In [329]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased', local_only=True, cache_dir=CACHE_DIR)

In [336]:
samples = []
for sample in data:
    prefix = "Label event triggers: "
    sample['text'] = prefix + sample['text']
    sample['prefix_len'] = len(prefix)
    enc = tokenizer(sample['text'], add_special_tokens=False, padding='max_length', truncation=True, max_length=256, return_offsets_mapping=True)
    sample = {**enc, **sample}
    labels = [0 if (0, 0) != offset else -100 for offset in sample['offset_mapping']]
    events = sample['events']
    prefix_len = sample['prefix_len']
    
    trigger = []
    
    for event in events:
        for i, offset in enumerate(sample['offset_mapping']):
            span = event['trigger'][1][0]
            (start, end) = map(int, span.split(":"))
            start += prefix_len
            end += prefix_len
            if start == offset[0]:
                labels[i] = 1
                trigger = [i]
            elif start <= offset[0] and end >= offset[1]:
                labels[i] = 2
                trigger.append(i)
            else:
                if (trigger):
                    event['decoded_trigger'] = tokenizer.decode(sample['input_ids'][trigger[0]:trigger[-1]+1])
                trigger = []
                
    sample['labels'] = labels
    samples.append(sample)

In [337]:
label_list = ["O", "B-trigger", "I-trigger"]

label2id_trigger = {label: idx for idx, label in enumerate(label_list)}
id2label_trigger = {v: k for k, v in label2id_trigger.items()}
    

In [341]:
encoded = []
for sample in samples:
    sample = {key: sample[key] for key in ['input_ids', 'attention_mask', 'labels']}
    encoded.append(sample)


In [342]:
train_set = Dataset.from_list(encoded)

In [343]:
train_set

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6751
})

In [344]:
def compute_metrics_trigger(eval_pred):
    preds, label_ids = eval_pred
    prediction_ids = np.argmax(preds, axis=2)
    pred_labels = []
    true_labels = []
    for label_seq, pred_seq in zip(label_ids, prediction_ids):
        pred_row = []
        label_row = []
        for l, p in zip(label_seq, pred_seq):
            if l != -100:
                pred_row.append(id2label_trigger[p])
                label_row.append(id2label_trigger[l])
        pred_labels.append(pred_row)
        true_labels.append(label_row)

    results = seqeval.compute(predictions=pred_labels, references=true_labels)['trigger']
    return {"trigger_f1": results['f1'],
            "trigger_precision": results['precision'],
            "trigger_recall": results['recall']}

In [349]:
model_trigger = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=3, id2label=id2label_trigger, label2id=label2id_trigger
)

training_args_trigger = TrainingArguments(
    output_dir="../models/trigger" + str(42),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    lr_scheduler_type='reduce_lr_on_plateau',
    learning_rate=0.00005,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01
)

trainer_trigger = Trainer(
    model=model_trigger,
    args=training_args_trigger,
    train_dataset=train_set,
    eval_dataset=train_set,
    compute_metrics=compute_metrics_trigger,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]  # Stop after 3 epochs without improvement
)



Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_trigger = Trainer(


In [350]:
trainer_trigger.train()

Epoch,Training Loss,Validation Loss,Trigger F1,Trigger Precision,Trigger Recall
1,No log,0.07284,0.0,0.0,0.0
2,No log,0.061171,0.0,0.0,0.0
3,0.074600,0.051513,0.012653,0.592593,0.006395
4,0.074600,0.043078,0.010285,0.5,0.005196
5,0.050700,0.036878,0.033308,0.5375,0.017186
6,0.050700,0.032217,0.0,0.0,0.0
7,0.050700,0.0277,0.054484,0.510638,0.028777
8,0.036900,0.026402,0.0,0.0,0.0
9,0.036900,0.026141,0.085756,0.472,0.047162
10,0.029000,0.02362,0.0,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2110, training_loss=0.04668721971918621, metrics={'train_runtime': 4652.0169, 'train_samples_per_second': 14.512, 'train_steps_per_second': 0.454, 'total_flos': 4410275133312000.0, 'train_loss': 0.04668721971918621, 'epoch': 10.0})