# Data preparation

In [1]:
from datasets import load_from_disk

def new_column(example):
    """Copy the ``labels`` field of one example into a new ``ner_tags`` field.

    The example is modified in place and the same object is returned, so the
    function can be used directly as a ``map`` callback.
    """
    role_ids = example["labels"]
    example["ner_tags"] = role_ids
    return example

# Fixed seed so both splits are reproducible across kernel restarts. Without it
# the held-out test rows change on every run, and a checkpoint saved by an
# earlier run could be evaluated on rows it was trained on.
SPLIT_SEED = 42

data = load_from_disk("dataset.hf")
id_column = range(data.num_rows)
data = data.add_column("id", id_column)
data = data.map(new_column)

# Hold out 10% of all rows as the final test set.
data = data.train_test_split(test_size=0.1, seed=SPLIT_SEED)
test_data = data["test"]
# Split the remaining rows again: 80% for training and 20% for validation
# (the validation part is stored under the "test" key of the DatasetDict).
data = data["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

print(data, test_data)

Loading cached processed dataset at /mnt/c/Users/perry/Documents/uni/Master/CompSem/project/ComputationalSemantics/dataset.hf/cache-cf6e09deaae4883c.arrow


DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'origin', 'id', 'ner_tags'],
        num_rows: 6654
    })
    test: Dataset({
        features: ['tokens', 'labels', 'origin', 'id', 'ner_tags'],
        num_rows: 1664
    })
}) Dataset({
    features: ['tokens', 'labels', 'origin', 'id', 'ner_tags'],
    num_rows: 925
})


In [2]:
from transformers import AutoTokenizer

# Tokenizer matching the base checkpoint that is fine-tuned further below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Sanity check on one training example. The sentence is already a list of
# words, hence is_split_into_words=True.
example = data["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# Convert the ids back to token strings to inspect the added special tokens.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]', 'either', 'you', 'or', 'i', 'am', 'wrong', '.', '[SEP]']

In [17]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            sentence) and an "ner_tags" entry (one list of integer labels per
            sentence, one label per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence holding -100 for tokens that belong to no word
        (special tokens) and, for every other token, the label of its word.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        # Special tokens have no word index and get -100; all sub-tokens of a
        # word share that word's label.
        labels.append(
            [-100 if word_idx is None else label[word_idx] for word_idx in word_ids]
        )

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize the train/validation DatasetDict and the held-out test set in
# batches. The "labels" column returned by the function holds token-level
# labels (with -100 on special tokens); "ner_tags" keeps the word-level ones.
tokenized_data = data.map(tokenize_and_align_labels, batched=True)
tokenized_test_data = test_data.map(tokenize_and_align_labels, batched=True)
tokenized_data["train"][0]

{'tokens': ['Her', 'bicycle', 'is', 'blue', '.'],
 'labels': [-100, 0, 0, 0, 0, 0, -100],
 'origin': [None, None, None, None, None],
 'id': 3851,
 'ner_tags': [0, 0, 0, 0, 0],
 'input_ids': [101, 2014, 10165, 2003, 2630, 1012, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [4]:
from transformers import DataCollatorForTokenClassification

# Collator passed to the Trainer instances below to assemble token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Evaluation method

In [5]:
def filter_and_group_lists(first_list, second_list):
    """Drop ignored positions and bucket ``second_list`` by runs of ``first_list``.

    Positions whose value in ``first_list`` is -100 or 0 are skipped. Every
    maximal run of consecutive, identical remaining values contributes that
    value once to the first result, and the matching items of ``second_list``
    (as one list) to the second result. Neighbouring positions only end up in
    separate groups if their values differ or a skipped position lies between
    them.

    Returns:
        A ``(values, groups)`` tuple of two lists.
    """
    kept_values = []
    groups = []
    open_group = []
    last_seen = None

    for marker, item in zip(first_list, second_list):
        is_skipped = marker == -100 or marker == 0
        if not is_skipped:
            if marker == last_seen:
                # Same run as the previous position: extend the open group.
                open_group.append(item)
            else:
                # A new run starts: flush the finished group, then open a new one.
                if open_group:
                    groups.append(open_group)
                open_group = [item]
                kept_values.append(marker)
        last_seen = marker

    # Flush the group that was still open when the input ended.
    if open_group:
        groups.append(open_group)

    return kept_values, groups

# Worked example: positions labelled -100 or 0 are dropped, and consecutive
# equal labels (the two 1s at indices 3-4) collapse into a single group.
first_list = [-100, 2, 0, 1, 1, 0, 0, 2, 0, 1, 2, -100]
second_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L']

filtered_first, grouped_second = filter_and_group_lists(first_list, second_list)
print("Filtered First List:", filtered_first)
print("Grouped Second List:", grouped_second)


Filtered First List: [2, 1, 2, 1, 2]
Grouped Second List: [['B'], ['D', 'E'], ['H'], ['J'], ['K']]


In [6]:
import numpy as np

def getFinalPrediction(predictions):
    """Reduce a group of per-token predictions to one label by majority vote.

    Zero predictions are ignored. The most frequent remaining label wins (the
    smallest such label on a tie); if nothing remains, label 1 is returned.
    """
    votes = np.array(predictions)
    tally = np.bincount(votes[votes != 0])
    return np.argmax(tally) if tally.size != 0 else 1

# Quick check: 9 is the most frequent non-zero value in this list.
print(getFinalPrediction([1, 2, 2, 3, 0, 4, 9, 9, 9]))

9


In [7]:
import evaluate
import numpy as np

# Metric object used by compute_metrics.
seqeval = evaluate.load("seqeval")

# Label names indexed by class id: index 0 is "O", the remaining positions
# follow the id assignment shown in the commented mapping.
#mapping = {"Theme": 1, "Agent": 2, "Patient": 3, "Experiencer": 4, "Co-Theme": 5, "Stimulus": 6, "Location": 7, "Destination": 8}
label_list = [
    "O",
    "Theme",
    "Agent",
    "Patient",
    "Experiencer",
    "Co-Theme",
    "Stimulus",
    "Location",
    "Destination",
]

# Label names of the sample sentence picked in the tokenizer check cell.
labels = [label_list[i] for i in example["ner_tags"]]

def compute_metrics(p):
    """Compute seqeval scores for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores (the argmax is taken over axis 2) and ``labels`` the
            per-token gold class ids.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        overall results reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_labels = []
    true_predictions = []
    for gold_row, predicted_row in zip(gold_ids, predicted_ids):
        # Keep only positions with a gold label other than -100/0 and collect
        # the predictions of each run of equal gold labels into one group.
        kept_gold, grouped_predictions = filter_and_group_lists(gold_row, predicted_row)
        # One voted prediction per group, so both sequences have equal length.
        voted = [getFinalPrediction(group) for group in grouped_predictions]

        true_labels.append([label_list[label_id] for label_id in kept_gold])
        true_predictions.append([label_list[label_id] for label_id in voted])

    # NOTE(review): seqeval is built around IOB-style tags, while these label
    # names carry no B-/I- prefix - confirm that the entity-level precision and
    # recall it reports mean what is intended here.
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Training

In [8]:
# Thematic-role names and their class ids; id 0 is reserved for the "O" label.
mapping = {
    "Theme": 1,
    "Agent": 2,
    "Patient": 3,
    "Experiencer": 4,
    "Co-Theme": 5,
    "Stimulus": 6,
    "Location": 7,
    "Destination": 8,
}

# Forward and reverse lookup tables handed to the model configuration.
label2id = {"O": 0, **mapping}
id2label = {class_id: name for name, class_id in label2id.items()}
print(id2label)
print(label2id)

{0: 'O', 1: 'Theme', 2: 'Agent', 3: 'Patient', 4: 'Experiencer', 5: 'Co-Theme', 6: 'Stimulus', 7: 'Location', 8: 'Destination'}
{'O': 0, 'Theme': 1, 'Agent': 2, 'Patient': 3, 'Experiencer': 4, 'Co-Theme': 5, 'Stimulus': 6, 'Location': 7, 'Destination': 8}


In [9]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Base checkpoint with a freshly initialised token-classification head. The
# number of output classes is derived from the label map so the two cannot
# drift apart when roles are added or removed.
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label2id), id2label=id2label, label2id=label2id
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream

In [13]:
# Training parameters
training_args = TrainingArguments(
    output_dir="thematic_role_model",  # checkpoints are written below this directory
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,  # NOTE(review): the recorded run was interrupted after 5 epochs
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate on the validation split after every epoch
    save_strategy="epoch",  # write a checkpoint after every epoch
    save_total_limit=4,  # NOTE(review): presumably caps the number of kept checkpoints - confirm
#     load_best_model_at_end=True,
    push_to_hub=False,
)

# Train on the "train" part and validate on the "test" part of the second
# split; the separately held-out test_data is not used here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.284626,0.890338,0.872902,0.881534,0.890395
2,0.517800,0.223722,0.915309,0.898481,0.906817,0.915504
3,0.180600,0.22611,0.920016,0.905675,0.91279,0.920686
4,0.102000,0.23128,0.921042,0.904476,0.912684,0.921881
5,0.059200,0.249127,0.92567,0.910871,0.918211,0.925468


2509




2509




2509




2509




2509




KeyboardInterrupt: 

# Test on separate part of the dataset

In [11]:
# Reload a saved checkpoint and score it on the held-out test split.
# NOTE(review): the checkpoint path is hard-coded - confirm that checkpoint-832
# is the intended one and that it was trained with the same data split that
# produced tokenized_test_data.
trained_model = AutoModelForTokenClassification.from_pretrained("thematic_role_model/checkpoint-832")
testing_args = TrainingArguments(
    output_dir="./eval_output",
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    eval_steps=100,  # Adjust as needed
)
# No training set is passed: this Trainer is only used for the evaluate() call.
# Note that it rebinds the name `trainer` used by the training cell.
trainer = Trainer(
    model=trained_model,
    args=testing_args,
    train_dataset=None,
    eval_dataset=tokenized_test_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.evaluate()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1417




{'eval_loss': 0.12175111472606659,
 'eval_precision': 0.9472182596291013,
 'eval_recall': 0.9411764705882353,
 'eval_f1': 0.9441876999644507,
 'eval_accuracy': 0.9477769936485533,
 'eval_runtime': 7.3443,
 'eval_samples_per_second': 125.948,
 'eval_steps_per_second': 15.795}

# Inference
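This block performs inference on a given sentence. Note that it returns one label per tokenizer token (sub-word piece), not one label per word. The tokenizer always adds a special token at the beginning and at the end of each sentence; the predicted labels for these two tokens are removed before printing. For most simple sentences, one word corresponds to one token, but words such as "didn't" are split into several tokens and therefore receive several labels.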

In [21]:
import torch
from transformers import AutoTokenizer

# Alternative example sentence:
# text = "I deserve to know the truth."
text = "Tom didn't know when Mary had come to Boston."

# Load the tokenizer stored with the fine-tuned checkpoint and encode the raw
# sentence as PyTorch tensors. This rebinds the notebook-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("thematic_role_model/checkpoint-832")
inputs = tokenizer(text, return_tensors="pt")

from transformers import AutoModelForTokenClassification

# Load the fine-tuned model (rebinds the notebook-level `model`) and run a
# forward pass without gradient tracking.
model = AutoModelForTokenClassification.from_pretrained("thematic_role_model/checkpoint-832")
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring class per token and translate ids to label names.
predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]

print(text)
# Drop the first and last entries, which belong to the special tokens.
print(predicted_token_class[1:-1])

Tom didn't know when Mary had come to Boston.
['Experiencer', 'O', 'O', 'O', 'O', 'O', 'Theme', 'O', 'O', 'O', 'Destination', 'O']
