In [1]:
from datasets import load_dataset, load_metric
dataset = load_dataset("surrey-nlp/PLOD-CW")
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=4)

short_dataset = dataset["train"]
val_dataset = dataset["validation"]
test_dataset = dataset["test"]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [86]:
# given max token length because required not to overflow when training!
# truncation added as seems to be required for max token length
# tokenized_input = tokenizer(short_dataset["tokens"], is_split_into_words=True, max_length=512, truncation=True)
tokenized_input = tokenizer(short_dataset["tokens"], is_split_into_words=True)

# Example single sentence example.
for token in tokenized_input["input_ids"]:
    print(tokenizer.convert_ids_to_tokens(token))
    break

for idx, token in enumerate(tokenized_input["input_ids"]):
    if len(token) > 512:
        print("Tokens at idx ",idx," are longer than 512!!")
        print(print(tokenizer.convert_ids_to_tokens(token)))


['[CLS]', 'for', 'this', 'purpose', 'the', 'gothenburg', 'young', 'persons', 'empowerment', 'scale', '(', 'g', '##ype', '##s', ')', 'was', 'developed', '.', '[SEP]']
Tokens at idx  286  are longer than 512!!
['[CLS]', '(', 'e', '##gf', ',', 'ep', '##ider', '##mal', 'growth', 'factor', ';', 't', '##gf', ',', 'transforming', 'growth', 'factor', ';', 'bt', '##c', ',', 'beta', '##cel', '##lu', '##lin', ';', 'h', '##b', '-', 'e', '##gf', ',', 'he', '##par', '##in', '-', 'binding', 'ep', '##ider', '##mal', 'growth', 'factor', '(', 'e', '##gf', ')', '-', 'like', 'growth', 'factor', ';', 'er', '##eg', ',', 'ep', '##ire', '##gul', '##in', ';', 'nr', '##g', '##1', ',', 'ne', '##ure', '##gul', '##in', '-', '1', ';', 'nr', '##g', '##2', ',', 'ne', '##ure', '##gul', '##in', '-', '2', ';', 'nr', '##g', '##3', ',', 'ne', '##ure', '##gul', '##in', '-', '3', ';', 'nr', '##g', '##4', ',', 'ne', '##ure', '##gul', '##in', '-', '4', ';', 'plc', '##Î³', ',', 'ph', '##os', '##ph', '##oli', '##pas', '##e', 'c

TypeError: 'Dataset' object doesn't support item deletion

In [70]:
label_encoding = {"B-O": 0, "B-AC": 1, "B-LF": 2, "I-LF": 3}
id2label = {v:k for k,v in label_encoding.items()}

label_list = []
for sample in short_dataset["ner_tags"]:
    label_list.append([label_encoding[tag] for tag in sample])

val_label_list = []
for sample in val_dataset["ner_tags"]:
    val_label_list.append([label_encoding[tag] for tag in sample])

test_label_list = []
for sample in test_dataset["ner_tags"]:
    test_label_list.append([label_encoding[tag] for tag in sample])


In [71]:
def tokenize_and_align_labels(short_dataset, list_name):
    tokenized_inputs = tokenizer(short_dataset["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    for i, label in enumerate(list_name):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [72]:
tokenized_datasets = tokenize_and_align_labels(short_dataset, label_list)
tokenized_val_datasets = tokenize_and_align_labels(val_dataset, val_label_list)
tokenized_test_datasets = tokenize_and_align_labels(test_dataset, test_label_list)

In [73]:
# BERT's tokenizer returns the dataset in the form of a dictionary of lists (sentences). 
# we have to convert it into a list of dictionaries for training.
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

In [74]:
tokenised_train = turn_dict_to_list_of_dict(tokenized_datasets)
tokenised_val = turn_dict_to_list_of_dict(tokenized_val_datasets)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test_datasets)

In [75]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)

In [76]:
import numpy as np

metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [77]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training arguments (feel free to play arround with these values)
# model_name = "bert-base-uncased"
# epochs = 6
# batch_size = 4
# learning_rate = 2e-5

# args = TrainingArguments(
#     output_dir="distilBERT-finetuned-NER",
#     # evaluation_strategy = "epoch", ## Instead of focusing on loss and accuracy, we will focus on the F1 score
#     evaluation_strategy ='steps',
#     eval_steps = 7000,
#     save_total_limit = 3,
#     learning_rate=learning_rate,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=epochs,
#     weight_decay=0.001,
#     save_steps=35000,
#     metric_for_best_model = 'f1',
#     load_best_model_at_end=True,
#     report_to=['none'], # REQUIRED because otherwise keeps asking to log into "wandb"
# )

# trainer = Trainer(
#     model,
#     args,
#     train_dataset=tokenised_train,
#     eval_dataset=tokenised_val,
#     data_collator = data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
# )

model_output_dir:str = "distilBERT-finetuned-NER"

training_args = TrainingArguments(
    output_dir=model_output_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to=['none'] # REQUIRED because otherwise keeps asking to log into "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [78]:
trainer.train()

  0%|          | 0/3350 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (542) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
# Prepare the test data for evaluation in the same format as the training data

predictions, labels, _ = trainer.predict(tokenised_test)
predictions = np.argmax(predictions, axis=2)

# Remove the predictions for the [CLS] and [SEP] tokens 
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Compute multiple metrics on the test restuls
results = metric.compute(predictions=true_predictions, references=true_labels)
results

  0%|          | 0/10 [00:00<?, ?it/s]



{'0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 2, 3, 3, 1, 0, 1, 0, 0, 0, 0, 0, 0]': {'precision': 0.6310679611650486,
  'recall': 0.7303370786516854,
  'f1': 0.6770833333333335,
  'number': 267},
 '0, 0, 0, 0, 2, 3, 3, 3, 3, 0, 1, 0, 0, 0, 0]': {'precision': 0.5272727272727272,
  'recall': 0.5951492537313433,
  'f1': 0.5591586327782646,
  'number': 536},
 '0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0]': {'precision': 0.34234234234234234,
  'recall': 0.2550335570469799,
  'f1': 0.2923076923076923,
  'number': 149},
 '1, 0, 2, 3, 3, 0]': {'precision': 0.2532467532467532,
  'recall': 0.3023255813953488,
  'f1': 0.2756183745583039,
  'number': 129},
 'overall_precision': 0.5012722646310432,
 'overall_recall': 0.5467160037002775,
 'overall_f1': 0.523008849557522,
 'overall_accuracy': 0.8998646820027063}

In [None]:
text = "For this purpose the Gothenburg Young Persons Empowerment Scale (GYPES) was developed."
# model = AutoTokenizer.from_pretrained("./distilBERT-finetuned-NER/checkpoint-26/")
import os
from transformers import pipeline

checkpoint_list:list[str] = os.listdir(model_output_dir)
last_checkpoint:str = checkpoint_list[-1:][0]
last_checkpoint_path:str = os.path.join(model_output_dir, last_checkpoint)
classifier = pipeline("ner", model=last_checkpoint_path)
result:list[dict] = classifier(text)
for r in result:
    entity:str = r["entity"] 
    r["entity"] = id2label[int(entity.replace("LABEL_", ""))]
result



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{0: 'B-O', 1: 'B-AC', 2: 'B-LF', 3: 'I-LF'}


[{'entity': 'B-O',
  'score': 0.9985917,
  'index': 1,
  'word': 'for',
  'start': 0,
  'end': 3},
 {'entity': 'B-O',
  'score': 0.9988061,
  'index': 2,
  'word': 'this',
  'start': 4,
  'end': 8},
 {'entity': 'B-O',
  'score': 0.99888355,
  'index': 3,
  'word': 'purpose',
  'start': 9,
  'end': 16},
 {'entity': 'B-O',
  'score': 0.99921954,
  'index': 4,
  'word': 'the',
  'start': 17,
  'end': 20},
 {'entity': 'B-LF',
  'score': 0.9956285,
  'index': 5,
  'word': 'gothenburg',
  'start': 21,
  'end': 31},
 {'entity': 'I-LF',
  'score': 0.99947685,
  'index': 6,
  'word': 'young',
  'start': 32,
  'end': 37},
 {'entity': 'I-LF',
  'score': 0.9995701,
  'index': 7,
  'word': 'persons',
  'start': 38,
  'end': 45},
 {'entity': 'I-LF',
  'score': 0.9996407,
  'index': 8,
  'word': 'empowerment',
  'start': 46,
  'end': 57},
 {'entity': 'I-LF',
  'score': 0.9995134,
  'index': 9,
  'word': 'scale',
  'start': 58,
  'end': 63},
 {'entity': 'B-O',
  'score': 0.99941266,
  'index': 10,
  '