## parse dataset

In [2]:
from datasets import load_dataset # Import dataset import function for hugging face
dataset = load_dataset("surrey-nlp/PLOD-CW") # import the coursework dataset from

In [3]:
train_dict = dataset["train"][0:200]
train_tokens = train_dict["tokens"]
train_pos_tags = train_dict["pos_tags"]
train_ner_tags = train_dict["ner_tags"]

validation_dict = dataset["validation"]
validation_tokens = validation_dict["tokens"]
validation_pos_tags = validation_dict["pos_tags"]
validation_ner_tags = validation_dict["ner_tags"]

test_dict = dataset["test"]
test_tokens = test_dict["tokens"]
test_pos_tags = test_dict["pos_tags"]
test_ner_tags = test_dict["ner_tags"]

def data_to_lower(data:list[list[str]]) -> list[list[str]]:
    return [[token.lower() for token in tokens] for tokens in data]


train_tokens = data_to_lower(train_tokens)
validation_tokens = data_to_lower(validation_tokens)
test_tokens = data_to_lower(test_tokens)

class DataItem:
    def __init__(self, tokens, pos, ner):
        self.tokens:list[str] = tokens
        self.pos:list[str] = pos
        self.ner:list = ner

    def get_as_tuple(self) -> tuple:
        return (self.tokens, self.pos, self.ner)
    
    def get_as_tuple_list(self) -> list[tuple]:
        tuple_list = []
        for idx in range(len(self.tokens)-1):
            tuple_list.append((self.tokens[idx], self.pos[idx], self.ner[idx]))
        return tuple_list
    
    def ner_label2idx(self, label2idx_dict):
        if not isinstance(self.ner[0], str):
            print("WARNING - NER not listed as labels! NER Type: ",type(self.ner[0]),", Exiting...")
            return
        for idx, ner in enumerate(self.ner):
            ner[idx] = label2idx_dict[ner]
    
    def ner_idx2label(self, idx2label_dict):
        if not isinstance(self.ner[0], int):
            print("WARNING - NER not listed as indecies! Exiting...")
            return
        for idx, ner in enumerate(self.ner):
            ner[idx] = idx2label_dict[ner]

class DataCollection:
    def __init__(self, data_collection:list[DataItem]):
        self.data_collection:list[DataItem] = data_collection
        self.unique_tags = self.get_unique_tags()
        self.item_embeddings:dict = self.create_item_embeddings(self.unique_tags)
        self.reverse_embeddings:dict = {v:k for k,v in self.item_embeddings.items()}

    def get_token_list(self) -> list[list[str]]:
        return [data_item.tokens for data_item in self.data_collection]

    def get_pos_list(self) -> list[list[str]]:
        return [data_item.pos for data_item in self.data_collection]

    def get_ner_list(self) -> list[list[str]]:
        return [data_item.ner for data_item in self.data_collection]
    
    def get_ner_idx_list(self) -> list[list[str]]:
        ner_idx_list_collection = []
        for data_item in self.data_collection:
            ner_idx_list = []
            for ner_tag in data_item.ner:
                ner_idx_list.append(self.item_embeddings[ner_tag])
            ner_idx_list_collection.append(ner_idx_list)
        return ner_idx_list_collection

    def get_unique_tags(self) -> list[str]:
        unique_list = []
        ner_tags_list:list = self.get_ner_list()
        for ner_list in ner_tags_list:
            for ner in ner_list:
                if ner not in unique_list:
                    unique_list.append(ner)
        return unique_list
    
    def create_item_embeddings(self, tags:list[str]) -> dict:
        return {label:idx for idx, label in enumerate(tags)}

train_data:list[DataItem] = []
for idx in range(len(train_tokens)):
    train_data.append(DataItem(train_tokens[idx], train_pos_tags[idx], train_ner_tags[idx]))
train_collection:DataCollection = DataCollection(train_data)

validation_data:list[DataItem] = []
for idx in range(len(validation_tokens)):
    train_data.append(DataItem(validation_tokens[idx], validation_pos_tags[idx], validation_ner_tags[idx]))
validation_collection:DataCollection = DataCollection(validation_data)

test_data:list[DataItem] = []
for idx in range(len(test_tokens)):
    train_data.append(DataItem(test_tokens[idx], test_pos_tags[idx], test_ner_tags[idx]))
test_collection:DataCollection = DataCollection(test_data)

In [4]:
# Following https://huggingface.co/docs/transformers/main/en/tasks/token_classification

## create label2id functionality

In [5]:
def print_lists_side_by_side(a_list, b_list):
    for i in range(max(len(b_list), len(a_list))):
        try:
            print(f"{a_list[i]}\t", end="")
        except IndexError:
            print(f" \t", end="")
        try:
            print(f"{b_list[i]}\t")
        except IndexError:
            print(f" \t")

def get_unique_labels(data:list[DataItem]) -> list[str]:
    return list(set([ner for item in data for ner in item.ner]))
    # return [
    #     ner # returned item to list
    #     for item in data # original list to item
    #     for ner in item.ner # inner list to out list
    # ]

unique_labels = get_unique_labels(train_data)

def create_label_index(labels:list) -> dict:
    return {label:idx for idx, label in enumerate(labels)}

label_index = create_label_index(unique_labels)
label_index

def labels2ids(labels:list[str], label_index:dict) -> list[int]:
    id_list:list[int] = []
    for label in labels:
        if label in label_index.keys():
            id_list.append(label_index[label])
        else:
            id_list.append(None)
    return id_list

def ids2labels(ids:list[int], label_index:dict) -> list[str]:
    label_list:list[int] = []
    for id in ids:
        label_list.append(list(label_index.keys())[list(label_index.values()).index(id)])
    return label_list

print(label_index)
ner_tags_test = train_data[0].ner
ner_to_ids_test = labels2ids(train_data[0].ner, label_index)
print_lists_side_by_side(ner_tags_test, ner_to_ids_test)
ids_to_ner_test = ids2labels(ner_to_ids_test, label_index)
ids_to_ner_test

{'B-LF': 0, 'I-LF': 1, 'B-AC': 2, 'B-O': 3}
B-O	3	
B-O	3	
B-O	3	
B-O	3	
B-LF	0	
I-LF	1	
I-LF	1	
I-LF	1	
I-LF	1	
B-O	3	
B-AC	2	
B-O	3	
B-O	3	
B-O	3	
B-O	3	


['B-O',
 'B-O',
 'B-O',
 'B-O',
 'B-LF',
 'I-LF',
 'I-LF',
 'I-LF',
 'I-LF',
 'B-O',
 'B-AC',
 'B-O',
 'B-O',
 'B-O',
 'B-O']

## Import DistilBERT tokenizer

In [6]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

  _torch_pytree._register_pytree_node(


## Tokenization Example

Adds extra start and end tags (CLS and SEP), as well potentially splits one word into 2. Thus have to realign indecies.

We also have to assign -100 to CLS and SEP so they are ignored by PyTorch loss function (CrossEntropyLoss)

Only label first token of a word, add -100 for subtokens of the same word

In [7]:
example = dataset["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'for',
 'this',
 'purpose',
 'the',
 'gothenburg',
 'young',
 'persons',
 'empowerment',
 'scale',
 '(',
 'g',
 '##ype',
 '##s',
 ')',
 'was',
 'developed',
 '.',
 '[SEP]']

In [8]:
def tokenize_and_align_labels(data_collection:DataCollection):
    tokenized_inputs = tokenizer(data_collection.get_token_list(), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(data_collection.get_ner_idx_list()):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_train = tokenize_and_align_labels(train_collection)
tokenized_validation = tokenize_and_align_labels(validation_collection)
tokenized_test = tokenize_and_align_labels(test_collection)

# def tokenize_and_align_labels(data_dict:dict):
#     print(data_dict["ner_tags"])
#     tokenized_inputs = tokenizer(data_dict["tokens"], truncation=True, is_split_into_words=True)

#     labels = []
#     for i, label in enumerate(data_dict[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

# tokenized_train = train_dict.map(tokenize_and_align_labels, batched=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
import evaluate
seqeval = evaluate.load("seqeval")

  _torch_pytree._register_pytree_node(


In [10]:
import numpy as np

label_list = train_collection.unique_tags
labels = train_collection.unique_tags


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

id2label = train_collection.reverse_embeddings
label2id = train_collection.item_embeddings

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(labels), id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def turn_dict_to_list_of_dict(d):
    new_list = []

    for labels, inputs in zip(d["labels"], d["input_ids"]):
        entry = {"input_ids": inputs, "labels": labels}
        new_list.append(entry)

    return new_list

tokenised_train = turn_dict_to_list_of_dict(tokenized_train)
tokenised_val = turn_dict_to_list_of_dict(tokenized_validation)
tokenised_test = turn_dict_to_list_of_dict(tokenized_test)

In [13]:
training_args = TrainingArguments(
    output_dir="distilbert_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=['none'], # REQUIRED because otherwise keeps asking to log into "wandb"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/60 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


AttributeError: 'NoneType' object has no attribute 'get'