# Start Variables

In [None]:
retrain_model = True

In [1]:
from datasets import load_dataset, DatasetDict, Dataset # Import dataset import function for hugging face
dataset_dict:DatasetDict = load_dataset("surrey-nlp/PLOD-CW") # import the coursework dataset from

In [2]:
 
train_dict = dataset_dict["train"]
test_dict = dataset_dict["test"]
validation_dict = dataset_dict["validation"]

train_tokens = [row["tokens"] for row in train_dict]
train_pos_tags = [row["ner_tags"] for row in train_dict]
train_ner_tags = [row["ner_tags"] for row in train_dict]

validation_tokens = [row["tokens"] for row in validation_dict]
validation_pos_tags = [row["ner_tags"] for row in validation_dict]
validation_ner_tags = [row["ner_tags"] for row in validation_dict]

test_tokens = [row["tokens"] for row in test_dict]
test_pos_tags = [row["ner_tags"] for row in test_dict]
test_ner_tags = [row["ner_tags"] for row in test_dict]

# Train

In [3]:
def flatten_list(given_list:list[list[any]]) -> list[any]:
    return [element for inner_list in given_list for element in inner_list]

def data_to_lower(data:list[list[str]]) -> list[list[str]]:
    return [[token.lower() for token in tokens] for tokens in data]

train_tokens = data_to_lower(train_tokens)
validation_tokens = data_to_lower(validation_tokens)
test_tokens = data_to_lower(test_tokens)

class DataItem:
    def __init__(self, tokens, pos, ner, idx=0):
        self.idx=idx
        self.tokens:list[str] = tokens
        self.pos:list[str] = pos
        self.ner:list = ner

class DataCollection:
    def __init__(self, data_collection:list[DataItem], max_token_length=512):
        self.max_token_length = max_token_length
        self.data_collection:list[DataItem] = data_collection
        self.unique_tags = self.get_unique_tags()
        self.item_embeddings:dict = self.create_item_embeddings(self.unique_tags)
        self.reverse_embeddings:dict = {v:k for k,v in self.item_embeddings.items()}

    def get_token_list(self) -> list[list[str]]:
        return [data_item.tokens for data_item in self.data_collection]

    def get_pos_list(self) -> list[list[str]]:
        return [data_item.pos for data_item in self.data_collection]

    def get_ner_list(self) -> list[list[str]]:
        return [data_item.ner for data_item in self.data_collection]
    
    def get_ner_idx_list(self) -> list[list[str]]:
        ner_idx_list_collection = []
        for data_item in self.data_collection:
            ner_idx_list = []
            for ner_tag in data_item.ner:
                ner_idx_list.append(self.item_embeddings[ner_tag])
            ner_idx_list_collection.append(ner_idx_list)
        return ner_idx_list_collection

    
    def get_unique_tags(self) -> list[str]:
        return list(set(flatten_list(self.get_ner_list())))
    
    def create_item_embeddings(self, tags:list[str]) -> dict:
        return {label:idx for idx, label in enumerate(tags)}
    
    def get_tag_count(self) -> dict:
        tag_dict = {}
        for tag_list in self.get_ner_list():
            for tag in tag_list:
                if tag not in tag_dict.keys():
                    tag_dict[tag] = 1
                else:
                    tag_dict[tag] += 1
        return tag_dict

def dataset_to_collection(dataset) -> DataCollection:
    data_items:list[DataItem] = []
    for idx in range(len(dataset)):
        data_items.append(DataItem(dataset["tokens"][idx], dataset["pos_tags"][idx], dataset["ner_tags"][idx], idx))
    return DataCollection(data_items)

train_data:list[DataItem] = []
for idx in range(len(train_tokens)):
    train_data.append(DataItem(train_tokens[idx], train_pos_tags[idx], train_ner_tags[idx], idx))
train_collection:DataCollection = DataCollection(train_data)

validation_data:list[DataItem] = []
for idx in range(len(validation_tokens)):
    validation_data.append(DataItem(validation_tokens[idx], validation_pos_tags[idx], validation_ner_tags[idx], idx))
validation_collection:DataCollection = DataCollection(validation_data)

test_data:list[DataItem] = []
for idx in range(len(test_tokens)):
    test_data.append(DataItem(test_tokens[idx], test_pos_tags[idx], test_ner_tags[idx], idx))
test_collection:DataCollection = DataCollection(test_data)

In [6]:
# python
import os
import pandas as pd
import numpy as np
import string  

# spacy
import spacy
from spacy.tokens import Span
from spacy.language import Language
from spacy.tokens import Doc, DocBin
nlp:Language = spacy.blank("en")

# configuration
def create_dir(dir_path):
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

working_dir:str = os.getcwd()
vocab_dir:str = os.path.join(working_dir, "spacy_vocab")
create_dir(vocab_dir)

config_dir:str = os.path.join(working_dir, "config")
create_dir(config_dir)

output_dir:str = os.path.join(working_dir, "output")
create_dir(output_dir)

train_vocab_path = os.path.join(vocab_dir, "train.spacy")
dev_vocab_path = os.path.join(vocab_dir, "dev.spacy")
config_path = os.path.join(config_dir, "config.cfg")


In [5]:
train_docbin:DocBin = DocBin()
test_docbin:DocBin = DocBin()

def dataset_to_vocab(collection:DataCollection, doc_bin:DocBin) -> dict:
    for data_item in collection.data_collection:
        spaces = [True if token not in string.punctuation else False for token in data_item.tokens] 
        doc = Doc(nlp.vocab, words=data_item.tokens, spaces=spaces, ents=data_item.ner)
        doc_bin.add(doc)

dataset_to_vocab(train_collection, train_docbin)
dataset_to_vocab(test_collection, test_docbin)

train_docbin.to_disk(train_vocab_path)
test_docbin.to_disk(dev_vocab_path)

In [None]:
import time 
spacy_train_time = ""
if retrain_model:
    from spacy.cli.train import train
    training_start_time = time.time()
    train(config_path=config_path, output_path=output_dir, overrides={"paths.train": train_vocab_path, "paths.dev": dev_vocab_path}, use_gpu=0)
    spacy_train_time = '{:.2f}s'.format(time.time() - training_start_time)
print("Training finished with time: ", spacy_train_time)


[38;5;4mℹ Saving to output directory:
c:\Users\olive\OneDrive\Documents\Uni\TEST - FYP -
SpacyEntityFinder_V3\spacy\output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m


  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        1644.56   1143.95    6.87    5.69    8.66    0.07
 40     200       60542.23  64085.80   87.25   86.79   87.73    0.87
 80     400        1498.42   1508.59   86.49   86.33   86.64    0.86
120     600         235.75    258.09   85.20   85.20   85.20    0.85
160     800         605.29    468.50   86.64   86.64   86.64    0.87
200    1000         105.87    104.10   86.49   86.33   86.64    0.86
240    1200          34.14     29.90   86.13   85.97   86.28    0.86
280    1400          74.09     57.51   86.23   86.55   85.92    0.86
320    1600          43.12     33.95   87.00   87.00   87.00    0.87
360    1800          59.35     46.71   85.10   84.64   85.56    0.85
[38;5;2m✔ Saved pipeline to output director