## Load Dataset Dictionary

In [1]:
from datasets import load_dataset
# Load the "surrey-nlp/PLOD-CW" dataset dictionary; later cells index it by
# split name ("train", "validation", "test").
dataset = load_dataset("surrey-nlp/PLOD-CW")

## Parse Splits from the Dataset Dictionary

In [2]:
# Pull the three splits out of the dataset dictionary.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# For each split, extract the token, POS-tag and NER-tag columns.
train_tokens, train_pos, train_ner = (
    train_dataset[column] for column in ("tokens", "pos_tags", "ner_tags")
)

val_tokens, val_pos, val_ner = (
    val_dataset[column] for column in ("tokens", "pos_tags", "ner_tags")
)

test_tokens, test_pos, test_ner = (
    test_dataset[column] for column in ("tokens", "pos_tags", "ner_tags")
)

# Pipeline

### Labels

In [3]:
# Label names in id order: position in this list is the integer class id.
label_list = ["B-O", "B-AC", "B-LF", "I-LF"]
# Reverse lookup (label name -> integer id), derived from label_list.
label_encoding = {name: idx for idx, name in enumerate(label_list)}

def ner_to_idx(ner_list):
    """Convert nested label names into nested integer ids.

    Args:
        ner_list: iterable of rows, each row an iterable of label names that
            are keys of ``label_encoding``.

    Returns:
        A list of lists with the same nesting, holding the integer id of each
        label.

    Raises:
        KeyError: if a label is not present in ``label_encoding``.
    """
    encoded_rows = []
    for ner_row in ner_list:
        encoded_rows.append([label_encoding[tag] for tag in ner_row])
    return encoded_rows

# Encode the string NER tags of every split as integer ids.
train_ner_idx, val_ner_idx, test_ner_idx = map(
    ner_to_idx, (train_ner, val_ner, test_ner)
)

### Tokenizer

In [4]:
from transformers import AutoTokenizer
# Tokenizer for the "bert-base-uncased" checkpoint. It is used by
# tokenize_and_align_labels, passed to the data collator, and is the default
# `tokenizer` argument of get_trainer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

  _torch_pytree._register_pytree_node(


In [5]:
def tokenize_and_align_labels(dataset_tokens, dataset_ner, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level labels to sub-tokens.

    Args:
        dataset_tokens: batch of sentences, each already split into words
            (passed to the tokenizer with ``is_split_into_words=True``).
        dataset_ner: one label sequence per sentence, indexed by word
            position. In this notebook the callers pass the integer ids
            produced by ``ner_to_idx``.
        label_all_tokens: when True (default, the original behaviour) every
            sub-token of a word receives that word's label. When False only
            the first sub-token of each word is labelled and the remaining
            sub-tokens get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        label list per sentence, aligned with the produced sub-tokens.
        Positions that map to no word get ``-100``, the value that
        ``get_labels`` filters out before scoring.
    """
    # For some models, you may need to set max_length to approximately 500.
    tokenized_inputs = tokenizer(dataset_tokens, truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(dataset_ner):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position: mark it as ignored.
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                # First sub-token of a word, or every sub-token when
                # label_all_tokens is enabled.
                label_ids.append(word_labels[word_idx])
            else:
                # Continuation sub-token with label_all_tokens disabled.
                label_ids.append(-100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [6]:
def encoding_to_list(encoded):
    """Turn a column-wise encoding into a list of per-example dicts.

    Args:
        encoded: mapping that provides parallel sequences under the keys
            ``"input_ids"`` and ``"labels"``.

    Returns:
        A list with one ``{"input_ids": ..., "labels": ...}`` dict per
        example. Only these two fields are kept; anything else in
        ``encoded`` is not carried over.
    """
    return [
        {"input_ids": token_ids, "labels": label_ids}
        for label_ids, token_ids in zip(encoded["labels"], encoded["input_ids"])
    ]

### Model

In [7]:
from transformers import AutoModelForTokenClassification
# Token-classification model on top of "bert-base-uncased" with 4 output
# labels (label_list has 4 entries).
# NOTE(review): run_train_and_predict loads its own fresh model, so this
# module-level instance does not appear to be used by the pipeline cells
# visible here.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=4)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Data Collator

In [8]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer; it is the default `data_collator`
# argument of get_trainer.
# NOTE(review): encoding_to_list keeps only input_ids and labels, so the
# collator is presumably relied on to pad them and rebuild the attention
# mask per batch -- confirm.
data_collator = DataCollatorForTokenClassification(tokenizer)

### Metrics

In [9]:
from datasets import load_metric

In [10]:
import numpy as np

# seqeval metric object; get_metrics calls metric.compute(...) on it.
# NOTE(review): the recorded output of this cell shows a warning that
# `trust_remote_code=True` will become mandatory for loading this metric in
# the next major `datasets` release.
metric = load_metric("seqeval")

def get_labels(p):
    """Convert raw model scores and label ids into label-name sequences.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` is arg-maxed over
            axis 2 to obtain one class id per position; ``labels`` holds the
            reference class ids, with ``-100`` at positions to ignore.

    Returns:
        ``(true_predictions, true_labels)``: two parallel lists of lists of
        label names taken from ``label_list``, with every position whose
        reference id is ``-100`` removed from both.
    """
    scores, reference_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Drop ignored positions (reference id -100) in both sequences.
        kept_pairs = [
            (predicted, reference)
            for predicted, reference in zip(predicted_row, reference_row)
            if reference != -100
        ]
        true_predictions.append([label_list[predicted] for predicted, _ in kept_pairs])
        true_labels.append([label_list[reference] for _, reference in kept_pairs])

    return true_predictions, true_labels

def get_metrics(true_predictions, true_labels):
    """Score predicted label sequences against the references.

    Args:
        true_predictions: predicted label-name sequences.
        true_labels: reference label-name sequences, parallel to
            ``true_predictions``.

    Returns:
        Dict with keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
        taken from the ``overall_*`` entries of ``metric.compute``.
    """
    scores = metric.compute(predictions=true_predictions, references=true_labels)
    # (reported key, key in the metric's result)
    key_pairs = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {reported: scores[source] for reported, source in key_pairs}

def compute_metrics(p):
    """Metric callback: decode ``p`` with get_labels, then score with get_metrics.

    Args:
        p: pair ``(predictions, labels)`` as accepted by ``get_labels``.

    Returns:
        The metrics dict produced by ``get_metrics``.
    """
    return get_metrics(*get_labels(p))
    

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


### Training

In [11]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

In [16]:
def get_training_args(
    output_dir="bert/bert-base-uncased",
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    learning_rate=2e-5,
    batch_size=16,
    num_train_epochs=1,
    weight_decay=0.001,
    metric_for_best_model='f1',
    load_best_model_at_end=True
) -> TrainingArguments:
    """Build the ``TrainingArguments`` used by this notebook.

    Every parameter is forwarded to the ``TrainingArguments`` field of the
    same name, except ``batch_size``, which sets both the per-device train
    and the per-device eval batch size. Reporting is always set to
    ``["none"]``.

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    settings = dict(
        output_dir=output_dir,
        learning_rate=learning_rate,
        # One value drives both the training and the evaluation batch size.
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # Number of epochs to train.
        num_train_epochs=num_train_epochs,
        # Weight decay to apply (if not zero) to all layers except bias and
        # LayerNorm weights.
        weight_decay=weight_decay,
        evaluation_strategy=evaluation_strategy,
        save_strategy=save_strategy,
        save_total_limit=save_total_limit,
        metric_for_best_model=metric_for_best_model,
        load_best_model_at_end=load_best_model_at_end,
        report_to=["none"],
    )
    return TrainingArguments(**settings)

def get_trainer(
    model,
    training_args,
    train_data,
    eval_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    comp_metrics=compute_metrics
) -> Trainer:
    """Assemble a ``Trainer`` with early stopping enabled.

    Args:
        model: model to train.
        training_args: ``TrainingArguments`` for the run.
        train_data: training examples.
        eval_data: evaluation examples.
        data_collator: batch collator; defaults to the notebook-level
            ``data_collator`` captured when this function is defined.
        tokenizer: tokenizer handed to the trainer; defaults to the
            notebook-level ``tokenizer``.
        comp_metrics: metric callback; defaults to ``compute_metrics``.

    Returns:
        A ``Trainer`` carrying an ``EarlyStoppingCallback`` with a patience
        of 3.
    """
    early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
    trainer_kwargs = dict(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=comp_metrics,
        callbacks=[early_stopping],
    )
    return Trainer(**trainer_kwargs)

### Labels to Data Frame

In [23]:
import pandas as pd

def labels_to_df(predicted_labels, true_labels):
    """Count (true label, predicted label) pairs into a square table.

    Args:
        predicted_labels: predicted label-name sequences.
        true_labels: reference label-name sequences, parallel to
            ``predicted_labels``.

    Returns:
        A DataFrame indexed by true label (rows) and predicted label
        (columns), both in ``label_list`` order, holding the pair counts.
    """
    # Start from an all-zero table so every label appears even if unseen.
    confusion = pd.DataFrame(0, index=label_list, columns=label_list)

    for reference_seq, predicted_seq in zip(true_labels, predicted_labels):
        for reference, predicted in zip(reference_seq, predicted_seq):
            confusion.at[reference, predicted] = confusion.at[reference, predicted] + 1
    return confusion

### Training and Prediction Pipeline

In [27]:

def run_train_and_predict(
    train_tokens,
    train_ner,
    val_tokens,
    val_ner,
    test_tokens,
    test_ner,
    training_args=None,
) -> tuple:
    """Fine-tune a fresh model and return raw predictions for the test data.

    Args:
        train_tokens, val_tokens, test_tokens: pre-split sentences for each
            split, as accepted by ``tokenize_and_align_labels``.
        train_ner, val_ner, test_ner: matching label sequences; this
            notebook passes the integer ids produced by ``ner_to_idx``.
        training_args: optional ``TrainingArguments``. When None, the
            defaults from ``get_training_args()`` are used.

    Returns:
        A pair ``(predictions, label_ids)``: the first two fields of the
        result of ``trainer.predict`` on the tokenised test data. Pass both
        to ``predict_to_dataframe`` (or ``get_labels``) to decode them.
    """
    tokenised_train = encoding_to_list(tokenize_and_align_labels(train_tokens, train_ner))
    tokenised_val = encoding_to_list(tokenize_and_align_labels(val_tokens, val_ner))
    tokenised_test = encoding_to_list(tokenize_and_align_labels(test_tokens, test_ner))

    if training_args is None:
        training_args = get_training_args()

    # Load a fresh model on every call so repeated runs do not continue from
    # previously fine-tuned weights; the head size follows label_list.
    model = AutoModelForTokenClassification.from_pretrained(
        "bert-base-uncased", num_labels=len(label_list)
    )

    trainer = get_trainer(
        model=model,
        training_args=training_args,
        train_data=tokenised_train,
        eval_data=tokenised_val,
    )
    trainer.train()

    # The third field of the prediction result is not needed here.
    predictions, label_ids, _ = trainer.predict(tokenised_test)
    return predictions, label_ids


def predict_to_dataframe(p, l):
    """Decode raw predictions, print the metrics and the label-count table.

    Args:
        p: raw prediction scores, as accepted by ``get_labels``.
        l: reference label ids, as accepted by ``get_labels``.

    Returns:
        The table built by ``labels_to_df`` (rows: true labels, columns:
        predicted labels).
    """
    # get_labels returns (predicted label names, true label names); both
    # helpers below take their arguments in that same order.
    decoded = get_labels((p, l))
    metrics = get_metrics(*decoded)
    df = labels_to_df(*decoded)

    print("Metrics:\n", metrics)
    print("Dataframe:\n", df)

    return df

In [28]:
# Train on the train split, evaluate on the validation split, then predict
# on the test split using the integer-encoded NER tags.
p, l = run_train_and_predict(
    train_tokens=train_tokens,
    train_ner=train_ner_idx,
    val_tokens=val_tokens,
    val_ner=val_ner_idx,
    test_tokens=test_tokens,
    test_ner=test_ner_idx,
)

# Print the metrics and keep the label-count table for display.
DataFrame = predict_to_dataframe(p, l)
DataFrame


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/67 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

Checkpoint destination directory bert/bert-base-uncased\checkpoint-67 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.30527928471565247, 'eval_precision': 0.898586456278764, 'eval_recall': 0.9016988289625598, 'eval_f1': 0.9001399522515847, 'eval_accuracy': 0.8857840891739197, 'eval_runtime': 0.3435, 'eval_samples_per_second': 366.776, 'eval_steps_per_second': 23.287, 'epoch': 1.0}
{'train_runtime': 8.4913, 'train_samples_per_second': 126.247, 'train_steps_per_second': 7.89, 'train_loss': 0.5440893030878323, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

Metrics:
 {'precision': 0.9004329004329005, 'recall': 0.899135446685879, 'f1': 0.8997837058399424, 'accuracy': 0.8848293489700797}
Dataframe:
        B-O  B-AC  B-LF  I-LF
B-O   5130   175     6    86
B-AC   134   410     0     3
B-LF    83     6    35   176
I-LF    89     7     1   310


Unnamed: 0,B-O,B-AC,B-LF,I-LF
B-O,5130,175,6,86
B-AC,134,410,0,3
B-LF,83,6,35,176
I-LF,89,7,1,310
