# Start Variables

In [25]:
import pandas as pd
import numpy as np

In [3]:
from datasets import load_dataset, DatasetDict, Dataset # Hugging Face `datasets` loader and container types
# Load the coursework dataset "surrey-nlp/PLOD-CW" with `load_dataset`.
# The result is indexed by split name ("train", "test", "validation") in the next cell.
dataset_dict:DatasetDict = load_dataset("surrey-nlp/PLOD-CW")

# Use JSON Data

In [4]:
# Pull the three splits out of the loaded dataset.
train_dict = dataset_dict["train"]
test_dict = dataset_dict["test"]
validation_dict = dataset_dict["validation"]

# For every split, collect the per-row token, POS-tag and NER-tag lists.
# The POS lists are read from the "pos_tags" column; they used to be read from
# "ner_tags", which silently made them a copy of the NER labels.
train_tokens = [row["tokens"] for row in train_dict]
train_pos_tags = [row["pos_tags"] for row in train_dict]
train_ner_tags = [row["ner_tags"] for row in train_dict]

validation_tokens = [row["tokens"] for row in validation_dict]
validation_pos_tags = [row["pos_tags"] for row in validation_dict]
validation_ner_tags = [row["ner_tags"] for row in validation_dict]

test_tokens = [row["tokens"] for row in test_dict]
test_pos_tags = [row["pos_tags"] for row in test_dict]
test_ner_tags = [row["ner_tags"] for row in test_dict]

# Parse Data

## List Manipulation

In [5]:

# Lower-case every token of every row; the row structure is kept as-is.
def data_to_lower(data:list[list[str]]) -> list[list[str]]:
    """Return a new list of rows in which each token has been lower-cased.

    Args:
        data: rows of tokens.

    Returns:
        A new list with one new inner list per input row.
    """
    lowered_rows = []
    for row in data:
        lowered_rows.append([word.lower() for word in row])
    return lowered_rows

# Collapse a list of rows into one flat list, keeping the element order.
def flatten_list(given_list:list[list[any]]) -> list[any]:
    """Concatenate all inner lists of `given_list` into a single new list.

    Args:
        given_list: a list whose items are themselves iterable rows.

    Returns:
        One flat list holding the elements of every row, in row order.
    """
    flat = []
    for inner_list in given_list:
        flat.extend(inner_list)
    return flat

# Collect the distinct values found across all rows (also usable to list the classes).
def get_unique_tags(tag_list:list[list[str]]) -> list[str]:
    """Return every distinct tag that occurs in any row of `tag_list`.

    The result comes from a set, so its order is not defined.
    """
    unique_tags = set()
    for row in tag_list:
        unique_tags.update(row)
    return list(unique_tags)

In [6]:
# Lower-case the tokens of all three splits (train, validation, test).
train_tokens_lower, validation_tokens_lower, test_tokens_lower = (
    data_to_lower(split_tokens)
    for split_tokens in (train_tokens, validation_tokens, test_tokens)
)

## Data Items and Collections

In [7]:
# A DataRow is a single row of the dataset: its tokens together with the
# matching POS and NER tags, plus an optional row index (0 when none is given).
class DataRow:
    """One dataset row: tokens, POS tags, NER tags and the row's index."""

    def __init__(self, tokens, pos, ner, row_idx=0):
        # Per-token sequences, stored exactly as they were passed in.
        self.tokens:list[str] = tokens
        self.pos:list[str] = pos
        self.ner:list = ner
        # Index of the row; stays 0 when the caller does not provide one.
        self.idx:int = row_idx

# A DataCollection holds the rows of one split together with the NER label
# vocabulary and the label <-> integer-id lookups derived from that vocabulary.

class DataCollection:
    """Rows of one split plus the NER label vocabulary and id mappings.

    The label-related attributes start out empty and are filled in by
    `set_unique_ner_tags`.
    """

    def __init__(self, list_collection:list[DataRow], max_token_length=512):
        self.max_token_length:int = max_token_length # max token length (if we tokenize inputs)
        self.collection:list[DataRow] = list_collection # the rows of this collection
        self.unique_ner_tags:list[str] = []          # label vocabulary, in id order
        self.ner_label2idx:dict = {}                 # label -> integer id
        self.ner_idx2label:dict = {}                 # integer id -> label
        self.ner_as_idx:list[list[int]] = []         # per-row NER tags as integer ids

    # token lists, one per row
    def get_token_list(self) -> list[list[str]]:
        token_rows = []
        for data_item in self.collection:
            token_rows.append(data_item.tokens)
        return token_rows

    # POS-tag lists, one per row
    def get_pos_list(self) -> list[list[str]]:
        pos_rows = []
        for data_item in self.collection:
            pos_rows.append(data_item.pos)
        return pos_rows

    # NER-tag lists, one per row
    def get_ner_list(self) -> list[list[str]]:
        ner_rows = []
        for data_item in self.collection:
            ner_rows.append(data_item.ner)
        return ner_rows

    # translate each row's NER tags through `embeddings` (label -> integer id)
    def get_ner_embeddings_list(self, collection:list[DataRow], embeddings:dict) -> list[list[int]]:
        return [
            [embeddings[ner_tag] for ner_tag in data_item.ner]
            for data_item in collection
        ]

    def set_unique_ner_tags(self, tags:list[str]) -> None:
        """Store `tags` as the label vocabulary and rebuild every derived lookup."""
        self.unique_ner_tags = tags
        # Order matters: idx2label and ner_as_idx are both built from label2idx.
        self.__set_ner_label2idx__(self.unique_ner_tags)
        self.__set_ner_idx2label__(self.ner_label2idx)
        self.__set_ner_as_idx__(self.ner_label2idx)

    def __set_ner_label2idx__(self, tags:list[str]) -> None:
        # a label's id is its position in `tags`
        label_to_idx:dict = {}
        for idx, tag in enumerate(tags):
            label_to_idx[tag] = idx
        self.ner_label2idx:dict = label_to_idx

    def __set_ner_idx2label__(self, embeddings:dict) -> None:
        # invert the label -> id mapping
        idx_to_label:dict = {}
        for label, idx in embeddings.items():
            idx_to_label[idx] = label
        self.ner_idx2label:dict = idx_to_label

    def __set_ner_as_idx__(self, embeddings:dict) -> None:
        # cache the integer-id form of every row's NER tags
        self.ner_as_idx:list[list[int]] = self.get_ner_embeddings_list(self.collection, embeddings)

In [8]:
# Canonical NER label order shared by every collection.
# IMPORTANT - this order must not change: a label's position in this list
# becomes its integer id (DataCollection enumerates the list to build
# label2idx), and the trained model relies on that ordering.
tag_list = ["B-O", "B-AC", "B-LF", "I-LF"]

In [9]:
def data_to_collection(token_list:list[list[str]], pos_list:list[list[str]], ner_list:list[list[str]]) -> DataCollection:
    """Bundle parallel token / POS / NER rows into a DataCollection.

    Row `i` of each input list becomes one DataRow with index `i`. The
    resulting collection is given the shared module-level `tag_list` as its
    label vocabulary before it is returned.
    """
    data_items:list[DataRow] = [
        DataRow(tokens, pos_list[row_idx], ner_list[row_idx], row_idx)
        for row_idx, tokens in enumerate(token_list)
    ]
    collection = DataCollection(data_items)
    collection.set_unique_ner_tags(tag_list)
    return collection

# Build one DataCollection per split from the lower-cased tokens and the tag lists.
train_collection = data_to_collection(train_tokens_lower, train_pos_tags, train_ner_tags)
validation_collection = data_to_collection(validation_tokens_lower, validation_pos_tags, validation_ner_tags)
test_collection = data_to_collection(test_tokens_lower, test_pos_tags, test_ner_tags)

# Set all collections to have the same tag list, so every split shares the
# same label ids; this avoids metric issues down the line.
# data_to_collection already calls set_unique_ner_tags(tag_list), so these
# calls simply rebuild the same lookups.
train_collection.set_unique_ner_tags(tag_list)
validation_collection.set_unique_ner_tags(tag_list)
test_collection.set_unique_ner_tags(tag_list)

## BERT Tokenisation

In [10]:
from transformers import AutoTokenizer
from transformers import BatchEncoding
# Shared tokenizer loaded from the "distilbert/distilbert-base-uncased" checkpoint;
# the tokenisation, collator and trainer code below all use this instance.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

  _torch_pytree._register_pytree_node(


In [11]:
# Exploratory check: tokenise the first training row (passed as pre-split
# words) and map the resulting input ids back to their token strings.
tokenized_input:BatchEncoding = tokenizer(train_collection.get_token_list()[0], is_split_into_words=True)
tokenized_words = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

In [12]:
def tokenize_and_align_labels(data_collection:DataCollection) -> BatchEncoding:
    """Tokenise every row of a collection and align its NER label ids to the sub-word tokens.

    The rows are passed to the shared `tokenizer` as pre-split words and are
    truncated to `data_collection.max_token_length` (512 unless the collection
    was built with a different value).

    For each row, a label list of the same length as the tokenised row is built:
      * positions whose word id is None get -100,
      * the first token of each word gets that word's label id,
      * further tokens of the same word get -100.

    Args:
        data_collection: collection whose `ner_as_idx` has been populated
            (via `set_unique_ner_tags`).

    Returns:
        The tokenizer output with an extra "labels" entry.
    """
    tokenized_inputs = tokenizer(
        data_collection.get_token_list(),
        truncation=True,
        is_split_into_words=True,
        # honour the collection's own limit instead of a hard-coded 512
        max_length=data_collection.max_token_length,
    )

    labels = [] # one aligned label list per row

    for i, label in enumerate(data_collection.ner_as_idx): # `label` holds the row's NER tags as integer ids
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # word index of every token in row i
        previous_word_idx = None # word index of the previous token
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # no source word, or a continuation piece of the previous word: mask it
                label_ids.append(-100)
            else:
                # first token of a new word carries the word's label
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels # stored alongside the tokenizer's own outputs
    return tokenized_inputs

In [13]:
def batch_list(batch:BatchEncoding):
    """Turn a column-wise batch into a list with one dict per example.

    Each dict carries the example's "input_ids" and "labels"; any other
    entries of `batch` are not copied.
    """
    examples = []
    for labels, inputs in zip(batch["labels"], batch["input_ids"]):
        examples.append({"input_ids": inputs, "labels": labels})
    return examples

In [14]:
from datasets import load_metric
import evaluate
# seqeval scorer used by compute_metrics.
seqeval = evaluate.load("seqeval")
# NOTE(review): `metric` is not referenced by any code visible in this notebook,
# and this `load_metric` call is the line echoed in the warning printed under
# this cell - consider dropping it in favour of `evaluate.load` above.
metric = load_metric("seqeval")

def compute_metrics(p):
    """Score a batch of predictions with seqeval.

    Args:
        p: a (predictions, labels) pair; the class is chosen with an argmax
           over axis 2 of the predictions.

    Returns:
        dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by seqeval. Positions whose label is -100 are ignored.
    """
    raw_predictions, labels = p
    predicted_ids = np.argmax(raw_predictions, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept_predictions = []
        kept_labels = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            if label_id != -100: # -100 marks positions that must not be scored
                kept_predictions.append(tag_list[predicted_id])
                kept_labels.append(tag_list[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    summary = {}
    for key in ("precision", "recall", "f1", "accuracy"):
        summary[key] = results["overall_" + key]
    return summary


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [15]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
# Token-classification collator built from the shared tokenizer; get_trainer
# hands it to the Trainer to pad the tokens along with the labels.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def get_training_args(
        out_dir:str,
        learning_rate:float=2e-5,
        batch_size:int=16,
        epochs:int=2,
        weight_decay:float=0.01,
        evaluation_strategy:str="epoch",
        save_strategy:str="epoch",
        lr_scheduler_type="linear") -> TrainingArguments:
    """Build the TrainingArguments for one experiment.

    Args:
        out_dir: directory the checkpoints are written to.
        learning_rate: learning rate passed to the trainer.
        batch_size: used for both the train and the eval per-device batch size.
        epochs: number of training epochs.
        weight_decay: weight decay passed to the trainer.
        evaluation_strategy: when to evaluate (default: every epoch).
        save_strategy: when to save checkpoints (default: every epoch).
        lr_scheduler_type: learning-rate schedule name.

    Returns:
        A TrainingArguments instance with the settings above plus the fixed
        checkpoint / reporting options set below.
    """
    # settings the caller can tune
    tunable = {
        "output_dir": out_dir,
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "lr_scheduler_type": lr_scheduler_type,
        "num_train_epochs": epochs,
        "weight_decay": weight_decay,
        "evaluation_strategy": evaluation_strategy,
        "save_strategy": save_strategy,
    }
    # settings that are the same for every experiment
    fixed = {
        "save_total_limit": 1,           # keep a single checkpoint before overwriting
        "load_best_model_at_end": True,
        "report_to": ['none'],           # REQUIRED, otherwise it keeps asking to log into "wandb"
        "overwrite_output_dir": True,
    }
    return TrainingArguments(**tunable, **fixed)

def get_trainer(model, training_args:TrainingArguments, tokenised_train:list[dict], tokenised_eval:list[dict]) -> Trainer:
    """Assemble a Trainer from a model, its arguments and the tokenised datasets.

    The shared module-level `tokenizer`, `data_collator` and `compute_metrics`
    are always used; only the model, arguments and datasets vary per call.
    """
    trainer_kwargs = dict(
        model=model,                      # model to train / evaluate
        args=training_args,               # training configuration
        train_dataset=tokenised_train,    # tokenised training examples
        eval_dataset=tokenised_eval,      # tokenised examples evaluated during training
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    return Trainer(**trainer_kwargs)

# Accuracy Calculations

In [16]:
def get_truth_values(given_trainer:Trainer, validation_data_collection:DataCollection, validation_tokenised_list:list[dict], given_label_list):
    """Predict on a tokenised dataset and return flat predicted / true label lists.

    Args:
        given_trainer: trainer whose `predict` is called.
        validation_data_collection: accepted for interface symmetry; not read here.
        validation_tokenised_list: tokenised examples to predict on.
        given_label_list: indexable mapping from label id to label name.

    Returns:
        (true_predictions, true_labels): two flat, equally long lists of label
        names covering every position whose label is not -100.
    """
    raw_predictions, given_labels, _ = given_trainer.predict(validation_tokenised_list)
    predicted_ids = np.argmax(raw_predictions, axis=2)

    # keep only the (prediction, label) pairs that are meant to be scored
    scored_pairs = [
        (pred_val, label_val)
        for prediction, label in zip(predicted_ids, given_labels)
        for pred_val, label_val in zip(prediction, label)
        if label_val != -100
    ]

    true_predictions = [given_label_list[pred_val] for pred_val, _ in scored_pairs]
    true_labels = [given_label_list[label_val] for _, label_val in scored_pairs]
    return true_predictions, true_labels

In [18]:
def trained_model_to_dataframe(trained_model:Trainer, validation_data_collection:DataCollection, validation_tokenised_in:list[dict], label_list_in:list[str]) -> pd.DataFrame:
    """Build a confusion-matrix DataFrame from a trainer's predictions.

    Args:
        trained_model: trainer used to predict.
        validation_data_collection: forwarded to `get_truth_values`.
        validation_tokenised_in: tokenised examples to predict on.
        label_list_in: label names in id order; used both to decode ids and as
            the row / column labels of the result.

    Returns:
        A square DataFrame of counts: rows are true labels, columns are
        predicted labels.
    """
    truth_predictions, truth_labels = get_truth_values(trained_model, validation_data_collection, validation_tokenised_in, label_list_in)

    # start from an all-zero matrix so labels that never occur still get a row and a column
    df = pd.DataFrame(0, columns=label_list_in, index=label_list_in)

    for true_label, predict_label in zip(truth_labels, truth_predictions):
        df.at[true_label, predict_label] += 1 # count one (true, predicted) pair
    return df


In [19]:
def calc_precision(TP, FP) -> float:
    """Precision = TP / (TP + FP).

    Returns 0.0 when nothing was predicted for the label (TP + FP == 0)
    instead of dividing by zero.
    """
    predicted_positive = TP + FP
    if predicted_positive == 0:
        return 0.0
    return TP / predicted_positive

def calc_recall(TP, FN) -> float:
    """Recall = TP / (TP + FN).

    Returns 0.0 when the label has no true samples (TP + FN == 0)
    instead of dividing by zero.
    """
    actual_positive = TP + FN
    if actual_positive == 0:
        return 0.0
    return TP / actual_positive

def calc_f1_score(precision, recall) -> float:
    """F1 = harmonic mean of precision and recall.

    Returns 0.0 when both precision and recall are 0 instead of dividing by zero.
    """
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * ((precision * recall) / total)

def calculate_metrics(data_frame:pd.DataFrame) -> dict:
    """Compute per-label precision, recall and F1 from a confusion matrix.

    Args:
        data_frame: square confusion matrix whose index and columns carry the
            same labels; rows are true labels, columns are predicted labels.

    Returns:
        dict mapping each label to {"precision", "recall", "f1_score"}.
    """
    metric_dict:dict = {}
    df_labels = list(data_frame.index)
    for label in df_labels:
        # true positives sit on the diagonal
        TP = data_frame.at[label, label]

        # every label except the current one
        other_labels = df_labels.copy()
        other_labels.remove(label)

        # FN: rest of the label's row; FP: rest of the label's column.
        # (True negatives are not needed for precision / recall / F1.)
        FN = sum(data_frame.at[label, other] for other in other_labels)
        FP = sum(data_frame.at[other, label] for other in other_labels)

        # score for this label
        prec = calc_precision(TP, FP)
        rec = calc_recall(TP, FN)
        f1 = calc_f1_score(prec, rec)
        metric_dict[label] =  {
            "precision":prec,
            "recall":rec,
            "f1_score":f1
        }
    return metric_dict

In [20]:
from sklearn.metrics import ConfusionMatrixDisplay
from matplotlib import pyplot as plt

def plot_dataframe(dataframe:pd.DataFrame, normalised:bool=True) -> ConfusionMatrixDisplay:
    """Wrap a confusion-matrix DataFrame in a ConfusionMatrixDisplay.

    Args:
        dataframe: confusion matrix; its index supplies the display labels.
        normalised: when true, every row is divided by its row sum first.

    Returns:
        The display object (nothing is drawn here).
    """
    result_df = dataframe.div(dataframe.sum(axis=1), axis=0) if normalised else dataframe
    return ConfusionMatrixDisplay(
        confusion_matrix=result_df.to_numpy(),
        display_labels=list(result_df.index),
    )

def plot_and_metric(plot_df, normalise=True, name="") -> tuple[ConfusionMatrixDisplay, dict]:
    """Return the confusion-matrix display and the per-label metrics for `plot_df`.

    `name` is accepted but not used by this function.
    """
    # the display is built first, then the metrics from the un-normalised counts
    return plot_dataframe(plot_df, normalise), calculate_metrics(plot_df)

# Train Pipeline

## Pipeline

In [21]:
def combine_rows_columns(df):
    """Merge confusion-matrix labels that share the same suffix after the last '-'.

    For example "B-LF" and "I-LF" are both folded into "LF". Each cell of the
    result is the sum of all original cells whose row and column labels map to
    the merged row / column.

    Args:
        df: square confusion matrix; the labels in `df.columns` are also used
            to select rows, so index and columns must carry the same labels.

    Returns:
        A float DataFrame indexed (rows and columns) by the merged labels, in
        order of first appearance in `df.columns`, so the layout is the same
        on every run.
    """
    # original label -> merged label (a label without '-' maps to itself)
    merged_name = {label: label.split('-')[-1] for label in df.columns}

    # dict.fromkeys de-duplicates while keeping first-seen order (a set would not)
    merged_labels = list(dict.fromkeys(merged_name.values()))

    # merged label -> the original labels folded into it (computed once)
    members = {
        merged: [old for old, new in merged_name.items() if new == merged]
        for merged in merged_labels
    }

    new_df = pd.DataFrame(0.0, index=merged_labels, columns=merged_labels)
    for new_row in merged_labels:
        for new_col in merged_labels:
            new_df.loc[new_row, new_col] = df.loc[members[new_row], members[new_col]].values.sum()

    return new_df.astype(float)

In [27]:
import os
import shutil
import time

working_dir = os.getcwd()
def get_exp_checkpoint(exp_path:str):
    """Return the path of a checkpoint directory stored under an experiment folder.

    `exp_path` is resolved against `working_dir`. Only sub-directories are
    considered, and the first one in sorted (lexicographic) name order is
    returned, so the choice is the same on every call.

    NOTE(review): when more than one checkpoint directory is present this is
    not necessarily the best-scoring one - confirm which should be loaded.

    Raises:
        FileNotFoundError: if the experiment folder does not exist or holds
            no sub-directory.
    """
    exp_dir:str = os.path.join(working_dir, exp_path)
    checkpoint_names = sorted(
        name for name in os.listdir(exp_dir)
        if os.path.isdir(os.path.join(exp_dir, name))
    )
    if not checkpoint_names:
        raise FileNotFoundError(f"No checkpoint directory found in '{exp_dir}'")
    return os.path.join(exp_dir, checkpoint_names[0])

def delete_previous_checkpoints(exp_name:str) -> None:
    """Remove every checkpoint directory stored under an experiment folder.

    `exp_name` is resolved against `working_dir`. All sub-directories of the
    experiment folder are deleted so nothing from an earlier run survives a
    retrain; plain files are left alone. A missing or empty experiment folder
    is a no-op.
    """
    exp_dir:str = os.path.join(working_dir, exp_name)
    if not os.path.isdir(exp_dir):
        return
    for entry_name in os.listdir(exp_dir):
        entry_path:str = os.path.join(exp_dir, entry_name)
        if os.path.isdir(entry_path):
            shutil.rmtree(entry_path)

def run_train_and_plot_pipeline(
    train_data_collection:DataCollection,
    test_data_collection:DataCollection=test_collection,
    validation_data_collection:DataCollection=validation_collection,
    retrain_model:bool=True,
    train_model_checkpoint_name:str="distilbert-base-uncased", # if we re-train use this 
    exp_path:str="exp_test", # otherwise to load data, will use this
    epochs:int=2,
    training_args:TrainingArguments=None # override if wanted
) -> tuple[ConfusionMatrixDisplay, dict, str, dict]: 
    """Train (or reload) a token-classification model and summarise its predictions.

    Args:
        train_data_collection: training rows; also supplies the label
            vocabulary and the id <-> label mappings for the model.
        test_data_collection: rows handed to the Trainer as its eval dataset.
        validation_data_collection: rows the final confusion matrix is built from.
        retrain_model: when true, previous checkpoints under `exp_path` are
            deleted and a model is trained starting from
            `train_model_checkpoint_name`; otherwise a checkpoint stored under
            `exp_path` is loaded and no training happens.
        train_model_checkpoint_name: base checkpoint used when retraining.
        exp_path: experiment folder (checkpoint output / load location).
        epochs: number of epochs for the default training arguments.
        training_args: optional ready-made TrainingArguments; when given,
            `epochs` and `exp_path` are NOT applied to it.

    Returns:
        (confusion-matrix display, per-label metric dict, exp_path, training
        info dict with "retrained", "train_time" and "epochs").
    """
    label_list:list[str] = train_data_collection.unique_ner_tags
    id2label:dict = train_data_collection.ner_idx2label
    label2id:dict = train_data_collection.ner_label2idx

    # choose where the weights come from, then load the model once
    if retrain_model:
        delete_previous_checkpoints(exp_path)
        model_source = train_model_checkpoint_name
    else:
        model_source = get_exp_checkpoint(exp_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_source,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id)

    train_tokenised = batch_list(tokenize_and_align_labels(train_data_collection))
    test_tokenised = batch_list(tokenize_and_align_labels(test_data_collection))
    validation_tokenised = batch_list(tokenize_and_align_labels(validation_data_collection))

    if training_args is None:
        training_args = get_training_args(out_dir=exp_path, epochs=epochs)

    # NOTE(review): the *test* split is used for per-epoch evaluation (and so
    # for best-model selection when the default arguments are used), while the
    # *validation* split produces the final confusion matrix. That is the
    # reverse of the usual convention - confirm it is intended.
    trainer:Trainer = get_trainer(
        model,
        training_args,
        train_tokenised,
        test_tokenised)

    train_metric = {
        "retrained":retrain_model,
        "train_time":"",
        "epochs":epochs
    }
    if retrain_model:
        training_start_time = time.time()
        trainer.train()
        train_metric["train_time"] = '{:.2f}s'.format(time.time() - training_start_time)

    # confusion matrix over the full label set, then merged by label suffix
    pipeline_df:pd.DataFrame = trained_model_to_dataframe(trainer, validation_data_collection, validation_tokenised, label_list)
    pipeline_df = combine_rows_columns(pipeline_df)
    disp, metric_dict = plot_and_metric(pipeline_df, name=exp_path)

    return disp, metric_dict, exp_path, train_metric
    



## Training

In [28]:
# Train settings for the run below.
# epoch_to_train: number of epochs passed to the pipeline.
# retrain_model: True trains from the base checkpoint; False reloads a saved checkpoint.
# NOTE(review): in the saved output of this run, eval_loss is lowest at
# epoch 4 and rises afterwards while the training loss keeps falling, so 50
# epochs looks like more than this run needs - consider a smaller value.
epoch_to_train:int = 50
retrain_model = True 

In [29]:
# Run the full pipeline on the training collection; the returned tuple
# (display, per-label metrics, experiment path, training info) is the cell's output.
run_train_and_plot_pipeline(train_collection, retrain_model=retrain_model, exp_path="checkpoints/distilbert-uncased-best", epochs=epoch_to_train)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/3350 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.18310260772705078, 'eval_precision': 0.935179358086847, 'eval_recall': 0.9456936784047518, 'eval_f1': 0.9404071300495728, 'eval_accuracy': 0.9354, 'eval_runtime': 0.2425, 'eval_samples_per_second': 630.814, 'eval_steps_per_second': 41.23, 'epoch': 1.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.15890218317508698, 'eval_precision': 0.9503410059676044, 'eval_recall': 0.9459058124734833, 'eval_f1': 0.9481182224112269, 'eval_accuracy': 0.943, 'eval_runtime': 0.2027, 'eval_samples_per_second': 754.913, 'eval_steps_per_second': 49.341, 'epoch': 2.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.16473789513111115, 'eval_precision': 0.9523402436418038, 'eval_recall': 0.945269410267289, 'eval_f1': 0.9487916533588843, 'eval_accuracy': 0.9442, 'eval_runtime': 0.1907, 'eval_samples_per_second': 802.171, 'eval_steps_per_second': 52.429, 'epoch': 3.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.15695497393608093, 'eval_precision': 0.9525430942753778, 'eval_recall': 0.9495120916419177, 'eval_f1': 0.9510251779453947, 'eval_accuracy': 0.9464, 'eval_runtime': 0.2007, 'eval_samples_per_second': 762.291, 'eval_steps_per_second': 49.823, 'epoch': 4.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.1702721267938614, 'eval_precision': 0.953662182361734, 'eval_recall': 0.9473907509546033, 'eval_f1': 0.9505161221666489, 'eval_accuracy': 0.946, 'eval_runtime': 0.1964, 'eval_samples_per_second': 779.08, 'eval_steps_per_second': 50.92, 'epoch': 5.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.1780245155096054, 'eval_precision': 0.9536423841059603, 'eval_recall': 0.9469664828171405, 'eval_f1': 0.9502927088877062, 'eval_accuracy': 0.9458, 'eval_runtime': 0.203, 'eval_samples_per_second': 753.673, 'eval_steps_per_second': 49.26, 'epoch': 6.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.1950204223394394, 'eval_precision': 0.9557219251336898, 'eval_recall': 0.9478150190920662, 'eval_f1': 0.9517520502715944, 'eval_accuracy': 0.9478, 'eval_runtime': 0.2098, 'eval_samples_per_second': 729.221, 'eval_steps_per_second': 47.662, 'epoch': 7.0}
{'loss': 0.1584, 'learning_rate': 1.701492537313433e-05, 'epoch': 7.46}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.20544955134391785, 'eval_precision': 0.9533844189016603, 'eval_recall': 0.950148493848112, 'eval_f1': 0.9517637059073523, 'eval_accuracy': 0.9472, 'eval_runtime': 0.1907, 'eval_samples_per_second': 802.352, 'eval_steps_per_second': 52.441, 'epoch': 8.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2155715823173523, 'eval_precision': 0.9539766702014846, 'eval_recall': 0.9541790411540093, 'eval_f1': 0.9540778449464418, 'eval_accuracy': 0.9494, 'eval_runtime': 0.2033, 'eval_samples_per_second': 752.704, 'eval_steps_per_second': 49.196, 'epoch': 9.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.22702959179878235, 'eval_precision': 0.9495315161839863, 'eval_recall': 0.9459058124734833, 'eval_f1': 0.9477151965993624, 'eval_accuracy': 0.9436, 'eval_runtime': 0.2012, 'eval_samples_per_second': 760.352, 'eval_steps_per_second': 49.696, 'epoch': 10.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.23576220870018005, 'eval_precision': 0.9542832728049562, 'eval_recall': 0.9476028850233348, 'eval_f1': 0.9509313464608834, 'eval_accuracy': 0.9464, 'eval_runtime': 0.2, 'eval_samples_per_second': 765.034, 'eval_steps_per_second': 50.002, 'epoch': 11.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2503521144390106, 'eval_precision': 0.9530685920577617, 'eval_recall': 0.9520577004666949, 'eval_f1': 0.9525628780643106, 'eval_accuracy': 0.9478, 'eval_runtime': 0.2035, 'eval_samples_per_second': 751.785, 'eval_steps_per_second': 49.136, 'epoch': 12.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2583923935890198, 'eval_precision': 0.9537116040955631, 'eval_recall': 0.9484514212982605, 'eval_f1': 0.9510742395235057, 'eval_accuracy': 0.947, 'eval_runtime': 0.1955, 'eval_samples_per_second': 782.802, 'eval_steps_per_second': 51.164, 'epoch': 13.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2647983431816101, 'eval_precision': 0.9490701606086221, 'eval_recall': 0.9526941026728892, 'eval_f1': 0.9508786788058439, 'eval_accuracy': 0.9472, 'eval_runtime': 0.1962, 'eval_samples_per_second': 779.851, 'eval_steps_per_second': 50.971, 'epoch': 14.0}
{'loss': 0.0224, 'learning_rate': 1.4029850746268658e-05, 'epoch': 14.93}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.28858834505081177, 'eval_precision': 0.9521895493970806, 'eval_recall': 0.9548154433602036, 'eval_f1': 0.9535006884863891, 'eval_accuracy': 0.948, 'eval_runtime': 0.2108, 'eval_samples_per_second': 725.84, 'eval_steps_per_second': 47.441, 'epoch': 15.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2884122133255005, 'eval_precision': 0.9559855411439506, 'eval_recall': 0.9537547730165464, 'eval_f1': 0.9548688541998513, 'eval_accuracy': 0.9506, 'eval_runtime': 0.2015, 'eval_samples_per_second': 759.323, 'eval_steps_per_second': 49.629, 'epoch': 16.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3035992980003357, 'eval_precision': 0.9524114383269313, 'eval_recall': 0.946754348748409, 'eval_f1': 0.9495744680851064, 'eval_accuracy': 0.9448, 'eval_runtime': 0.2213, 'eval_samples_per_second': 691.37, 'eval_steps_per_second': 45.188, 'epoch': 17.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.2921501100063324, 'eval_precision': 0.9534141671984684, 'eval_recall': 0.9507848960543063, 'eval_f1': 0.952097716409984, 'eval_accuracy': 0.9476, 'eval_runtime': 0.2063, 'eval_samples_per_second': 741.682, 'eval_steps_per_second': 48.476, 'epoch': 18.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.31535637378692627, 'eval_precision': 0.9549530315969257, 'eval_recall': 0.9488756894357234, 'eval_f1': 0.9519046605660779, 'eval_accuracy': 0.9472, 'eval_runtime': 0.2084, 'eval_samples_per_second': 734.189, 'eval_steps_per_second': 47.986, 'epoch': 19.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3079887628555298, 'eval_precision': 0.953656462585034, 'eval_recall': 0.951633432329232, 'eval_f1': 0.95264387343385, 'eval_accuracy': 0.9478, 'eval_runtime': 0.2151, 'eval_samples_per_second': 711.329, 'eval_steps_per_second': 46.492, 'epoch': 20.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3035695254802704, 'eval_precision': 0.9540523292916401, 'eval_recall': 0.9514212982605006, 'eval_f1': 0.9527349973446627, 'eval_accuracy': 0.948, 'eval_runtime': 0.2008, 'eval_samples_per_second': 761.958, 'eval_steps_per_second': 49.801, 'epoch': 21.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.311545729637146, 'eval_precision': 0.9541011474713132, 'eval_recall': 0.9524819686041578, 'eval_f1': 0.9532908704883228, 'eval_accuracy': 0.9484, 'eval_runtime': 0.2103, 'eval_samples_per_second': 727.431, 'eval_steps_per_second': 47.544, 'epoch': 22.0}
{'loss': 0.0059, 'learning_rate': 1.1044776119402986e-05, 'epoch': 22.39}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3153226375579834, 'eval_precision': 0.9542261017670853, 'eval_recall': 0.9507848960543063, 'eval_f1': 0.952502390819254, 'eval_accuracy': 0.9478, 'eval_runtime': 0.212, 'eval_samples_per_second': 721.563, 'eval_steps_per_second': 47.161, 'epoch': 23.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.313147634267807, 'eval_precision': 0.9553571428571429, 'eval_recall': 0.9533305048790836, 'eval_f1': 0.9543427479294967, 'eval_accuracy': 0.9496, 'eval_runtime': 0.2316, 'eval_samples_per_second': 660.725, 'eval_steps_per_second': 43.185, 'epoch': 24.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.31679144501686096, 'eval_precision': 0.9549032120825356, 'eval_recall': 0.9522698345354264, 'eval_f1': 0.9535847052575677, 'eval_accuracy': 0.9484, 'eval_runtime': 0.2155, 'eval_samples_per_second': 710.064, 'eval_steps_per_second': 46.409, 'epoch': 25.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3251962959766388, 'eval_precision': 0.956503198294243, 'eval_recall': 0.951633432329232, 'eval_f1': 0.9540621012335176, 'eval_accuracy': 0.949, 'eval_runtime': 0.2164, 'eval_samples_per_second': 706.951, 'eval_steps_per_second': 46.206, 'epoch': 26.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3283710479736328, 'eval_precision': 0.9550394204133816, 'eval_recall': 0.9507848960543063, 'eval_f1': 0.9529074093759966, 'eval_accuracy': 0.9482, 'eval_runtime': 0.2181, 'eval_samples_per_second': 701.641, 'eval_steps_per_second': 45.859, 'epoch': 27.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3291175663471222, 'eval_precision': 0.954265049989364, 'eval_recall': 0.951633432329232, 'eval_f1': 0.952947424322889, 'eval_accuracy': 0.9486, 'eval_runtime': 0.23, 'eval_samples_per_second': 665.346, 'eval_steps_per_second': 43.487, 'epoch': 28.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3249143362045288, 'eval_precision': 0.9537547730165464, 'eval_recall': 0.9537547730165464, 'eval_f1': 0.9537547730165464, 'eval_accuracy': 0.9486, 'eval_runtime': 0.2182, 'eval_samples_per_second': 701.088, 'eval_steps_per_second': 45.823, 'epoch': 29.0}
{'loss': 0.0031, 'learning_rate': 8.059701492537314e-06, 'epoch': 29.85}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3295745849609375, 'eval_precision': 0.9547770700636943, 'eval_recall': 0.9539669070852779, 'eval_f1': 0.95437181663837, 'eval_accuracy': 0.9494, 'eval_runtime': 0.2148, 'eval_samples_per_second': 712.411, 'eval_steps_per_second': 46.563, 'epoch': 30.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.34031638503074646, 'eval_precision': 0.9529687167482443, 'eval_recall': 0.9499363597793806, 'eval_f1': 0.9514501221714651, 'eval_accuracy': 0.9472, 'eval_runtime': 0.2256, 'eval_samples_per_second': 678.155, 'eval_steps_per_second': 44.324, 'epoch': 31.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3531687557697296, 'eval_precision': 0.9545842217484009, 'eval_recall': 0.9497242257106492, 'eval_f1': 0.9521480221182476, 'eval_accuracy': 0.9474, 'eval_runtime': 0.2188, 'eval_samples_per_second': 699.421, 'eval_steps_per_second': 45.714, 'epoch': 32.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.35065627098083496, 'eval_precision': 0.9534537725823592, 'eval_recall': 0.951633432329232, 'eval_f1': 0.9525427327741798, 'eval_accuracy': 0.9466, 'eval_runtime': 0.2379, 'eval_samples_per_second': 643.174, 'eval_steps_per_second': 42.038, 'epoch': 33.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3490392863750458, 'eval_precision': 0.9550106609808102, 'eval_recall': 0.950148493848112, 'eval_f1': 0.9525733730327519, 'eval_accuracy': 0.947, 'eval_runtime': 0.222, 'eval_samples_per_second': 689.108, 'eval_steps_per_second': 45.04, 'epoch': 34.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.34184518456459045, 'eval_precision': 0.9563272262462719, 'eval_recall': 0.9522698345354264, 'eval_f1': 0.9542942176870749, 'eval_accuracy': 0.9492, 'eval_runtime': 0.2226, 'eval_samples_per_second': 687.467, 'eval_steps_per_second': 44.932, 'epoch': 35.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.35368612408638, 'eval_precision': 0.9549434123425155, 'eval_recall': 0.948663555366992, 'eval_f1': 0.951793125465574, 'eval_accuracy': 0.947, 'eval_runtime': 0.2191, 'eval_samples_per_second': 698.38, 'eval_steps_per_second': 45.646, 'epoch': 36.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3446611762046814, 'eval_precision': 0.9565124706885525, 'eval_recall': 0.9518455663979635, 'eval_f1': 0.9541733120680489, 'eval_accuracy': 0.9486, 'eval_runtime': 0.2282, 'eval_samples_per_second': 670.463, 'eval_steps_per_second': 43.821, 'epoch': 37.0}
{'loss': 0.0016, 'learning_rate': 5.074626865671642e-06, 'epoch': 37.31}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3472658097743988, 'eval_precision': 0.9558917536756872, 'eval_recall': 0.951633432329232, 'eval_f1': 0.9537578399064527, 'eval_accuracy': 0.949, 'eval_runtime': 0.2283, 'eval_samples_per_second': 670.179, 'eval_steps_per_second': 43.803, 'epoch': 38.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.34810274839401245, 'eval_precision': 0.9567348678601876, 'eval_recall': 0.9522698345354264, 'eval_f1': 0.9544971294918136, 'eval_accuracy': 0.9494, 'eval_runtime': 0.2176, 'eval_samples_per_second': 703.034, 'eval_steps_per_second': 45.95, 'epoch': 39.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3485768735408783, 'eval_precision': 0.9575783415050096, 'eval_recall': 0.9529062367416207, 'eval_f1': 0.9552365762892079, 'eval_accuracy': 0.9502, 'eval_runtime': 0.2192, 'eval_samples_per_second': 697.993, 'eval_steps_per_second': 45.62, 'epoch': 40.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.35209789872169495, 'eval_precision': 0.9567071870334826, 'eval_recall': 0.951633432329232, 'eval_f1': 0.9541635648197383, 'eval_accuracy': 0.9486, 'eval_runtime': 0.2232, 'eval_samples_per_second': 685.589, 'eval_steps_per_second': 44.81, 'epoch': 41.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3542582094669342, 'eval_precision': 0.9540913921360255, 'eval_recall': 0.9522698345354264, 'eval_f1': 0.953179743072513, 'eval_accuracy': 0.9478, 'eval_runtime': 0.2156, 'eval_samples_per_second': 709.586, 'eval_steps_per_second': 46.378, 'epoch': 42.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3594226837158203, 'eval_precision': 0.9538199616939774, 'eval_recall': 0.9507848960543063, 'eval_f1': 0.9523000106236056, 'eval_accuracy': 0.9472, 'eval_runtime': 0.2249, 'eval_samples_per_second': 680.297, 'eval_steps_per_second': 44.464, 'epoch': 43.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.35702216625213623, 'eval_precision': 0.9554180887372014, 'eval_recall': 0.950148493848112, 'eval_f1': 0.9527760051052968, 'eval_accuracy': 0.9472, 'eval_runtime': 0.225, 'eval_samples_per_second': 680.054, 'eval_steps_per_second': 44.448, 'epoch': 44.0}
{'loss': 0.0012, 'learning_rate': 2.08955223880597e-06, 'epoch': 44.78}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3585200011730194, 'eval_precision': 0.9548167092924126, 'eval_recall': 0.9503606279168434, 'eval_f1': 0.9525834573676377, 'eval_accuracy': 0.947, 'eval_runtime': 0.2221, 'eval_samples_per_second': 689.029, 'eval_steps_per_second': 45.035, 'epoch': 45.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3564692437648773, 'eval_precision': 0.9552620366425224, 'eval_recall': 0.9512091641917692, 'eval_f1': 0.9532312925170068, 'eval_accuracy': 0.9476, 'eval_runtime': 0.2219, 'eval_samples_per_second': 689.625, 'eval_steps_per_second': 45.074, 'epoch': 46.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3562890291213989, 'eval_precision': 0.9546712066397106, 'eval_recall': 0.951633432329232, 'eval_f1': 0.9531498990757463, 'eval_accuracy': 0.948, 'eval_runtime': 0.2257, 'eval_samples_per_second': 677.949, 'eval_steps_per_second': 44.31, 'epoch': 47.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.35917773842811584, 'eval_precision': 0.9548359608010226, 'eval_recall': 0.9507848960543063, 'eval_f1': 0.9528061224489797, 'eval_accuracy': 0.9472, 'eval_runtime': 0.2232, 'eval_samples_per_second': 685.339, 'eval_steps_per_second': 44.793, 'epoch': 48.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3588024973869324, 'eval_precision': 0.9544390036193315, 'eval_recall': 0.9509970301230377, 'eval_f1': 0.952714908086282, 'eval_accuracy': 0.9476, 'eval_runtime': 0.2251, 'eval_samples_per_second': 679.658, 'eval_steps_per_second': 44.422, 'epoch': 49.0}


  0%|          | 0/10 [00:00<?, ?it/s]

{'eval_loss': 0.3581068217754364, 'eval_precision': 0.9542066027689031, 'eval_recall': 0.9503606279168434, 'eval_f1': 0.9522797321713253, 'eval_accuracy': 0.9472, 'eval_runtime': 0.2268, 'eval_samples_per_second': 674.46, 'eval_steps_per_second': 44.082, 'epoch': 50.0}
{'train_runtime': 225.5762, 'train_samples_per_second': 237.614, 'train_steps_per_second': 14.851, 'train_loss': 0.028842600201492877, 'epoch': 50.0}


  0%|          | 0/8 [00:00<?, ?it/s]

(<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x2149b76d8d0>,
 {'O': {'precision': 0.974044360547428,
   'recall': 0.9687866697958226,
   'f1_score': 0.9714084009883517},
  'AC': {'precision': 0.8212927756653993,
   'recall': 0.8212927756653993,
   'f1_score': 0.8212927756653993},
  'LF': {'precision': 0.8136272545090181,
   'recall': 0.8529411764705882,
   'f1_score': 0.8328205128205128}},
 'checkpoints/distilbert-uncased-best',
 {'retrained': True, 'train_time': '225.77s', 'epochs': 50})