# Libraries & Functions

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs
!pip install seqeval

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━

In [2]:
"""Connect to Huggingface Hub"""
!git config --global user.email "viktor.domazetoski@hotmail.com"
!git config --global user.name "ViktorDo1"

from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import numpy as np
import pandas as pd

In [4]:
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments
from transformers import Trainer

In [5]:
from torch.utils.data import DataLoader
import torch
from accelerate import Accelerator

In [6]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

In [7]:
import evaluate
metric = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [8]:
def dataset_sample(raw_datasets, label_name):
  i = np.random.randint(len(raw_datasets["train"]))
  words = raw_datasets["train"][i]["tokens"]
  labels = raw_datasets["train"][i][label_name]
  line1 = ""
  line2 = ""
  for word, label in zip(words, labels):
      full_label = label_names[label]
      max_length = max(len(word), len(full_label))
      line1 += word + " " * (max_length - len(word) + 1)
      line2 += full_label + " " * (max_length - len(full_label) + 1)

  print(line1)
  print(line2)

In [9]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, max_length = 512, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [11]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [12]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [13]:
# model_names = ["BERT", "BioBERT"]
# checkpoint_names = ["bert-base-cased", "dmis-lab/biobert-base-cased-v1.2"]

In [14]:
model_names = ["EcoBERT", "DistilBERT", "DeBERTa", "electra"]
checkpoint_names = ["ViktorDo/EcoBERT-Pretrained", "distilbert-base-uncased", "microsoft/deberta-v3-base", "google/electra-base-discriminator"]

# Input Data

In [15]:
raw_datasets = dict()
id2label = dict()
label2id = dict()

## conll 2003

In [None]:
dataset_name = "conll2003"
raw_datasets[dataset_name] = load_dataset("conll2003")

In [None]:
ner_feature = raw_datasets[dataset_name]["train"].features["ner_tags"]
label_names = ner_feature.feature.names
print("Dataset Label Names :", label_names)

Dataset Label Names : ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [None]:
id2label[dataset_name] = {i: label for i, label in enumerate(label_names)}
label2id[dataset_name] = {v: k for k, v in id2label[dataset_name].items()}

In [None]:
dataset_sample(raw_datasets[dataset_name], "ner_tags")

Police said the 111 passengers and six crew on board the ferry Trident Seven  , owned by France 's Emeraud line , were rescued by a variety of private and commercial boats after fire broke out in the engine room soon after it left port . 
O      O    O   O   O          O   O   O    O  O     O   O     B-MISC  I-MISC O O     O  B-LOC  O  B-ORG   O    O O    O       O  O O       O  O       O   O          O     O     O    O     O   O  O   O      O    O    O     O  O    O    O 


## Species 800

In [16]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [34]:
dataset_name = "S800"
data_location = "/content/drive/My Drive/NER/"

raw_datasets[dataset_name] = DatasetDict().load_from_disk(data_location + "/corpora/S800_GSC_brat/S800_HF")

In [35]:
raw_datasets[dataset_name]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 437
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 125
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 63
    })
})

In [18]:
ner_feature = raw_datasets[dataset_name]["train"].features["ner_tags"]
label_names = ner_feature.feature.names
print("Dataset Label Names :", label_names)

Dataset Label Names : ['O', 'B-Species', 'I-Species']


In [19]:
id2label[dataset_name] = {i: label for i, label in enumerate(label_names)}
label2id[dataset_name] = {v: k for k, v in id2label[dataset_name].items()}

In [20]:
dataset_sample(raw_datasets[dataset_name], "ner_tags")

Poliovirus disruption of cytoplasmic processing bodies. Metazoan cells form cytoplasmic mRNA granules such as stress granules (SG) and processing bodies (P bodies) that are proposed to be sites of aggregated, translationally silenced mRNAs and mRNA degradation. Poliovirus ([PV](Species)) is a plus-strand RNA virus containing a genome that is a functional mRNA; thus, we investigated if PV        antagonizes the processes that lead to formation of these structures. We have previously shown that PV        infection inhibits the ability of cells to form stress granules by cleaving RasGAP-SH3-binding protein (G3BP). Here, we show that P bodies are also disrupted during PV        infection in cells by 4 h postinfection. The disruption of P bodies is more rapid and more complete than disruption of stress granules. The kinetics of P body disruption correlated with production of viral proteinases and required substantial viral gene product expression. The organizing mechanism that forms P body 

## Copious

In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [36]:
dataset_name = "copious"
data_location = "/content/drive/My Drive/NER/"

raw_datasets[dataset_name] = DatasetDict().load_from_disk(data_location + "/corpora/COPIOUS_GSC_brat/copious_HF")

In [37]:
raw_datasets[dataset_name]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 497
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 65
    })
    dev: Dataset({
        features: ['tokens', 'ner_tags', 'idx'],
        num_rows: 58
    })
})

In [23]:
ner_feature = raw_datasets[dataset_name]["train"].features["ner_tags"]
label_names = ner_feature.feature.names
print("Dataset Label Names :", label_names)

Dataset Label Names : ['O', 'B-LIVB', 'I-LIVB']


In [24]:
id2label[dataset_name] = {i: label for i, label in enumerate(label_names)}
label2id[dataset_name] = {v: k for k, v in id2label[dataset_name].items()}

In [25]:
dataset_sample(raw_datasets[dataset_name], "ner_tags")

MAMMALS OP THE PHILIPPINE ISLANDS. 33 Genus  CARPOMYS Thomas 1895. Carpomys Thomas Ann. and Mag. Nat. Hist, VI, 16, 161. Type. Carpomys melanurus Thomas Externally somewhat similar to Batomys; fur thick and wooly ; tail long, well haired, darker in color than body. General color deep fulvous, coarsely lined with black ; under parts dull yellowish-white. Head and body about 197 millimeters; tail, 175 to 211; hind foot, 31 to 34. Carpomys melanurus Thomas 1895. Carpomys melanurus Thomas Ann. and Mag. Nat. Hist, VI, 16, 162. Type locality. Highlands of northern Luzon. Luzon (Thomas). Carpomys phaeurus Thomas 1895. Carpomys phaeurus Thomas Ann. and Mag. Nat. Hist, VI, 16, 162. Type locality. Highlands of northern Luzon. Luzon (Thomas). Genus  CRATEROMYS Thomas 1895. Crateromys Thomas, Ann. and Mag. Nat. Hist, VI, 16, 163. Type. Phloeomys schadenbergi Meyer  Size very large ; externally somewhat like Phlosomys but with more bushy tail and much smaller claws. Muzzle pointed. Normal color bla

## Preprocess Data

In [26]:
tokenized_datasets_dict = {}

In [27]:
tokenizer_dict = {}

for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  if(model_checkpoint == "ViktorDo/EcoBERT-Pretrained"):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    tokenizer_dict[model_name] = AutoTokenizer.from_pretrained("distilbert-base-uncased")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tokenizer_dict[model_name] = AutoTokenizer.from_pretrained(model_checkpoint)
  for dataset_name in list(raw_datasets.keys())[:]:
    tokenized_datasets_dict[dataset_name, model_name] = raw_datasets[dataset_name].map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets[dataset_name]["train"].column_names,
    )

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/437 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

In [28]:
data_collator_dict = {}
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  if(model_checkpoint == "ViktorDo/EcoBERT-Pretrained"):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
  else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  data_collator_dict[model_name] = DataCollatorForTokenClassification(tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Models

## Training

In [44]:
model_dict = {}

In [45]:
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  for dataset_name in raw_datasets:
        model_dict[dataset_name, model_name] = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label[dataset_name],
        label2id=label2id[dataset_name],
    )

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at ViktorDo/EcoBERT-Pretrained and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForTokenClassification were not initialized from the model c

In [46]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [47]:
for model_name, model_checkpoint in zip(model_names[:], checkpoint_names[:]):
  for dataset_name in list(raw_datasets.keys())[::-1][:]:
      print(model_name, dataset_name)
      ner_feature = raw_datasets[dataset_name]["train"].features["ner_tags"]
      label_names = ner_feature.feature.names
      print("Dataset Label Names :", label_names)

      args = TrainingArguments(
          "{}-finetuned-ner-{}".format(model_name, dataset_name),
          evaluation_strategy="epoch",
          save_strategy="epoch",
          learning_rate=2e-5,
          num_train_epochs=5,
          weight_decay=0.01,
          push_to_hub=True,
      )

      if(dataset_name == "conll"):
        test_set_name = "validation"
      else:
        test_set_name = "test"

      trainer = Trainer(
          model=model_dict[dataset_name, model_name],
          args=args,
          train_dataset=tokenized_datasets_dict[dataset_name, model_name]["train"],
          eval_dataset=tokenized_datasets_dict[dataset_name, model_name][test_set_name],
          data_collator=data_collator_dict[model_name],
          compute_metrics=compute_metrics,
          tokenizer=tokenizer_dict[model_name],
      )
      trainer.train()

      trainer.push_to_hub()

EcoBERT copious
Dataset Label Names : ['O', 'B-LIVB', 'I-LIVB']


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.132801,0.258621,0.304348,0.279627,0.952217
2,No log,0.088482,0.487562,0.568116,0.524766,0.968805
3,No log,0.081585,0.551351,0.591304,0.570629,0.972018
4,No log,0.076445,0.608637,0.653623,0.630328,0.974843
5,No log,0.076376,0.614362,0.669565,0.640777,0.974745


EcoBERT S800
Dataset Label Names : ['O', 'B-Species', 'I-Species']


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.127767,0.313636,0.387097,0.346516,0.94656
2,No log,0.090395,0.445465,0.54418,0.489899,0.965711
3,No log,0.082496,0.509583,0.633941,0.565,0.969149
4,No log,0.07948,0.508314,0.600281,0.550482,0.969865
5,No log,0.083026,0.527778,0.612903,0.567164,0.969819


DistilBERT copious
Dataset Label Names : ['O', 'B-LIVB', 'I-LIVB']


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.132207,0.312893,0.288406,0.300151,0.952899
2,No log,0.08421,0.519004,0.573913,0.545079,0.971142
3,No log,0.077183,0.576455,0.617391,0.596221,0.974031
4,No log,0.075068,0.603518,0.646377,0.624213,0.975102
5,No log,0.075541,0.605615,0.656522,0.630042,0.975232


DistilBERT S800
Dataset Label Names : ['O', 'B-Species', 'I-Species']


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.136278,0.245958,0.298738,0.269791,0.943606
2,No log,0.092592,0.394299,0.465638,0.42701,0.963612
3,No log,0.082325,0.493714,0.605891,0.544081,0.968273
4,No log,0.080176,0.518657,0.584853,0.549769,0.969588
5,No log,0.083354,0.532927,0.612903,0.570124,0.968896


DeBERTa copious
Dataset Label Names : ['O', 'B-LIVB', 'I-LIVB']


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.063205,0.679341,0.738292,0.707591,0.97894
2,No log,0.05074,0.755945,0.831956,0.792131,0.983719
3,No log,0.05169,0.777062,0.830579,0.802929,0.983983
4,No log,0.051684,0.782166,0.84573,0.812707,0.983917
5,No log,0.049942,0.786736,0.833333,0.809365,0.98418


DeBERTa S800
Dataset Label Names : ['O', 'B-Species', 'I-Species']


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.074395,0.58396,0.652661,0.616402,0.97033
2,No log,0.063911,0.633218,0.768908,0.694497,0.976445
3,No log,0.058499,0.642445,0.780112,0.704617,0.976627
4,No log,0.058064,0.675386,0.795518,0.730547,0.978466
5,No log,0.06058,0.673031,0.789916,0.726804,0.978311


electra copious
Dataset Label Names : ['O', 'B-LIVB', 'I-LIVB']


You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.099864,0.492669,0.486957,0.489796,0.967506
2,No log,0.067748,0.672799,0.742029,0.70572,0.978316
3,No log,0.064838,0.715328,0.710145,0.712727,0.979517
4,No log,0.061943,0.729917,0.763768,0.746459,0.980913
5,No log,0.061359,0.736111,0.768116,0.751773,0.981432


electra S800
Dataset Label Names : ['O', 'B-Species', 'I-Species']


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.111516,0.473616,0.516129,0.49396,0.955213
2,No log,0.076543,0.578883,0.669004,0.62069,0.972057
3,No log,0.071121,0.56708,0.70547,0.62875,0.97298
4,No log,0.069845,0.626551,0.708275,0.664911,0.975333
5,No log,0.069678,0.614646,0.718093,0.662354,0.975795


## Evaluation

In [48]:
results_dict = dict()
results_list = []
for model_name, model_checkpoint in zip(model_names, checkpoint_names):
  for dataset_name in list(raw_datasets.keys())[::-1][:]:

    train_dataloader = DataLoader(
        tokenized_datasets_dict[dataset_name, model_name]["train"],
        shuffle=True,
        collate_fn=data_collator_dict[model_name],
        batch_size=8,
    )
    if(dataset_name == "conll"):
      test_set_name = "validation"
    else:
      test_set_name = "test"
    eval_dataloader = DataLoader(
        tokenized_datasets_dict[dataset_name, model_name][test_set_name], collate_fn=data_collator_dict[model_name], batch_size=8
    )

    accelerator = Accelerator()
    model, train_dataloader, eval_dataloader = accelerator.prepare(
        model_dict[dataset_name, model_name], train_dataloader, eval_dataloader
    )

    labels_list = []
    prediction_list = []

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

        labels_list.append(true_labels)
        prediction_list.append(true_predictions)

    results = metric.compute()
    results_dict[dataset_name, model_name] = results
    results_list.append([dataset_name, model_name] +[results[f"overall_{key}"] for key in ["accuracy", "precision", "recall", "f1"]])
    print("Dataset:", dataset_name)
    print("Model:", model_name)
    print(
        f"Results:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )
    print()

df_results = pd.DataFrame(results_list, columns = ["Dataset", "Model", "Accuracy", "Precision", "Recall", "F1-Score"])

Dataset: copious
Model: EcoBERT
Results: {'precision': 0.6695652173913044, 'recall': 0.6143617021276596, 'f1': 0.6407766990291263, 'accuracy': 0.974745179510485}

Dataset: S800
Model: EcoBERT
Results: {'precision': 0.6129032258064516, 'recall': 0.5277777777777778, 'f1': 0.5671641791044777, 'accuracy': 0.9698186349162398}

Dataset: copious
Model: DistilBERT
Results: {'precision': 0.6565217391304348, 'recall': 0.6056149732620321, 'f1': 0.6300417246175244, 'accuracy': 0.9752320976433162}

Dataset: S800
Model: DistilBERT
Results: {'precision': 0.6129032258064516, 'recall': 0.5329268292682927, 'f1': 0.5701239399869537, 'accuracy': 0.9688956573907426}

Dataset: copious
Model: DeBERTa
Results: {'precision': 0.8333333333333334, 'recall': 0.7867360208062418, 'f1': 0.8093645484949833, 'accuracy': 0.9841803440775163}

Dataset: S800
Model: DeBERTa
Results: {'precision': 0.7899159663865546, 'recall': 0.6730310262529833, 'f1': 0.7268041237113403, 'accuracy': 0.9783110051566427}

Dataset: copious
Mod

In [49]:
df_results

Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,F1-Score
0,copious,EcoBERT,0.974745,0.669565,0.614362,0.640777
1,S800,EcoBERT,0.969819,0.612903,0.527778,0.567164
2,copious,DistilBERT,0.975232,0.656522,0.605615,0.630042
3,S800,DistilBERT,0.968896,0.612903,0.532927,0.570124
4,copious,DeBERTa,0.98418,0.833333,0.786736,0.809365
5,S800,DeBERTa,0.978311,0.789916,0.673031,0.726804
6,copious,electra,0.981432,0.768116,0.736111,0.751773
7,S800,electra,0.975795,0.718093,0.614646,0.662354


In [50]:
df_results.to_excel("NER_Results.xlsx", index = False)