# Preparation

In [None]:
# Install dependencies
# %pip install -q -U ipywidgets transformers tqdm
# %pip install -q -U seqeval
# %pip install -q -U accelerate
# %pip install -q -U transformers[torch]
# %pip install -q --upgrade -U torch torchvision torchaudio torchtext
# %pip install -q dill==0.3.1.1
# %pip install -q numpy==1.14.3
# %pip install -q pyarrow==0.3.8
# %pip install -q multiprocess==0.70.16
# %pip install -q -U datasets==2.6.0
# %pip install fsspec==2023.9.2

## Set Seed and CUDA

In [1]:
import datasets
print(datasets.__version__)

2.18.0


In [2]:
import torch
import torchtext

SEED = 1234
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

print("PyTorch Version: ", torch.__version__)
print("torchtext Version: ", torchtext.__version__)
print(f"Using {'GPU' if str(DEVICE) == 'cuda' else 'CPU'}.")

PyTorch Version:  2.2.1+cu121
torchtext Version:  0.17.1+cpu
Using GPU.


## Data Prep

### Download the Dataset
this will download the huggingface dataset ready for use

In [2]:
from datasets import load_dataset, Features, Value
# import pyarrow as pa
# data_type = pa.list_(pa.string())
# context_feat = Features({'tokens': Value(dtype="string"), 'pos_tags':Value(dtype="string"), 'ner_tags':Value(dtype="string")})
# dataset = load_dataset("surrey-nlp/PLOD-CW", features=context_feat)
dataset = load_dataset("surrey-nlp/PLOD-CW", cache_dir=None, download_mode="force_redownload")

Downloading readme:   0%|          | 0.00/8.37k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 188k/188k [00:00<00:00, 534kB/s]
Downloading data: 100%|██████████| 28.4k/28.4k [00:00<00:00, 118kB/s]
Downloading data: 100%|██████████| 28.7k/28.7k [00:00<00:00, 119kB/s]


Generating train split:   0%|          | 0/1072 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/153 [00:00<?, ? examples/s]

In [3]:
print(type(dataset))

<class 'datasets.dataset_dict.DatasetDict'>


### Get Label List
this gets the list of labels for the dataset

In [4]:
label_list = ['B-O', 'B-AC', 'B-LF', 'I-LF']
print(label_list)

['B-O', 'B-AC', 'B-LF', 'I-LF']


### Split The Set Into Train, Val and Test Sets

In [7]:
train = dataset['train']
print(f"train size: {len(train)}")
val = dataset['validation']
print(f"val size: {len(val)}")
test = dataset['test']
print(f"test size: {len(test)}")

def flatten(A):
    rt = []
    for i in A:
        if isinstance(i,list): rt.extend(flatten(i))
        else: rt.append(i)
    return rt

from collections import Counter
flat = flatten(train["ner_tags"])
print(Counter(flat))

train size: 1072
val size: 126
test size: 153
Counter({'B-O': 32971, 'I-LF': 3231, 'B-AC': 2336, 'B-LF': 1462})


## Data visualization
Here I visualize the dataset to be used for this course work and analyse its features

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import os

def analyze_nlp_dataset(data, output_folder):
  """
  Analyzes and visualizes an NLP dataset with tokens, POS tags, and NER tags.

  Args:
      data: A dictionary containing separate lists for tokens, POS tags, and NER tags.
          - data["tokens"]: A list of lists of tokens.
          - data["pos_tags"]: A list of lists of POS tags.
          - data["ner_tags"]: A list of lists of NER tags.
      output_folder: The folder path to save generated plots.
  """

  try:
    os.mkdir(output_folder)
  except FileExistsError:
    pass  # Folder already exists, continue

  # POS Tag Analysis
  all_pos_tags = [pos_tag for row in data["pos_tags"] for pos_tag in row]
  pos_tag_counts = Counter(all_pos_tags)

  # Plot POS tag distribution
  plt.figure(figsize=(8, 6))
  plt.pie(pos_tag_counts.values(), labels=pos_tag_counts.keys(), autopct="%1.1f%%")
  plt.title("POS Tag Distribution")
  plt.savefig(f"{output_folder}/pos_tag_distribution.png")
  plt.close()

  # NER Tag Analysis
  all_ner_tags = [ner_tag for row in data["ner_tags"] for ner_tag in row]
  ner_tag_counts = Counter(all_ner_tags)

  # Plot NER tag distribution (if any named entities exist)
  if ner_tag_counts:
    plt.figure(figsize=(8, 6))
    plt.bar(ner_tag_counts.keys(), ner_tag_counts.values())
    plt.xlabel("NER Tag")
    plt.ylabel("Frequency")
    plt.title("NER Tag Distribution")
    plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for better readability
    plt.tight_layout()
    plt.savefig(f"{output_folder}/ner_tag_distribution.png")
    plt.close()
  else:
    print("No named entity tags found in the data for NER tag analysis.")

  # Analysis of POS tags within NER tags
  pos_in_ner_tags = {}
  for tokens, pos_tags, ner_tags in zip(data["tokens"], data["pos_tags"], data["ner_tags"]):
    for token, pos_tag, ner_tag in zip(tokens, pos_tags, ner_tags):
      if ner_tag and ner_tag != "O":  # Consider only named entity tags (excluding "O")
        pos_in_ner_tags.setdefault(ner_tag, []).append(pos_tag)

  # Calculate POS tag proportions within each NER tag (if data exists)
  if pos_in_ner_tags:
    for ner_tag, pos_tag_list in pos_in_ner_tags.items():
      pos_tag_counts_in_ner = Counter(pos_tag_list)
      total_count = sum(pos_tag_counts_in_ner.values())
      pos_in_ner_tags[ner_tag] = {tag: count / total_count for tag, count in pos_tag_counts_in_ner.items()}

  # Print insights from POS tags within NER tags analysis (optional)
  if pos_in_ner_tags:
    print("\nInsights from POS tags within NER tags:")
    for ner_tag, pos_tag_proportions in pos_in_ner_tags.items():
      print(f"- NER Tag: {ner_tag}")
      for pos_tag, proportion in pos_tag_proportions.items():
        print(f"  - Proportion of {pos_tag}: {proportion:.2f}")


    # Visualize POS tags within NER tags (if data exists)
  if pos_in_ner_tags:
    for ner_tag, pos_tag_proportions in pos_in_ner_tags.items():
      plt.figure(figsize=(8, 6))
      plt.bar(pos_tag_proportions.keys(), pos_tag_proportions.values())
      plt.xlabel("POS Tag")
      plt.ylabel("Proportion")
      plt.title(f"POS Tag Proportions within NER Tag: {ner_tag}")
      plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for better readability
      plt.tight_layout()
      plt.savefig(f"{output_folder}/pos_in_ner_{ner_tag}.png")
      plt.close()

  print("Analysis complete. Plots saved to", output_folder)

In [None]:
analyze_nlp_dataset(train, "train_set_analysis")
analyze_nlp_dataset(val, "val_set_analysis")
analyze_nlp_dataset(test, "test_set_analysis")

## Data Pre-Processing

### Lemmatization

In [7]:
import nltk
nltk.download('wordnet')

# import subprocess

# # Download and unzip wordnet
# try:
#     nltk.data.find('wordnet.zip')
# except:
#     nltk.download('wordnet', download_dir='/kaggle/working/')
#     command = "unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora"
#     subprocess.run(command.split())
#     nltk.data.path.append('/kaggle/working/')

# # Now you can import the NLTK resources as usual
# from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harry\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

the `combine_lists_elementwise` function turns two lists into one, pairing each element elementwise, maintaining the shape of the original list

In [8]:
def combine_lists_elementwise(list_A, list_B):
  """
  Combines two 2D lists of strings element-wise into a 2D list of tuples.

  Args:
      list_A: A 2D list of strings (e.g., [['A', 'A', 'A'], ['A', 'A', 'A']]).
      list_B: Another 2D list of strings with the same dimensions as list_A.

  Returns:
      A 2D list of tuples, where each tuple combines corresponding elements from list_A and list_B.

  Raises:
      ValueError: If the dimensions of list_A and list_B don't match.
  """

  # Check if dimensions match
  if len(list_A) != len(list_B) or len(list_A[0]) != len(list_B[0]):
    raise ValueError("Dimensions of lists A and B must be equal.")

  # Create the resulting list using list comprehension
  return [[(a, b) for a, b in zip(row_a, row_b)] for row_a, row_b in zip(list_A, list_B)]

the nltk lemmatize function takes a certain format for POS_tags so the `convert_pos_tag` maps a POS_tag from the dataset, to one in the required format. Its important to note that alot of data is lost due to the simplicity of the nltk lemmatize function

In [9]:
def convert_pos_tag(nltk_tag):
    """
    Converts NLTK POS tags to the format expected by the lemmatizer.

    Args:
        nltk_tag: The POS tag in NLTK format (e.g., VBG, NNS).

    Returns:
        The corresponding POS tag for the lemmatizer (n, v, a, r, or s) or None if no match.
    """

    tag_map = {
        'NUM': '',  # Number (not handled by lemmatizer)
        'CCONJ': '',  # Coordinating conjunction (not handled)
        'PRON': '',  # Pronoun (not handled)
        'NOUN': 'n',   # Noun
        'SCONJ': '',  # Subordinating conjunction (not handled)
        'SYM': '',   # Symbol (not handled)
        'INTJ': '',  # Interjection (not handled)
        'ADJ': 'a',    # Adjective
        'ADP': '',   # Preposition (not handled)
        'PUNCT': '',  # Punctuation (not handled)
        'ADV': 'r',    # Adverb
        'AUX': 'v',    # Auxiliary verb
        'DET': '',   # Determiner (not handled)
        'VERB': 'v',   # Verb
        'X': '',      # Other (not handled)
        'PART': '',   # Particle (not handled)
        'PROPN': 'n',   # Proper noun
    }
    return tag_map.get(nltk_tag)

the `lemmatize_list` function takes the tokens and their respective pos_tags and lemmatizes the tokens

In [11]:
def lemmatize_list(data, pos_tags):
    """
    Lemmatizes a 2D list of tokens using NLTK.

    Args:
        data: A 2D list of strings (tokens) to be lemmatized.

    Returns:
        A 2D list containing the lemmatized tokens.
    """

    # Initialize the WordNet lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    pos_tags = [[convert_pos_tag(tag) for tag in row] for row in pos_tags]

    data = combine_lists_elementwise(data, pos_tags)


    # Lemmatize with part-of-speech information
    lemmatized_data = [[token if pos == '' else lemmatizer.lemmatize(token, pos) for token, pos in row] for row in data]

    return lemmatized_data

### Pre-Processing Pipeline
the `pre_process_data` function applies lemmatization and lowercase to the given data

In [12]:
def pre_process_data(tokens, pos_tags):
    # lemmatize the data
    data = lemmatize_list(tokens, pos_tags)
    # lowercase the data
    data = [[string.lower() for string in row] for row in data]
    return data

In [13]:
from datasets import DatasetDict, Dataset
train_tokens = pre_process_data(train["tokens"], train["pos_tags"])
val_tokens = pre_process_data(val["tokens"], val["pos_tags"])
test_tokens = pre_process_data(test["tokens"], test["pos_tags"])
original_train_tokens = train["tokens"]
original_val_tokens = val["tokens"]
print(f"original train tokens: {original_train_tokens[0]}\npre-processed train tokens: {train_tokens[0]}")
print(f"original val tokens: {original_val_tokens[0]}\npre-processed val tokens: {val_tokens[0]}")

dataset = DatasetDict({
    "train": Dataset.from_dict({"tokens": train_tokens, "pos_tags": train["pos_tags"], "ner_tags": train["ner_tags"]}),
    "validation": Dataset.from_dict({"tokens": val_tokens, "pos_tags": val["pos_tags"], "ner_tags": val["ner_tags"]}),
    "test": Dataset.from_dict({"tokens": test_tokens, "pos_tags": test["pos_tags"], "ner_tags": test["ner_tags"]}),
})

original train tokens: ['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(', 'GYPES', ')', 'was', 'developed', '.']
pre-processed train tokens: ['for', 'this', 'purpose', 'the', 'gothenburg', 'young', 'persons', 'empowerment', 'scale', '(', 'gypes', ')', 'be', 'develop', '.']
original val tokens: ['=', 'Manual', 'Ability', 'Classification', 'System', ';', 'QUEST', '=', 'Quest', '-', 'Quality', 'of', 'upper', 'extremity', 'skills', 'test', ';', 'Cont', '=', 'control', ';', 'M', '=', 'male', ',', 'F', '=', 'female', ',', 'V', '=', 'verbal', ',', 'nonV', '=', 'non', '-', 'Verbal', ',', '|Quad', '=', 'quadriplegia', ',', 'Di', '=', 'Diplegia', ',', 'Hemi', '=', 'hemiplegia', '.']
pre-processed val tokens: ['=', 'manual', 'ability', 'classification', 'system', ';', 'quest', '=', 'quest', '-', 'quality', 'of', 'upper', 'extremity', 'skill', 'test', ';', 'cont', '=', 'control', ';', 'm', '=', 'male', ',', 'f', '=', 'female', ',', 'v', '=', 'verbal', 

### Set Task
in this project we are doing Named Entity Recognition so I set the task to "ner"

In [14]:
task = "ner"

# Experiment 1 (Model)
HMM vs BERT

## HMM
The following is the implementation of an HMM model

### Library Import
I am using the nltk library for the HMM implementation

In [15]:
import nltk
from sklearn.metrics import classification_report

create lists of the sentences and associated tags from the train set

In [16]:
# sentences = train[:]["tokens"]
# tags = train[:]["ner_tags"]

sentences = train_tokens
tags = train[:]["ner_tags"]

print out an example of the first sentence and its tags

In [17]:
print(f"sentence: {sentences[0]}")
print(f"tags: {tags[0]}")

sentence: ['for', 'this', 'purpose', 'the', 'gothenburg', 'young', 'persons', 'empowerment', 'scale', '(', 'gypes', ')', 'be', 'develop', '.']
tags: ['B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O']


we generate a character set containing all the characters that can be used in the output of the model

In [18]:
def get_char_set(sentences):
    char_set = set()
    for sentence in sentences:
        for word in sentence:
            for char in word:
                char_set.add(char)
    char_set = list(char_set)
    return char_set

In [19]:
char_set = get_char_set(sentences)
print(f"char_set: {char_set}")

char_set: ['[', '∞', 'i', 'y', ')', '…', 'ν', 'p', '¯', '+', 'κ', '}', '§', 'γ', 'x', '♂', '{', '‡', '（', ',', '”', '3', '=', 'g', '/', 'e', '•', '−', '%', '′', 'ä', 'μ', ':', 's', 'ü', '"', 'σ', '´', 'q', '°', '5', '±', '7', 'ω', '&', 'k', 'b', "'", '×', 'ã', 'o', 'φ', '≤', 'β', 'h', '1', '4', '–', '.', '8', 'w', '“', ';', 'd', 'ß', 'a', '@', '♀', '>', '®', '≥', '9', 'ö', 'l', 't', 'å', 'ú', '$', 'é', '(', '→', 'f', 'µ', '‘', '—', 'm', 'ε', '2', '_', '0', 'λ', 'u', '-', 'j', '’', 'ó', 'θ', '*', 'è', 'ï', '†', 'c', '6', '）', 'α', '<', '·', 'v', ']', 'í', '‒', 'z', '#', '∑', '?', 'n', 'δ', 'r']


In [20]:
trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=label_list, symbols=char_set)
data = combine_lists_elementwise(sentences.copy(), tags.copy())
print(data[0])

[('for', 'B-O'), ('this', 'B-O'), ('purpose', 'B-O'), ('the', 'B-O'), ('gothenburg', 'B-LF'), ('young', 'I-LF'), ('persons', 'I-LF'), ('empowerment', 'I-LF'), ('scale', 'I-LF'), ('(', 'B-O'), ('gypes', 'B-AC'), (')', 'B-O'), ('be', 'B-O'), ('develop', 'B-O'), ('.', 'B-O')]


## BERT

The following is the implementation of BERT model

### Tokenizer

In [21]:
from transformers import AutoTokenizer
import transformers
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True) # use AutoTokenizer because it defaults to fast tokenizers where as using the BERT Tokenizer does not
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

#### Quick Example

In [15]:
# print an example tokenized text
# example = train[0]
# tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
# tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
# print(example["tokens"])
# print(tokens)

['For', 'this', 'purpose', 'the', 'Gothenburg', 'Young', 'Persons', 'Empowerment', 'Scale', '(', 'GYPES', ')', 'was', 'developed', '.']
['[CLS]', 'for', 'this', 'purpose', 'the', 'gothenburg', 'young', 'persons', 'empowerment', 'scale', '(', 'g', '##ype', '##s', ')', 'was', 'developed', '.', '[SEP]']


In [16]:
# check the length of tokens is the same as in the dataset sample
# len(example[f"{task}_tags"]), len(tokenized_input["input_ids"])
# print(tokenized_input.word_ids())

# word_ids = tokenized_input.word_ids()
# aligned_labels = [-100 if i is None else example[f"{task}_tags"][i] for i in word_ids]
# print(len(aligned_labels), len(tokenized_input["input_ids"])) # if it prints the same number twice, then everything is working

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 11, 12, 13, 14, None]
19 19


I need to map the string tokens to numbers

In [22]:
def encode_tags(tag_sequences, possible_tags):
    """
    Encodes a sequence of string tags into a list of corresponding integer tags.

    Args:
        tag_sequences: A 2d list of strings representing numerical tags.
        possible_tags: A list of strings representing the possible textual labels.

    Returns:
        A list of strings representing the decoded textual tags.
    """

    encoded_tags = [[possible_tags.index(tag) for tag in row] for row in tag_sequences]
    return encoded_tags

In [23]:
label_all_tokens = True
def tokenize_and_align_labels(data):
    tokenized_inputs = tokenizer(data["tokens"], truncation=True, is_split_into_words=True) ## For some models, you may need to set max_length to approximately 500.

    labels = []
    converted_tags = encode_tags(data[f"{task}_tags"], label_list)
    for i, label in enumerate(converted_tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [24]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

Map:   0%|          | 0/1072 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Map:   0%|          | 0/153 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [25]:
model_name = model_checkpoint.split("/")[-1]
batch_size = 16
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=1000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=['none'],
)

In [26]:
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")
# labels = [label_list[i] for i in example[f"{task}_tags"]]
# metric.compute(predictions=[labels], references=[labels])

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [27]:
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    overall_results = metric.compute(predictions=true_predictions, references=true_labels)
    
    true_labels = [item for sublist in true_labels for item in sublist]
    true_predictions = [item for sublist in true_predictions for item in sublist]
    
    
    results = classification_report(true_labels, true_predictions, labels = label_list)
    print(results)
    return {
        "precision": overall_results["overall_precision"],
        "recall": overall_results["overall_recall"],
        "f1": overall_results["overall_f1"],
        "accuracy": overall_results["overall_accuracy"],
    }

In [28]:
BERTtrainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

## Training

#### HMM Training

set up the hmm trainer and combine the tokens with their tags

train the model on the data

In [30]:
model = trainer.train_supervised(data)

save the model

In [31]:
import dill
def save_hmm(model, name):
    # Open a file for writing in binary mode
    with open(name, 'wb') as f:
        # Dill can handle more complex objects than pickle
        dill.dump(model, f)

    print(f"Model saved as {name}")



Model saved as hmm_model.dill


In [None]:
save_hmm(model, "hmm_model.dill")

load the model

In [32]:
def load_hmm(name):
    # Open the saved model file in binary read mode
    with open(name, 'rb') as f:
        # Load the model back into a variable using dill.load
        model = dill.load(f)
        print("Model loaded successfully!")
    return model

In [33]:
model = load_hmm("hmm_model.dill")

Model loaded successfully!


#### BERT Training

clear the cuda cache to avoid cuda memory issues

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()

train bert

In [None]:
BERTtrainer.train()

In [None]:
BERTtrainer.model.save_pretrained("/kaggle/working/BERT_save")

In [None]:
BERTtrainer.model.from_pretrained("/kaggle/working/BERT_save")

## Evaluation

#### HMM evaluation

In [34]:
def evaluate_hmm(model, test_sentences):
    predicted = []
    for sentence in test_sentences:
        test_result = model.tag(sentence)
        out_tags = []
        for word, tag in test_result:
            out_tags.append(tag)
        predicted.append(out_tags)
    return predicted

In [35]:
test_sentences = dataset["validation"][:]["tokens"]
test_sentences = pre_process_data(test_sentences, dataset["validation"][:]["pos_tags"])
correct_tags = dataset["validation"][:]["ner_tags"]
predicted = evaluate_hmm(model, test_sentences)

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


In [36]:
correct_tags = [item for sublist in correct_tags for item in sublist]
predicted = [item for sublist in predicted for item in sublist]
print(f"number of predictions: {len(predicted)}\nnumber of correct answers: {len(correct_tags)}")

print(correct_tags[:100])
print(predicted[:100])

number of predictions: 5000
number of correct answers: 5000
['B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'I-LF', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF', 'B-O', 'B-AC', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-LF', 'I-LF', 'I-LF']
['B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', 'B-O', '

In [37]:
print(classification_report(correct_tags, predicted, labels = label_list))

              precision    recall  f1-score   support

         B-O       0.87      0.99      0.93      4261
        B-AC       0.83      0.11      0.19       263
        B-LF       0.54      0.10      0.17       149
        I-LF       0.72      0.13      0.22       327

    accuracy                           0.86      5000
   macro avg       0.74      0.33      0.38      5000
weighted avg       0.85      0.86      0.82      5000



#### BERT evaluation

In [38]:
model = AutoModelForTokenClassification.from_pretrained("model_saves\\BERT_save", num_labels=len(label_list))
BERTtrainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [39]:
metrics = BERTtrainer.evaluate()
print(metrics)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.93      0.95      0.94      5197
        B-AC       0.73      0.74      0.74       563
        B-LF       0.64      0.25      0.36       290
        I-LF       0.60      0.69      0.64       487

    accuracy                           0.88      6537
   macro avg       0.73      0.66      0.67      6537
weighted avg       0.88      0.88      0.87      6537

{'eval_loss': 0.3339511752128601, 'eval_precision': 0.887690044139284, 'eval_recall': 0.8970758301668594, 'eval_f1': 0.8923582580115037, 'eval_accuracy': 0.8811381367599816, 'eval_runtime': 2.8251, 'eval_samples_per_second': 44.6, 'eval_steps_per_second': 44.6}


# Experiment 2 (Loss Functions)
cross entropy vs MSE
This experiment is on BERT as HMM doesnt use loss functions due to its statistical nature rather than being a neural network

## Creating Custom Trainers

In [57]:
from transformers import Trainer
import torch.nn as nn

# cross entropy
class CustomBERTTrainerCrossEntropy(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            loss = loss*loss
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss
    
    
# MLSML
class CustomBERTTrainerMLSML(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.MultiLabelSoftMarginLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            loss = loss*loss
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss
    
    
# KLDivLoss
class CustomBERTTrainerKLDiv(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.KLDivLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            loss = loss*loss
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss
    

# MSE
class CustomBERTTrainerMSE(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.MSELoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            loss = loss*loss
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss
  

In [58]:
args_CE = TrainingArguments(
    f"{model_name}-finetuned-CE-{task}",
    evaluation_strategy ='steps',
    eval_steps = 10,
    logging_steps = 10,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=1000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=['tensorbaord'],
)

args_MLSML = TrainingArguments(
    f"{model_name}-finetuned-MLSML-{task}",
    evaluation_strategy ='steps',
    eval_steps = 10,
    logging_steps = 10,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=1000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=['tensorbaord'],
)

args_KLDiv = TrainingArguments(
    f"{model_name}-finetuned-KLDiv-{task}",
    evaluation_strategy ='steps',
    eval_steps = 10,
    logging_steps = 10,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=1000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=['tensorbaord'],
)

args_MSE = TrainingArguments(
    f"{model_name}-finetuned-MSE-{task}",
    evaluation_strategy ='steps',
    eval_steps = 10,
    logging_steps = 10,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=1000,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=['tensorbaord'],
)
model_CE = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model_MLSML = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model_KLDiv = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model_MSE = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

BERTtrainer_CE = CustomBERTTrainerCrossEntropy(
    model_CE,
    args_CE,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_MLSML = CustomBERTTrainerMLSML(
    model_MLSML,
    args_MLSML,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_KLDiv = CustomBERTTrainerKLDiv(
    model_KLDiv,
    args_KLDiv,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_MSE = CustomBERTTrainerMSE(
    model_MSE,
    args_MSE,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

## Training

In [38]:
import gc
torch.cuda.empty_cache()
gc.collect()

37

In [39]:
BERTtrainer_CE.train()
BERTtrainer_CE.model.save_pretrained("/kaggle/working/BERT_CE_save")
BERTtrainer_CE = None
torch.cuda.empty_cache()
gc.collect()

BERTtrainer_MLSML.train()
BERTtrainer_MLSML.model.save_pretrained("/kaggle/working/BERT_MLSML_save")
BERTtrainer_MLSML = None
torch.cuda.empty_cache()
gc.collect()

BERTtrainer_KLDiv.train()
BERTtrainer_KLDiv.model.save_pretrained("/kaggle/working/BERT_KLDiv_save")
BERTtrainer_KLDiv = None
torch.cuda.empty_cache()
gc.collect()

BERTtrainer_MSE.train()
BERTtrainer_MSE.model.save_pretrained("/kaggle/working/BERT_MSE_save")
BERTtrainer_MSE = None
torch.cuda.empty_cache()
gc.collect()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.155046,0.904087,0.906327,0.905206,0.898577
100,0.231300,0.110292,0.919907,0.910788,0.915325,0.911274
150,0.231300,0.105445,0.925108,0.92037,0.922733,0.918923
200,0.060900,0.105013,0.92744,0.918553,0.922975,0.918311
250,0.060900,0.107633,0.927849,0.92417,0.926006,0.921371
300,0.034200,0.105874,0.929143,0.9207,0.924902,0.919535


              precision    recall  f1-score   support

         B-O       0.95      0.95      0.95      5197
        B-AC       0.73      0.84      0.78       563
        B-LF       0.80      0.36      0.50       290
        I-LF       0.68      0.77      0.72       487

    accuracy                           0.90      6537
   macro avg       0.79      0.73      0.74      6537
weighted avg       0.90      0.90      0.90      6537

              precision    recall  f1-score   support

         B-O       0.96      0.94      0.95      5197
        B-AC       0.78      0.81      0.79       563
        B-LF       0.66      0.65      0.65       290
        I-LF       0.73      0.84      0.78       487

    accuracy                           0.91      6537
   macro avg       0.78      0.81      0.80      6537
weighted avg       0.91      0.91      0.91      6537

              precision    recall  f1-score   support

         B-O       0.96      0.95      0.95      5197
        B-AC       0.

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.162805,0.897042,0.896745,0.896894,0.884351
100,0.261100,0.127736,0.916723,0.903849,0.91024,0.906226
150,0.261100,0.111125,0.924225,0.920866,0.922542,0.918617
200,0.067400,0.105358,0.927163,0.916901,0.922003,0.91724
250,0.067400,0.112019,0.926676,0.922848,0.924758,0.919688
300,0.037100,0.111976,0.928512,0.922683,0.925588,0.920606


              precision    recall  f1-score   support

         B-O       0.94      0.94      0.94      5197
        B-AC       0.70      0.81      0.75       563
        B-LF       0.71      0.20      0.32       290
        I-LF       0.60      0.77      0.68       487

    accuracy                           0.88      6537
   macro avg       0.74      0.68      0.67      6537
weighted avg       0.89      0.88      0.88      6537

              precision    recall  f1-score   support

         B-O       0.96      0.93      0.95      5197
        B-AC       0.75      0.81      0.78       563
        B-LF       0.68      0.66      0.67       290
        I-LF       0.69      0.87      0.77       487

    accuracy                           0.91      6537
   macro avg       0.77      0.82      0.79      6537
weighted avg       0.91      0.91      0.91      6537

              precision    recall  f1-score   support

         B-O       0.96      0.95      0.95      5197
        B-AC       0.

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.166789,0.897635,0.896745,0.89719,0.886798
100,0.259500,0.118887,0.91946,0.910953,0.915187,0.911121
150,0.259500,0.103687,0.923852,0.924005,0.923928,0.920606
200,0.065600,0.102177,0.930271,0.923509,0.926878,0.921371
250,0.065600,0.101526,0.92882,0.924831,0.926821,0.922136
300,0.036300,0.104929,0.93009,0.925326,0.927702,0.9229


              precision    recall  f1-score   support

         B-O       0.96      0.93      0.94      5197
        B-AC       0.67      0.90      0.77       563
        B-LF       0.68      0.24      0.35       290
        I-LF       0.62      0.79      0.70       487

    accuracy                           0.89      6537
   macro avg       0.73      0.71      0.69      6537
weighted avg       0.89      0.89      0.88      6537

              precision    recall  f1-score   support

         B-O       0.97      0.94      0.95      5197
        B-AC       0.76      0.85      0.80       563
        B-LF       0.68      0.67      0.68       290
        I-LF       0.71      0.84      0.77       487

    accuracy                           0.91      6537
   macro avg       0.78      0.82      0.80      6537
weighted avg       0.92      0.91      0.91      6537

              precision    recall  f1-score   support

         B-O       0.96      0.95      0.96      5197
        B-AC       0.

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
50,No log,0.166362,0.895016,0.892946,0.893979,0.884504
100,0.266700,0.123942,0.917902,0.910623,0.914248,0.909745
150,0.266700,0.113495,0.925729,0.922518,0.924121,0.919994
200,0.067800,0.1047,0.929982,0.925987,0.92798,0.922594
250,0.067800,0.106556,0.929743,0.926978,0.928359,0.922136
300,0.038900,0.11179,0.932493,0.928796,0.930641,0.924583


              precision    recall  f1-score   support

         B-O       0.95      0.93      0.94      5197
        B-AC       0.66      0.88      0.76       563
        B-LF       0.68      0.22      0.34       290
        I-LF       0.61      0.78      0.68       487

    accuracy                           0.88      6537
   macro avg       0.73      0.70      0.68      6537
weighted avg       0.89      0.88      0.88      6537

              precision    recall  f1-score   support

         B-O       0.96      0.94      0.95      5197
        B-AC       0.77      0.84      0.80       563
        B-LF       0.67      0.64      0.66       290
        I-LF       0.71      0.84      0.77       487

    accuracy                           0.91      6537
   macro avg       0.78      0.82      0.79      6537
weighted avg       0.91      0.91      0.91      6537

              precision    recall  f1-score   support

         B-O       0.96      0.95      0.96      5197
        B-AC       0.

196

In [40]:
model_CE = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_CE_save", num_labels=len(label_list))
model_MLSML = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_MLSML_save", num_labels=len(label_list))
model_KLDiv = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_KLDiv_save", num_labels=len(label_list))
model_MSE = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_MSE_save", num_labels=len(label_list))

BERTtrainer_CE = CustomBERTTrainerCrossEntropy(
    model_CE,
    args_CE,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_MLSML = CustomBERTTrainerMLSML(
    model_MLSML,
    args_MLSML,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_KLDiv = CustomBERTTrainerKLDiv(
    model_KLDiv,
    args_KLDiv,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_MSE = CustomBERTTrainerMSE(
    model_MSE,
    args_MSE,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Evaluation

In [41]:
CE_eval = BERTtrainer_CE.evaluate()
print(CE_eval)
MLSML_eval = BERTtrainer_MLSML.evaluate()
print(MLSML_eval)
KLDiv_eval = BERTtrainer_KLDiv.evaluate()
print(KLDiv_eval)
MSE_eval = BERTtrainer_MSE.evaluate()
print(MSE_eval)

              precision    recall  f1-score   support

         B-O       0.96      0.95      0.96      5197
        B-AC       0.81      0.83      0.82       563
        B-LF       0.72      0.74      0.73       290
        I-LF       0.77      0.86      0.81       487

    accuracy                           0.92      6537
   macro avg       0.81      0.85      0.83      6537
weighted avg       0.92      0.92      0.92      6537

{'eval_loss': 0.10738085955381393, 'eval_precision': 0.9305578684429642, 'eval_recall': 0.9231785891293574, 'eval_f1': 0.9268535412174491, 'eval_accuracy': 0.9215236346948141, 'eval_runtime': 1.7711, 'eval_samples_per_second': 71.143, 'eval_steps_per_second': 71.143}


              precision    recall  f1-score   support

         B-O       0.96      0.95      0.95      5197
        B-AC       0.79      0.83      0.81       563
        B-LF       0.73      0.74      0.74       290
        I-LF       0.76      0.86      0.81       487

    accuracy                           0.92      6537
   macro avg       0.81      0.84      0.83      6537
weighted avg       0.92      0.92      0.92      6537

{'eval_loss': 0.11228480190038681, 'eval_precision': 0.9287735064070561, 'eval_recall': 0.9220221377829175, 'eval_f1': 0.9253855082075941, 'eval_accuracy': 0.920299831727092, 'eval_runtime': 1.7679, 'eval_samples_per_second': 71.272, 'eval_steps_per_second': 71.272}


              precision    recall  f1-score   support

         B-O       0.97      0.95      0.96      5197
        B-AC       0.78      0.85      0.81       563
        B-LF       0.75      0.78      0.77       290
        I-LF       0.77      0.85      0.81       487

    accuracy                           0.92      6537
   macro avg       0.82      0.85      0.84      6537
weighted avg       0.93      0.92      0.92      6537

{'eval_loss': 0.10572151839733124, 'eval_precision': 0.9312178102674863, 'eval_recall': 0.9259871138278539, 'eval_f1': 0.928595096090126, 'eval_accuracy': 0.9235123145173627, 'eval_runtime': 1.7687, 'eval_samples_per_second': 71.237, 'eval_steps_per_second': 71.237}


              precision    recall  f1-score   support

         B-O       0.97      0.95      0.96      5197
        B-AC       0.81      0.86      0.83       563
        B-LF       0.73      0.76      0.74       290
        I-LF       0.76      0.85      0.80       487

    accuracy                           0.92      6537
   macro avg       0.82      0.85      0.83      6537
weighted avg       0.93      0.92      0.93      6537

{'eval_loss': 0.1143692210316658, 'eval_precision': 0.9332558525651669, 'eval_recall': 0.9286304311911449, 'eval_f1': 0.9309373964889035, 'eval_accuracy': 0.9248890928560501, 'eval_runtime': 1.769, 'eval_samples_per_second': 71.227, 'eval_steps_per_second': 71.227}


# Experiment 3 (Additional Training Samples from Optional Dataset)

lemmatization with Bag of Words VS Word2Vec

## Collecting Dataset

In [46]:
filtered_dataset = load_dataset("surrey-nlp/PLOD-filtered")

In [47]:
filtered_train = filtered_dataset["train"]
print(f"train size: {len(filtered_train)}")
filtered_val = filtered_dataset["validation"]
print(f"val size: {len(filtered_val)}")
filtered_test = filtered_dataset["test"]
print(f"test size: {len(filtered_test)}")

train size: 112652
val size: 24140
test size: 24140


## Extracting Data to Use
I will have three tests, using three sizes of data acquired from the filtered dataset.

small: 1072 extra samples to double the dataset size  
medium: 10000 extra samples  
large: 50000 extra samples

In [48]:
def decode_tags(tag_sequences, possible_tags):
    """
    Decodes a sequence of numerical tags into a list of corresponding textual labels.

    Args:
        tag_sequence: A list of integers representing numerical tags.
        possible_tags: A list of strings representing the possible textual labels.

    Returns:
        A list of strings representing the decoded textual tags.
    """

    decoded_tags = [[possible_tags[tag] for tag in row] for row in tag_sequences]
    return decoded_tags


def build_dataset(filtered_set, cw_set, num_of_samples):
    """
    Merges a specified number of rows from a larger list to a smaller list, ensuring no duplicates.

    Args:
        filtered_set: a split of the filtered dataset
        cw_set: a split of the cw dataset
        num_of_samples: The number of rows to add from the filtered set.

    Returns:
        new tokens, pos_tags and ner_tags lists
    """
    # set up the initial lists
    tokens = cw_set["tokens"]
    pos_tags = cw_set["pos_tags"]
    ner_tags = cw_set["ner_tags"]
     
    # set up the filtered lists
    # tokens
    filtered_tokens = filtered_set["tokens"]
    # pos_tags
    filtered_label_list = filtered_set.features[f"pos_tags"].feature.names
    filtered_pos_tags = decode_tags(filtered_set["pos_tags"], filtered_label_list)
    # ner_tags
    filtered_label_list = filtered_set.features[f"ner_tags"].feature.names
    filtered_ner_tags = decode_tags(filtered_set["ner_tags"], filtered_label_list)

    # convert the tokens list to sets for efficient duplicate checking
    tokens_set = set(tuple(row) for row in tokens)
    filtered_tokens_set = set(tuple(row) for row in filtered_tokens)

    # find rows to add
    rows_to_add = []
    for index, row in enumerate(filtered_tokens_set):
        if tuple(row) not in tokens_set and len(rows_to_add) < num_of_samples:
            rows_to_add.append(index)

    # Merge and return the lists
    tokens = tokens + [filtered_tokens[i] for i in rows_to_add]
    pos_tags = pos_tags + [filtered_pos_tags[i] for i in rows_to_add]
    ner_tags = ner_tags + [filtered_ner_tags[i] for i in rows_to_add]

    return tokens, pos_tags, ner_tags




In [49]:
small = 1072
medium = 10000
large = 50000
tokens_small, pos_tags_small, ner_tags_small = build_dataset(filtered_train, train, small)
tokens_small = pre_process_data(tokens_small, pos_tags_small)
tokens_medium, pos_tags_medium, ner_tags_medium = build_dataset(filtered_train, train, medium)
tokens_medium = pre_process_data(tokens_medium, pos_tags_medium)
tokens_large, pos_tags_large, ner_tags_large = build_dataset(filtered_train, train, large)
tokens_large = pre_process_data(tokens_large, pos_tags_large)
print(f"num of small samples: {len(tokens_small)}\nnum of medium samples: {len(tokens_medium)}\nnum of large samples: {len(tokens_large)}")

num of small samples: 2144
num of medium samples: 11072
num of large samples: 51072


## Training

### HMM Training

In [None]:
# create the character sets
char_set_small = get_char_set(tokens_small)
char_set_medium = get_char_set(tokens_medium)
char_set_large = get_char_set(tokens_large)

# create trainers
#small
trainer_small = nltk.tag.hmm.HiddenMarkovModelTrainer(states=label_list, symbols=char_set_small)
data_small = combine_lists_elementwise(tokens_small.copy(), ner_tags_small.copy())
# medium
trainer_medium = nltk.tag.hmm.HiddenMarkovModelTrainer(states=label_list, symbols=char_set_medium)
data_medium = combine_lists_elementwise(tokens_medium.copy(), ner_tags_medium.copy())
# large
trainer_large = nltk.tag.hmm.HiddenMarkovModelTrainer(states=label_list, symbols=char_set_large)
data_large = combine_lists_elementwise(tokens_large.copy(), ner_tags_large.copy())

In [None]:
model_small = trainer_small.train_supervised(data_small)
model_medium = trainer_medium.train_supervised(data_medium)
model_large = trainer_large.train_supervised(data_large)

In [None]:
save_hmm(model_small, "hmm_model_small.dill")
save_hmm(model_medium, "hmm_model_medium.dill")
save_hmm(model_large, "hmm_model_large.dill")

In [None]:
model_small = load_hmm("hmm_model_small.dill")
model_medium = load_hmm("hmm_model_medium.dill")
model_large = load_hmm("hmm_model_large.dill")

### BERT Training

In [50]:
# create 3 datasets
from datasets import DatasetDict, Dataset

small_datasets_dict = {
    "train": Dataset.from_dict({"tokens": tokens_small, "pos_tags": pos_tags_small, "ner_tags": ner_tags_small})
}
medium_datasets_dict = {
    "train": Dataset.from_dict({"tokens": tokens_medium, "pos_tags": pos_tags_medium, "ner_tags": ner_tags_medium})
}
large_datasets_dict = {
    "train": Dataset.from_dict({"tokens": tokens_large, "pos_tags": pos_tags_large, "ner_tags": ner_tags_large})
}

small_dataset = DatasetDict(small_datasets_dict)
medium_dataset = DatasetDict(medium_datasets_dict)
large_dataset = DatasetDict(large_datasets_dict)

In [51]:
small_tokenized_dataset = small_dataset.map(tokenize_and_align_labels, batched=True)
medium_tokenized_dataset = medium_dataset.map(tokenize_and_align_labels, batched=True)
large_tokenized_dataset = large_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/2144 [00:00<?, ? examples/s]

Map:   0%|          | 0/11072 [00:00<?, ? examples/s]

Map:   0%|          | 0/51072 [00:00<?, ? examples/s]

In [52]:
import tensorboard
args_small = TrainingArguments(
    f"{model_name}-finetuned-small-{task}",
    evaluation_strategy ='steps',
    eval_steps = 20,
    logging_steps = 20,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)
args_medium = TrainingArguments(
    f"{model_name}-finetuned-medium-{task}",
    evaluation_strategy ='steps',
    eval_steps = 50,
    logging_steps = 50,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)
args_large = TrainingArguments(
    f"{model_name}-finetuned-large-{task}",
    evaluation_strategy ='steps',
    eval_steps = 300,
    logging_steps = 300,
    save_total_limit = 1,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

model_small = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model_medium = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
model_large = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))


BERTtrainer_small = Trainer(
    model_small,
    args_small,
    train_dataset=small_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_medium = Trainer(
    model_medium,
    args_medium,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_large = Trainer(
    model_large,
    args_large,
    train_dataset=large_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: 

In [None]:
# BERTtrainer_small = None
# BERTtrainer_medium = None
# BERTtrainer_large = None
# BERTtrainer = None
torch.cuda.empty_cache()
gc.collect()

In [None]:
BERTtrainer_small.train()
BERTtrainer_small.model.save_pretrained("/kaggle/working/BERT_small_save")
BERTtrainer_small = None
torch.cuda.empty_cache()
gc.collect()

BERTtrainer_medium.train()
BERTtrainer_medium.model.save_pretrained("/kaggle/working/BERT_medium_save")
BERTtrainer_medium = None
torch.cuda.empty_cache()
gc.collect()

gc.collect()
torch.cuda.empty_cache()
BERTtrainer_large.train()
BERTtrainer_large.model.save_pretrained("/kaggle/working/BERT_large_save")
BERTtrainer_large = None
torch.cuda.empty_cache()
gc.collect()

## evaluation

### HMM Evaluation

In [40]:
model_small = load_hmm("hmm_model_small.dill")
model_medium = load_hmm("hmm_model_medium.dill")
model_large = load_hmm("hmm_model_large.dill")

Model loaded successfully!
Model loaded successfully!
Model loaded successfully!


In [41]:
predicted_small = evaluate_hmm(model_small, test_sentences)
predicted_medium = evaluate_hmm(model_medium, test_sentences)
predicted_large = evaluate_hmm(model_large, test_sentences)

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  P[i] = self._priors.logprob(si)
  O[i, k] = self._output_logprob(si, self._symbols[k])


In [42]:
predicted_small = [item for sublist in predicted_small for item in sublist]
predicted_medium = [item for sublist in predicted_medium for item in sublist]
predicted_large = [item for sublist in predicted_large for item in sublist]

print(len(correct_tags))
print(len(predicted_small))
print(len(predicted_medium))
print(len(predicted_large))


5000
5000
5000
5000


In [43]:
print(classification_report(correct_tags, predicted_small, labels = label_list))
print(classification_report(correct_tags, predicted_medium, labels = label_list))
print(classification_report(correct_tags, predicted_large, labels = label_list))

              precision    recall  f1-score   support

         B-O       0.87      0.98      0.92      4261
        B-AC       0.84      0.18      0.29       263
        B-LF       0.51      0.15      0.24       149
        I-LF       0.58      0.16      0.25       327

    accuracy                           0.86      5000
   macro avg       0.70      0.37      0.43      5000
weighted avg       0.84      0.86      0.83      5000

              precision    recall  f1-score   support

         B-O       0.90      0.97      0.94      4261
        B-AC       0.86      0.37      0.51       263
        B-LF       0.57      0.32      0.41       149
        I-LF       0.65      0.39      0.49       327

    accuracy                           0.88      5000
   macro avg       0.74      0.51      0.58      5000
weighted avg       0.87      0.88      0.87      5000

              precision    recall  f1-score   support

         B-O       0.94      0.97      0.96      4261
        B-AC       0.

In [None]:
predicted = evaluate_hmm(model, test_sentences)
predicted = [item for sublist in predicted for item in sublist]
print(classification_report(correct_tags, predicted, labels = label_list))

### BERT Evaluation

In [56]:
model_small = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_small_save", num_labels=len(label_list))
model_medium = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_medium_save", num_labels=len(label_list))
model_large = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_large_save", num_labels=len(label_list))


BERTtrainer_small = Trainer(
    model_small,
    args_small,
    train_dataset=small_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_medium = Trainer(
    model_medium,
    args_medium,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_large = Trainer(
    model_large,
    args_large,
    train_dataset=large_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# BERTtrainer_small.model.from_pretrained("model_saves/BERT_small_save")
# BERTtrainer_medium.model.from_pretrained("model_saves/BERT_medium_save")
# BERTtrainer_large.model.from_pretrained("model_saves/BERT_large_save")

small_metrics = BERTtrainer_small.evaluate()
print(small_metrics)
medium_metrics = BERTtrainer_medium.evaluate()
print(medium_metrics)
large_metrics = BERTtrainer_large.evaluate()
print(large_metrics)

  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.96      0.94      0.95      5197
        B-AC       0.74      0.82      0.78       563
        B-LF       0.66      0.66      0.66       290
        I-LF       0.72      0.83      0.77       487

    accuracy                           0.91      6537
   macro avg       0.77      0.81      0.79      6537
weighted avg       0.91      0.91      0.91      6537

{'eval_loss': 0.26455628871917725, 'eval_precision': 0.9140572951365756, 'eval_recall': 0.906657855608789, 'eval_f1': 0.9103425396035498, 'eval_accuracy': 0.9062260975982867, 'eval_runtime': 2.2217, 'eval_samples_per_second': 56.715, 'eval_steps_per_second': 56.715}


  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.97      0.95      0.96      5197
        B-AC       0.82      0.87      0.84       563
        B-LF       0.76      0.83      0.79       290
        I-LF       0.78      0.89      0.83       487

    accuracy                           0.93      6537
   macro avg       0.83      0.88      0.86      6537
weighted avg       0.94      0.93      0.93      6537

{'eval_loss': 0.1811598688364029, 'eval_precision': 0.9424904150691782, 'eval_recall': 0.9340822732529325, 'eval_f1': 0.9382675074676402, 'eval_accuracy': 0.933149762888175, 'eval_runtime': 2.1692, 'eval_samples_per_second': 58.087, 'eval_steps_per_second': 58.087}


  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.98      0.96      0.97      5197
        B-AC       0.88      0.90      0.89       563
        B-LF       0.78      0.87      0.82       290
        I-LF       0.80      0.92      0.86       487

    accuracy                           0.94      6537
   macro avg       0.86      0.91      0.88      6537
weighted avg       0.95      0.94      0.95      6537

{'eval_loss': 0.1521802544593811, 'eval_precision': 0.9544924154025671, 'eval_recall': 0.9459772013877417, 'eval_f1': 0.9502157318287422, 'eval_accuracy': 0.9444699403396053, 'eval_runtime': 2.2088, 'eval_samples_per_second': 57.044, 'eval_steps_per_second': 57.044}


# Experiment 4 (Hyperparameters)

## Arguments For Each Test

In [59]:
args_one = TrainingArguments(
    f"{model_name}-finetuned-one-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_two = TrainingArguments(
    f"{model_name}-finetuned-two-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_three = TrainingArguments(
    f"{model_name}-finetuned-three-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.0001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_four = TrainingArguments(
    f"{model_name}-finetuned-four-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)


args_five = TrainingArguments(
    f"{model_name}-finetuned-five-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.000001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_six = TrainingArguments(
    f"{model_name}-finetuned-six-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_seven = TrainingArguments(
    f"{model_name}-finetuned-seven-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.5,
    adam_beta2=0.5,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_eight = TrainingArguments(
    f"{model_name}-finetuned-eight-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.2,
    adam_beta2=0.2,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

args_nine = TrainingArguments(
    f"{model_name}-finetuned-nine-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.1,
    adam_beta2=0.9,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)


args_ten = TrainingArguments(
    f"{model_name}-finetuned-ten-{task}",
    evaluation_strategy ='steps',
    eval_steps = 100,
    logging_steps = 100,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.9,
    adam_beta2=0.1,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

In [None]:
model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

## Training

In [None]:
import gc
BERTtrainer_ex_4 = None
torch.cuda.empty_cache()
gc.collect()

In [None]:
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_one,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_one_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_two,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_two_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_four,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_four_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()


# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_five,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_five_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_seven,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_seven_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_eight,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_eight_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

# model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
# BERTtrainer_ex_4 = Trainer(
#     model_ex_4,
#     args_nine,
#     train_dataset=medium_tokenized_dataset["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )

# BERTtrainer_ex_4.train()
# BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_nine_save")
# BERTtrainer_ex_4 = None
# torch.cuda.empty_cache()
# gc.collect()

model_ex_4 = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
BERTtrainer_ex_4 = Trainer(
    model_ex_4,
    args_ten,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4.train()
BERTtrainer_ex_4.model.save_pretrained("/kaggle/working/BERT_ex4_ten_save")
BERTtrainer_ex_4 = None
torch.cuda.empty_cache()
gc.collect()

## Evaluation

In [60]:
model_ex_4_one = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_one_save", num_labels=len(label_list))
model_ex_4_two = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_two_save", num_labels=len(label_list))
model_ex_4_three = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_three_save", num_labels=len(label_list))
model_ex_4_four = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_four_save", num_labels=len(label_list))
model_ex_4_five = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_five_save", num_labels=len(label_list))
model_ex_4_six = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_six_save", num_labels=len(label_list))
model_ex_4_seven = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_seven_save", num_labels=len(label_list))
model_ex_4_eight = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_eight_save", num_labels=len(label_list))
model_ex_4_nine = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_nine_save", num_labels=len(label_list))
model_ex_4_ten = AutoModelForTokenClassification.from_pretrained("model_saves/BERT_ex4_ten_save", num_labels=len(label_list))

BERTtrainer_ex_4_one = Trainer(
    model_ex_4_one,
    args_one,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_two = Trainer(
    model_ex_4_two,
    args_two,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_three = Trainer(
    model_ex_4_three,
    args_three,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_four = Trainer(
    model_ex_4_four,
    args_four,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_five = Trainer(
    model_ex_4_five,
    args_five,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_six = Trainer(
    model_ex_4_six,
    args_six,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_seven = Trainer(
    model_ex_4_seven,
    args_seven,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_eight = Trainer(
    model_ex_4_eight,
    args_eight,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_nine = Trainer(
    model_ex_4_nine,
    args_nine,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

BERTtrainer_ex_4_ten = Trainer(
    model_ex_4_ten,
    args_ten,
    train_dataset=medium_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [61]:
metrics = BERTtrainer_ex_4_one.evaluate()
print(metrics)



  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.80      1.00      0.89      5197
        B-AC       0.00      0.00      0.00       563
        B-LF       0.00      0.00      0.00       290
        I-LF       0.00      0.00      0.00       487

    accuracy                           0.80      6537
   macro avg       0.20      0.25      0.22      6537
weighted avg       0.63      0.80      0.70      6537

{'eval_loss': 0.7786154747009277, 'eval_precision': 0.7950130029065321, 'eval_recall': 0.8585825210639353, 'eval_f1': 0.8255758538522637, 'eval_accuracy': 0.7950130029065321, 'eval_runtime': 2.815, 'eval_samples_per_second': 44.76, 'eval_steps_per_second': 44.76}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [62]:
metrics = BERTtrainer_ex_4_two.evaluate()
print(metrics)



  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.80      1.00      0.89      5197
        B-AC       0.00      0.00      0.00       563
        B-LF       0.00      0.00      0.00       290
        I-LF       0.00      0.00      0.00       487

    accuracy                           0.80      6537
   macro avg       0.20      0.25      0.22      6537
weighted avg       0.63      0.80      0.70      6537

{'eval_loss': 0.7777854204177856, 'eval_precision': 0.7950130029065321, 'eval_recall': 0.8585825210639353, 'eval_f1': 0.8255758538522637, 'eval_accuracy': 0.7950130029065321, 'eval_runtime': 2.4648, 'eval_samples_per_second': 51.12, 'eval_steps_per_second': 51.12}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [63]:
metrics = BERTtrainer_ex_4_three.evaluate()
print(metrics)



  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.97      0.96      0.96      5197
        B-AC       0.88      0.87      0.87       563
        B-LF       0.76      0.82      0.79       290
        I-LF       0.79      0.91      0.84       487

    accuracy                           0.94      6537
   macro avg       0.85      0.89      0.87      6537
weighted avg       0.94      0.94      0.94      6537

{'eval_loss': 0.16653192043304443, 'eval_precision': 0.9499666444296198, 'eval_recall': 0.9410209813315711, 'eval_f1': 0.9454726533322267, 'eval_accuracy': 0.9394217530977512, 'eval_runtime': 4.015, 'eval_samples_per_second': 31.382, 'eval_steps_per_second': 31.382}


In [64]:
metrics = BERTtrainer_ex_4_four.evaluate()
print(metrics)



  0%|          | 0/126 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         B-O       0.97      0.95      0.96      5197
        B-AC       0.82      0.87      0.84       563
        B-LF       0.73      0.80      0.76       290
        I-LF       0.78      0.90      0.83       487

    accuracy                           0.93      6537
   macro avg       0.83      0.88      0.85      6537
weighted avg       0.93      0.93      0.93      6537

{'eval_loss': 0.18823082745075226, 'eval_precision': 0.9395, 'eval_recall': 0.9312737485544358, 'eval_f1': 0.9353687878536463, 'eval_accuracy': 0.9308551323236959, 'eval_runtime': 8.9061, 'eval_samples_per_second': 14.148, 'eval_steps_per_second': 14.148}


In [None]:
metrics = BERTtrainer_ex_4_five.evaluate()
print(metrics)



In [None]:
metrics = BERTtrainer_ex_4_six.evaluate()
print(metrics)



In [None]:
metrics = BERTtrainer_ex_4_seven.evaluate()
print(metrics)



In [None]:
metrics = BERTtrainer_ex_4_eight.evaluate()
print(metrics)



In [None]:
metrics = BERTtrainer_ex_4_nine.evaluate()
print(metrics)



In [None]:
metrics = BERTtrainer_ex_4_ten.evaluate()
print(metrics)

# Final implementation

In [65]:
# KLDivLoss
class CustomBERTTrainerKLDiv(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = nn.KLDivLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            loss = loss*loss
            loss = loss.mean()

        return (loss, outputs) if return_outputs else loss

args_final = TrainingArguments(
    f"{model_name}-finetuned-final-{task}",
    evaluation_strategy ='steps',
    eval_steps = 1000,
    logging_steps = 1000,
    save_total_limit = 1,
    learning_rate=0.00001,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.001,
    save_steps=0,
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    report_to=["tensorboard"],
)

model_final = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

BERTtrainer_final = CustomBERTTrainerKLDiv(
    model_final,
    args_final,
    train_dataset=large_tokenized_dataset["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
import gc
BERTtrainer_final = None
torch.cuda.empty_cache()
gc.collect()

45

In [66]:
BERTtrainer_final.train()
#BERTtrainer_final.model.save_pretrained("/kaggle/working/BERT_final_save")

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1000,0.103,0.070246,0.939409,0.929787,0.934573,0.930702
2000,0.0528,0.054938,0.944638,0.941517,0.943075,0.938504
3000,0.0486,0.065105,0.940351,0.929787,0.935039,0.931161
4000,0.043,0.058449,0.952127,0.943003,0.947543,0.942787
5000,0.039,0.051703,0.950708,0.943169,0.946923,0.942022
6000,0.0387,0.05233,0.951147,0.945647,0.948389,0.942175


              precision    recall  f1-score   support

         B-O       0.98      0.95      0.96      5197
        B-AC       0.83      0.87      0.85       563
        B-LF       0.70      0.83      0.76       290
        I-LF       0.77      0.90      0.83       487

    accuracy                           0.93      6537
   macro avg       0.82      0.89      0.85      6537
weighted avg       0.94      0.93      0.93      6537

              precision    recall  f1-score   support

         B-O       0.97      0.96      0.97      5197
        B-AC       0.89      0.83      0.86       563
        B-LF       0.78      0.81      0.80       290
        I-LF       0.80      0.87      0.83       487

    accuracy                           0.94      6537
   macro avg       0.86      0.87      0.86      6537
weighted avg       0.94      0.94      0.94      6537

              precision    recall  f1-score   support

         B-O       0.98      0.94      0.96      5197
        B-AC       0.

TrainOutput(global_step=6384, training_loss=0.0533039795426199, metrics={'train_runtime': 876.0279, 'train_samples_per_second': 58.3, 'train_steps_per_second': 7.287, 'total_flos': 3593319985147392.0, 'train_loss': 0.0533039795426199, 'epoch': 1.0})

In [68]:
BERTtrainer_final.model.save_pretrained("/kaggle/working/BERT_final_save")

In [67]:
BERTtrainer_final.evaluate()

              precision    recall  f1-score   support

         B-O       0.98      0.96      0.97      5197
        B-AC       0.88      0.90      0.89       563
        B-LF       0.77      0.87      0.82       290
        I-LF       0.80      0.90      0.85       487

    accuracy                           0.94      6537
   macro avg       0.86      0.91      0.88      6537
weighted avg       0.95      0.94      0.94      6537



{'eval_loss': 0.053110260516405106,
 'eval_precision': 0.9521276595744681,
 'eval_recall': 0.946307616058153,
 'eval_f1': 0.9492087165465242,
 'eval_accuracy': 0.9432461373718831,
 'eval_runtime': 1.886,
 'eval_samples_per_second': 66.808,
 'eval_steps_per_second': 66.808,
 'epoch': 1.0}