## Requirements

In [None]:
# For data processing
!pip install numpy pandas tqdm

# For training the model
!pip install torch

# For the loading datasets and PLMs
!pip install huggingface datasets transformers

# For evaluating the model
!pip install seqeval scikit-learn

Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)

## Specify Model and Data

Other datasets may require changes to the `ft` variable, which defines the feature metadata (column names and types) of the dataset.

In [None]:
import os

model_name = "kbir" # Model saving name (used in output file names)
model_ckpt = "bloomberg/KBIR" # Hugging Face model repository
dataset_name = "semeval2010" # Dataset saving name (used in output file names)
data_ckpt = "midas/semeval2010" # Hugging Face dataset repository

# Create the data folder if it does not exist yet.
# os.makedirs also creates missing parent folders (os.mkdir fails when the
# parent "data" folder is absent), and exist_ok=True makes this cell safe to
# re-run without a separate existence check.
os.makedirs(rf".\data\{dataset_name}", exist_ok=True)

## Load Data

In [None]:
import os
from datasets import ClassLabel, Features, Sequence, Value, load_dataset, load_from_disk

# Tag vocabulary for the token-level labels (B, I, O).
# The same order is used by the index2tag mapping of the evaluation function
# (0 -> "B", 1 -> "I", 2 -> "O"), so keep both in sync when changing it.
class_names = ["B", "I", "O"]

# Schema the loaded dataset is cast to; adapt it when using another dataset.
ft = Features(
    {
        "id": Value(dtype="string"), # document identifier, stored as a string
        "document": Sequence(feature=Value(dtype="string", id=None)), # the document as a sequence of strings (one entry per word)
        "doc_bio_tags": Sequence(ClassLabel(names=class_names)), # one class label from class_names per entry of 'document'
    }
)

# Download (or reuse the cached copy of) the "extraction" configuration of the
# dataset, applying the schema above and using 4 worker processes.
# The result is bound to `data`; the recorded output shows a train and a test split.
data = load_dataset(
    data_ckpt, "extraction", features=ft, num_proc=4
)

Downloading builder script:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/38.1k [00:00<?, ?B/s]

Downloading and preparing dataset semeval2010/extraction to /root/.cache/huggingface/datasets/midas___semeval2010/extraction-e27051d91c65c33f/0.0.1/557ceb28d01a0c1a7d1f26fbfd46e2c6285979264b3880b40d2b2234dac51443...




  

Downloading data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

  

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]



Generating train split: 0 examples [00:00, ? examples/s]



Generating test split: 0 examples [00:00, ? examples/s]

Dataset semeval2010 downloaded and prepared to /root/.cache/huggingface/datasets/midas___semeval2010/extraction-e27051d91c65c33f/0.0.1/557ceb28d01a0c1a7d1f26fbfd46e2c6285979264b3880b40d2b2234dac51443. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Tokenization

In [None]:
import torch
from datasets import load_from_disk
from tokenizers import AddedToken
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer that belongs to the pretrained checkpoint `model_ckpt`.
# add_prefix_space=True is passed because the documents are later tokenized
# as pre-split word lists (is_split_into_words=True).
# NOTE(review): byte-level BPE tokenizers such as RoBERTa's presumably require
# this flag for pre-split input - confirm against the tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, add_prefix_space=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and align the BIO tags to tokens.

    Args:
        examples: batch with the keys "document" (one word list per document)
            and "doc_bio_tags" (one tag id per word of the matching document).

    Returns:
        The tokenizer output for the batch, extended with two entries:
        "word_ids" (for every token the index of the word it came from, or
        None for tokens that belong to no word) and "labels" (the word's tag
        on the first token of each word, -100 on every other token).
    """
    # Documents are neither truncated nor padded here; splitting into
    # model-sized chunks happens in a later step.
    encoded = tokenizer(
        examples["document"], truncation=False, is_split_into_words=True, padding=False
    )

    all_labels = []
    all_word_ids = []

    for doc_idx, doc_tags in enumerate(examples["doc_bio_tags"]):
        token_word_ids = encoded.word_ids(batch_index=doc_idx)

        aligned = []
        last_word = None
        for current_word in token_word_ids:
            # Only the first token of a word carries the word's tag; tokens
            # without a word and continuation tokens receive -100.
            starts_new_word = current_word is not None and current_word != last_word
            aligned.append(doc_tags[current_word] if starts_new_word else -100)
            last_word = current_word

        all_labels.append(aligned)
        all_word_ids.append(token_word_ids)

    encoded["word_ids"] = all_word_ids
    encoded["labels"] = all_labels

    return encoded

# Apply tokenize_and_align_labels to the dataset, five documents per batch.
# NOTE: this rebinds `data`, so the un-tokenized dataset is no longer
# reachable under that name after this cell has run.
data = data.map(tokenize_and_align_labels, batched=True, batch_size=5)

# Persist the tokenized dataset; the split/pad step reloads it from this path.
data.save_to_disk(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}"
)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/144 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

## Split and pad Input IDs and Labels to fit PLM

In [None]:
import pickle
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Load the tokenized dataset written by the tokenization step
data = load_from_disk(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}"
)

# Maximum number of input tokens per chunk fed to the model
max_token = 512

# Id used to pad input_ids. It is taken from the checkpoint's tokenizer rather
# than hard-coding 0, because 0 is not the padding id of every vocabulary
# (NOTE(review): for RoBERTa-style vocabularies 0 is presumably the <s> token
# and the pad id is 1 - confirm against the tokenizer). Falls back to the
# previous value 0 when the tokenizer defines no padding token.
pad_token_id = AutoTokenizer.from_pretrained(model_ckpt).pad_token_id
if pad_token_id is None:
    pad_token_id = 0


def split_and_pad(sequence, chunk_size, pad_value):
    """Cut `sequence` into consecutive chunks of `chunk_size` items.

    The last chunk is filled up with `pad_value` so that every returned chunk
    has exactly `chunk_size` items. An empty sequence yields an empty list.
    """
    chunks = [
        sequence[start : start + chunk_size]
        for start in range(0, len(sequence), chunk_size)
    ]
    if chunks:
        chunks[-1] += [pad_value] * (chunk_size - len(chunks[-1]))
    return chunks


# Result containers, keyed by split name:
#   input_ids / labels                   - full-length sequences per document
#   input_ids_splitted / labels_splitted - fixed-size chunks of those sequences
#   sentence_ids                         - for every chunk, the index of the
#                                          document (within its split) it came from
input_ids = {split: [] for split in data.keys()}
input_ids_splitted = {split: [] for split in data.keys()}
sentence_ids = {split: [] for split in data.keys()}
labels = {split: [] for split in data.keys()}
labels_splitted = {split: [] for split in data.keys()}

# Iterate over the splits in the data
for split in data.keys():
    input_ids[split] = data[split]["input_ids"]
    labels[split] = data[split]["labels"]

    # Iterate over each document in the split
    for idx, sample in tqdm(enumerate(data[split]), total=len(data[split])):
        # Input ids: split to fit the PLM, pad the last chunk with the pad id
        id_chunks = split_and_pad(sample["input_ids"], max_token, pad_token_id)
        input_ids_splitted[split] += id_chunks
        sentence_ids[split] += [idx] * len(id_chunks)

        # Labels: split the same way, pad with -100 so that the padded
        # positions are not taken into account in the loss computation
        labels_splitted[split] += split_and_pad(sample["labels"], max_token, -100)


# Save the dictionaries to disk using pickle (one file per dictionary)
for suffix, payload in (
    ("input_ids", input_ids),
    ("input_ids_splitted", input_ids_splitted),
    ("sentence_ids", sentence_ids),
    ("labels", labels),
    ("labels_splitted", labels_splitted),
):
    with open(
        rf".\data\{dataset_name}\{dataset_name}_{model_name}_{suffix}_dictionary.pickle",
        "wb",
    ) as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 144/144 [00:04<00:00, 30.13it/s]
100%|██████████| 100/100 [00:03<00:00, 31.09it/s]


## Train Model

In [None]:
import copy
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from seqeval.metrics import accuracy_score, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AdamW, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput

### Dataset Class

In [None]:
class LongDataset(Dataset):
    """Dataset over fixed-size chunks of long documents.

    Entry i of `input_ids`, `labels` and `sentence_ids` describes the same
    chunk: its token ids, its label ids and the id of the document the chunk
    was cut from.
    """

    def __init__(self, input_ids, labels, sentence_ids):
        # Keep references to the three parallel sequences
        self.input_ids, self.labels, self.sentence_ids = input_ids, labels, sentence_ids

    def __len__(self):
        # One item per chunk (sentence_ids has one entry per chunk)
        return len(self.sentence_ids)

    def __getitem__(self, idx):
        # Token ids and labels are returned as long tensors, the document id
        # is passed through unchanged.
        return {
            "input_ids": torch.tensor(self.input_ids[idx]).long(),
            "labels": torch.tensor(self.labels[idx]).long(),
            "sentence_ids": self.sentence_ids[idx],
        }

### Model Class

In [None]:
class SequenceTagger(nn.Module):
    """Token classifier: pretrained language model + dropout + linear layer.

    The encoder is loaded from `model_ckpt` and fine-tuned together with the
    classification layer, which predicts one of `num_labels` classes per token.
    """

    # Number of output classes (B, I, O)
    num_labels = 3

    def __init__(self):
        super().__init__()

        # Initialize the model with a pretrained language model
        self.plm = AutoModel.from_pretrained(model_ckpt)

        # Enable gradient computation for the language model parameters
        for param in self.plm.parameters():
            param.requires_grad = True

        # Dropout layer for regularization
        self.dropout = torch.nn.Dropout(0.05)

        # Linear layer for classification. The input width is read from the
        # encoder configuration instead of being hard-coded to 1024, so other
        # checkpoints with a different hidden size work as well.
        self.linear = torch.nn.Linear(self.plm.config.hidden_size, self.num_labels)
        self.init_linear_weights(self.linear)

    def init_linear_weights(self, m):
        """Initialize a linear layer: Xavier-uniform weights, zero bias."""
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0)
        return

    def forward(self, input_ids=None, graph_embeddings=None, labels=None, **kwargs):
        """Compute per-token logits and, if labels are given, the loss.

        Args:
            input_ids: token ids; the encoder output is indexed per token, and
                the logits are flattened to (-1, num_labels) for the loss.
            graph_embeddings: accepted for interface compatibility, not used.
            labels: optional label ids; positions set to -100 are skipped by
                the default ignore_index of nn.CrossEntropyLoss.
            **kwargs: an optional "attention_mask" entry is forwarded to the
                encoder; every other entry is ignored.

        Returns:
            TokenClassifierOutput with `loss` (None without labels) and `logits`.
        """
        # Use the caller's attention mask if one is given; otherwise mask the
        # positions holding the encoder's configured padding id. This only has
        # an effect when the inputs are padded with that id.
        attention_mask = kwargs.get("attention_mask")
        pad_token_id = self.plm.config.pad_token_id
        if attention_mask is None and pad_token_id is not None:
            attention_mask = input_ids.ne(pad_token_id).long()

        # Pass the input_ids through the pretrained language model
        x_plm = self.plm(input_ids, attention_mask=attention_mask)[0]

        # Perform classification using a linear layer
        sequence_output = self.dropout(x_plm)
        logits = self.linear(sequence_output)

        loss = None
        if labels is not None:
            # Compute the loss if labels are provided
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Return the model output as a `TokenClassifierOutput` object
        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=None, attentions=None
        )

### Load Preprocessed Data

In [None]:
import pickle

def _load_pickled_dict(kind):
    """Load the pickled per-split dictionary whose file name contains `kind`."""
    with open(
        rf".\data\{dataset_name}\{dataset_name}_{model_name}_{kind}_dictionary.pickle",
        "rb",
    ) as fh:
        return pickle.load(fh)


# Chunked token ids, the document id of every chunk, and chunked labels
input_ids = _load_pickled_dict("input_ids_splitted")
sentence_ids = _load_pickled_dict("sentence_ids")
labels = _load_pickled_dict("labels_splitted")

# One dataset per split, built from the three parallel per-chunk lists
training_data, test_data = (
    LongDataset(input_ids[split], labels[split], sentence_ids[split])
    for split in ("train", "test")
)

### Test Function

In [None]:
def test_f1(device, model, test_loader):
    # Mapping of tag index to tag label
    index2tag = {0: "B", 1: "I", 2: "O"}

    # Set the model in evaluation mode
    model.eval()

    # Initialize lists to store true and predicted labels for each sample
    total_true = [[] for _ in range(len(test_loader.dataset))]
    total_pred = [[] for _ in range(len(test_loader.dataset))]

    with torch.no_grad():
        # Iterate over the test data loader
        for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
            # Move input tensors and labels to the device
            input_ids = data["input_ids"].to(device)
            true = data["labels"].tolist()
            sentence_id = data["sentence_ids"].tolist()

            # Get the model's predicted tag scores
            tag_scores = model(input_ids=input_ids).logits

            # Convert predicted tag scores to labels
            output = torch.argmax(tag_scores, dim=2).cpu().tolist()

            # Clean true and predicted labels by removing special tokens (-100)
            true_cleaned = [[index2tag[i] for i in j if i != -100] for j in true]
            prediction_cleaned = [
                [index2tag[i] for i, j in zip(k, z) if j != -100]
                for k, z in zip(output, true)
            ]

            # Append true and predicted labels to the respective original sample lists
            for idx, sample_id in enumerate(sentence_id):
                total_true[sample_id] += true_cleaned[idx]
                total_pred[sample_id] += prediction_cleaned[idx]

    # Compute the F1 score using the true and predicted labels
    return f1_score(total_true, total_pred)

### Set Seed Function

In [None]:
def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    Also switches cuDNN to deterministic mode and turns its benchmark mode
    off. Returns None.
    """
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU, current CUDA device, all CUDA devices)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN settings
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

### Training Loop

In [None]:
# Create the output folders.
# os.makedirs(..., exist_ok=True) ignores folders that already exist and also
# creates missing parent folders. Unlike the former bare `except: pass`, any
# other failure (e.g. missing permissions) is raised here, before training
# starts, instead of surfacing only when the results are written.
os.makedirs(r".\results", exist_ok=True)
os.makedirs(rf".\data\{dataset_name}\{model_name}", exist_ok=True)

# Best model saving name
model_save_name = "Semeval-FineKBIR"

In [None]:
# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# List of seeds to iterate over (one full training run per seed)
seed_list = [7, 117, 777]

# Initialize a DataFrame to store the results.
# NOTE: the "val_f1" column is never filled because no validation split is used.
results_df = pd.DataFrame(columns=["Seed", "train_f1", "val_f1", "test_f1"])

# Iterate over the seeds
for i, seed in enumerate(seed_list):

    # Release the models of the previous run first, then clear the CUDA cache
    # (clearing the cache while the old model is still referenced frees nothing).
    model = None
    best_model = None
    torch.cuda.empty_cache()

    # Set Seed
    set_seed(seed)

    # Create a new instance of the SequenceTagger model
    model = SequenceTagger()

    # Move the model to the specified device
    model.to(device)

    # Create data loaders for training and testing
    train_loader = DataLoader(training_data, batch_size=1, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

    # Initialize an AdamW optimizer for training the model
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # Start below every possible score so that the first epoch always yields
    # a best model, even when its F1 is 0. Otherwise `best_model` would be
    # undefined (or left over from the previous seed) when it is saved below.
    best_f1 = float("-inf")

    # Train the model for 10 epochs
    for epoch in range(10):
        model.train()
        for batch in tqdm(train_loader, total=len(train_loader)):
            optimizer.zero_grad()
            # Move input tensors and labels to the device
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # Perform a forward pass through the model and compute the loss.
            # (The former `graph_embeddings=graph_embeddings` argument referred
            # to a name that is defined nowhere and raised a NameError; the
            # model does not use that argument.)
            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Evaluate the model's F1 score on the test set.
        # NOTE: the best epoch is therefore selected on the test set.
        model.eval()
        current_f1 = test_f1(device, model, test_loader)

        # Update the best F1 score and save a copy of the best model
        print(f"Epoch: {epoch+1} - F1: {current_f1}")
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_model = copy.deepcopy(model)

    # Save the best model with the current seed
    torch.save(
        best_model,
        rf".\data\{dataset_name}\{model_name}\{model_save_name}_seed_{seed}",
    )

    # Calculate the F1 score on the training and test sets with the best model
    results_df.loc[i, "Seed"] = seed
    results_df.loc[i, "train_f1"] = test_f1(device, best_model, train_loader)
    results_df.loc[i, "test_f1"] = test_f1(device, best_model, test_loader)

    # Save the results to a CSV file (rewritten after every seed)
    results_df.to_csv(
        rf".\results\{model_save_name}.csv",
        index=False,
    )

Downloading (…)lve/main/config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of the model checkpoint at bloomberg/KBIR were not used when initializing RobertaModel: ['infilling_head.decoder.weight', 'replacement_classification_head.classifier.weight', 'infilling_head.mlp_layer_norm.linear2.weight', 'infilling_head.mlp_layer_norm.linear1.weight', 'replacement_classification_head.bias', 'lm_head.bias', 'lm_head.dense.bias', 'replacement_classification_head.classifier.bias', 'lm_head.decoder.bias', 'infilling_head.mlp_layer_norm.layer_norm1.weight', 'infilling_head.bias', 'infilling_head.mlp_layer_norm.linear2.bias', 'infilling_head.position_embeddings.weight', 'infilling_head.mlp_layer_norm.linear1.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'infilling_head.num_tok_classifier.bias', 'infilling_head.mlp_layer_norm.layer_norm2.bias', 'lm_head.dense.weight', 'infilling_head.num_tok_classifier.weight', 'infilling_head.mlp_layer_norm.layer_norm2.weight', 'infilling_head.mlp_layer_norm.layer_norm1.bias']
- This 

KeyboardInterrupt: ignored