## Requirements

In [None]:
# For data processing
!pip install numpy pandas tqdm

# For training the model
!pip install torch

# For loading datasets and pre-trained language models (PLMs)
!pip install huggingface datasets transformers

# For the graph embeddings
import os
import torch

# Export the installed torch version as the TORCH environment variable so that
# the `${TORCH}` placeholder in the wheel-index URLs below is expanded by the shell.
os.environ["TORCH"] = torch.__version__
print(torch.__version__)

!pip install networkx textacy
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

# For evaluating the model
!pip install seqeval scikit-learn

## Specify Model and Data

In [None]:
import os

model_name = "kbir"  # Model saving name (used in output file names)
model_ckpt = "bloomberg/KBIR"  # Hugging Face model checkpoint
dataset_name = "semeval2010"  # Dataset saving name (used in output file names)
data_ckpt = "midas/semeval2010"  # Hugging Face dataset checkpoint

# Create the dataset folder if it does not exist yet.
# `os.makedirs` also creates any missing intermediate folder (a plain `os.mkdir`
# raises FileNotFoundError when the parent is absent), and `exist_ok=True` makes
# the cell safe to re-run.
os.makedirs(rf".\data\{dataset_name}", exist_ok=True)

## Load Data

The `ft` variable below defines the feature metadata of the dataset; it may need to be adapted when a different dataset is used.


In [None]:
import os
from datasets import ClassLabel, Features, Sequence, Value, load_dataset, load_from_disk

# Define a list of class names
# (the same B / I / O order is assumed later when label ids are mapped back to
# tags for evaluation: 0 -> "B", 1 -> "I", 2 -> "O")
class_names = ["B", "I", "O"]

# Define the features for the dataset
# Other datasets may use different column names; adapt this schema accordingly.
ft = Features(
    {
        "id": Value(dtype="string"), # 'id' feature of type string
        "document": Sequence(feature=Value(dtype="string", id=None)), # 'document' feature as a sequence of strings
        "doc_bio_tags": Sequence(ClassLabel(names=class_names)), # 'doc_bio_tags' feature as a sequence of class labels
    }
)

# Load the "extraction" configuration of the dataset from the specified checkpoint
# with the given features and using 4 processes
data = load_dataset(
    data_ckpt, "extraction", features=ft, num_proc=4
)

Downloading builder script:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/38.1k [00:00<?, ?B/s]

Downloading and preparing dataset semeval2010/extraction to /root/.cache/huggingface/datasets/midas___semeval2010/extraction-e27051d91c65c33f/0.0.1/557ceb28d01a0c1a7d1f26fbfd46e2c6285979264b3880b40d2b2234dac51443...




  

Downloading data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

  

Extracting data files #1:   0%|          | 0/1 [00:00<?, ?obj/s]

Extracting data files #0:   0%|          | 0/1 [00:00<?, ?obj/s]



Generating train split: 0 examples [00:00, ? examples/s]



Generating test split: 0 examples [00:00, ? examples/s]

Dataset semeval2010 downloaded and prepared to /root/.cache/huggingface/datasets/midas___semeval2010/extraction-e27051d91c65c33f/0.0.1/557ceb28d01a0c1a7d1f26fbfd46e2c6285979264b3880b40d2b2234dac51443. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Graph Embedding

### Construct Graph

In [None]:
import os
import random
from copy import copy, deepcopy

import networkx as nx
import numpy as np
import torch
import torch.nn.functional as F
import torch_geometric
import torch_geometric.transforms as T
from sklearn.metrics import accuracy_score, roc_auc_score
from textacy.representations import build_cooccurrence_network
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling
from tqdm import tqdm

In [None]:
class Net(torch.nn.Module):
    """Two-layer GCN encoder with a dot-product link decoder.

    Node ids are first mapped to trainable embeddings, which are then refined
    by two graph convolutions. Edges are scored by the inner product of the
    latent vectors of their two end nodes.

    Args:
        n_nodes: Number of nodes, i.e. number of rows of the embedding table.
        embedding_size: Width of the trainable node embeddings.
        hidden_channels: Output width of the first graph convolution.
        out_channels: Output width of the second graph convolution.
    """

    def __init__(self, n_nodes, embedding_size, hidden_channels, out_channels):
        super().__init__()
        self.n_nodes = n_nodes

        # Trainable lookup table holding one vector per node id
        self.embed = torch.nn.Embedding(n_nodes, embedding_size)

        # Graph convolution stack: embedding -> hidden -> output
        self.conv1 = GCNConv(embedding_size, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index, edge_weight):
        """Return the latent node representations for node ids `x`."""
        node_features = self.embed(x)
        hidden = self.conv1(node_features, edge_index, edge_weight)
        hidden = torch.relu(hidden)
        return self.conv2(hidden, edge_index, edge_weight)

    def decode(self, z, edge_label_index):
        """Score each candidate edge by the inner product of its end nodes.

        `edge_label_index[0]` holds the source node ids and
        `edge_label_index[1]` the target node ids; one raw score (logit) is
        returned per column.
        """
        source, target = edge_label_index[0], edge_label_index[1]
        return torch.sum(z[source] * z[target], dim=-1)

    def decode_all(self, z):
        """Return the index pairs of all node pairs with a positive score.

        The result has one column per pair whose inner product is > 0.
        """
        pair_scores = torch.matmul(z, z.t())
        return torch.nonzero(pair_scores > 0, as_tuple=False).t()

In [None]:
def train(data, model, optimizer, criterion):
    """Run a single link-prediction optimisation step on the whole graph.

    The edges listed in `data.edge_label_index` (labelled by
    `data.edge_label`) are complemented with four times as many sampled
    negative edges, which receive label 0.

    Args:
        data: Graph object providing `x`, `edge_index`, `edge_weight`,
            `edge_label`, `edge_label_index` and `num_nodes`.
        model: Model exposing `encode` and `decode`.
        optimizer: Optimizer over the model parameters.
        criterion: Loss taking (scores, labels).

    Returns:
        The loss tensor of this step.
    """
    model.train()
    optimizer.zero_grad()

    # Latent node representations
    z = model.encode(data.x, data.edge_index, data.edge_weight)

    # Sample 4 negative edges per labelled edge
    labelled_index = data.edge_label_index
    negative_index = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=4 * labelled_index.size(1),
        method="sparse",
    )

    # Candidate edges = labelled edges followed by the sampled negatives
    candidate_index = torch.cat((labelled_index, negative_index), dim=-1)

    # Targets = given labels followed by zeros for the negatives
    negative_labels = data.edge_label.new_zeros(negative_index.size(1))
    targets = torch.cat((data.edge_label, negative_labels), dim=0)

    # Score the candidates, then back-propagate and update the parameters
    scores = model.decode(z, candidate_index).view(-1)
    loss = criterion(scores, targets)
    loss.backward()
    optimizer.step()

    return loss


@torch.no_grad()
def test(data, model):
    """Return the ROC AUC of the model's edge scores on `data`.

    The edges in `data.edge_label_index` are complemented with an equal
    number of sampled negative edges (label 0); the sigmoid of the decoder
    scores is compared against the labels.
    """
    model.eval()

    # Sample as many negative edges as there are labelled edges
    labelled_index = data.edge_label_index
    negative_index = negative_sampling(
        edge_index=data.edge_index,
        num_nodes=data.num_nodes,
        num_neg_samples=labelled_index.size(1),
        method="sparse",
    )

    # Candidate edges and their targets (negatives are labelled 0)
    candidate_index = torch.cat((labelled_index, negative_index), dim=-1)
    negative_labels = data.edge_label.new_zeros(negative_index.size(1))
    targets = torch.cat((data.edge_label, negative_labels), dim=0)

    # Latent node representations and edge probabilities
    z = model.encode(data.x, data.edge_index, data.edge_weight)
    probabilities = model.decode(z, candidate_index).view(-1).sigmoid()

    y_true = targets.cpu().numpy()
    y_score = probabilities.cpu().numpy()
    return roc_auc_score(y_true, y_score)


def set_seed_pyg(seed):
    """Seed torch, CUDA, NumPy, `random` and PyTorch Geometric with `seed`.

    cuDNN is additionally switched to deterministic mode with benchmarking
    disabled.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seeder in seeders:
        seeder(seed)

    # Deterministic cuDNN kernels, no auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    torch_geometric.seed.seed_everything(seed)

In [None]:
# Run the graph models on the first GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seed every random number generator once, before any graph model is created
set_seed_pyg(0)


def calc_graph_embeddings(
    text, window_size=4, embedding_size=768 // 4, epochs=5, lr=0.001
):
    """Learn graph embeddings for the words of one document.

    A word co-occurrence graph is built from `text`, a small GCN link
    predictor (`Net`) is trained on it, and the latent vector of every word
    occurrence in `text` is returned. The model of the epoch with the highest
    ROC AUC is the one used to produce the embeddings.

    Args:
        text: The document as a sequence of word strings.
        window_size: Sliding-window size of the co-occurrence network.
        embedding_size: Width used for the node embeddings and both GCN layers.
        epochs: Number of training steps (one full-graph step per epoch).
        lr: Learning rate of the Adam optimizer.

    Returns:
        A tuple `(text_graph_embedding, best_epoch, max_auc)` where
        `text_graph_embedding` is a NumPy array with one row per element of
        `text`, `best_epoch` is the 1-based epoch of the selected model
        (0 if no epoch was run) and `max_auc` is its ROC AUC
        (`-inf` if no epoch was run).
    """
    # Build the co-occurrence graph
    G = build_cooccurrence_network(text, window_size=window_size)

    # Map every node (word) to a consecutive integer id
    nodes = list(G.nodes())
    word_2_int = {word: node_id for node_id, word in enumerate(nodes)}

    # Translate the edges to id pairs with a dictionary lookup
    # (avoids a linear `list.index` scan for every edge end point)
    edges = list(G.edges())
    source_node = [word_2_int[source] for source, _ in edges]
    target_node = [word_2_int[target] for _, target in edges]
    edge_weights = list(nx.get_edge_attributes(G, "weight").values())

    # Create tensors; every edge is added in both directions
    nodes_tensor = torch.arange(len(nodes)).long()
    edge_tensor = torch.tensor(
        [source_node + target_node, target_node + source_node]
    ).long()
    edge_weights_tensor = torch.tensor(edge_weights + edge_weights).float()

    # Construct the PyG data object; all existing edges are positive examples
    data = Data(x=nodes_tensor, edge_index=edge_tensor, edge_weight=edge_weights_tensor)
    data.edge_label = torch.zeros(data.edge_index.shape[1]).float() + 1
    data.edge_label_index = data.edge_index
    data = data.to(device)

    # Create the model
    model = Net(data.x.shape[0], embedding_size, embedding_size, embedding_size).to(
        device
    )
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    criterion = torch.nn.BCEWithLogitsLoss()

    # Train the model and keep a copy of the best-scoring epoch.
    # Starting from -inf guarantees that the first epoch is always recorded, so
    # `best_model` / `best_epoch` are defined even when the AUC never exceeds 0.
    max_auc = float("-inf")
    best_model = model
    best_epoch = 0
    for epoch in range(1, epochs + 1):
        train(data, model, optimizer, criterion)
        auc = test(data, model)
        if auc > max_auc:
            max_auc = auc
            best_model = deepcopy(model)
            best_epoch = epoch

    # Compute the node embeddings with the best performing model;
    # no gradients are needed for this final forward pass
    best_model.eval()
    with torch.no_grad():
        z = best_model.encode(data.x, data.edge_index, data.edge_weight)

    # Gather one embedding row per word occurrence, in document order
    token_index = torch.tensor(
        [word_2_int[word] for word in text], dtype=torch.long, device=z.device
    )
    text_graph_embedding = z[token_index].cpu().numpy()

    return text_graph_embedding, best_epoch, max_auc

In [None]:
def map_calc_graph_embeddings(batch):
    """Compute the `graph_embedding` column for `batch["document"]`.

    Only the embeddings returned by `calc_graph_embeddings` are kept; the best
    epoch and its AUC are discarded.
    """
    embeddings, _, _ = calc_graph_embeddings(batch["document"])
    return {"graph_embedding": embeddings}


# Add the "graph_embedding" column by applying map_calc_graph_embeddings to the data
data = data.map(map_calc_graph_embeddings)
# Persist the enriched data so that the following sections can reload it from disk
data.save_to_disk(
    rf".\data\{dataset_name}\{dataset_name}_with_graph_embeddings"
)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/144 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

## Tokenization + Graph Token Embeddings

In [None]:
import torch
from datasets import load_from_disk
from tokenizers import AddedToken
from transformers import AutoModel, AutoTokenizer

# Load Model Tokenizer
# NOTE(review): `add_prefix_space=True` is presumably needed because the documents
# are later passed as pre-split word lists (`is_split_into_words=True`) - confirm
# against the tokenizer documentation when switching to another checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, add_prefix_space=True)

# Load data with graph embeddings (written by the "Graph Embedding" section)
data = load_from_disk(
    rf".\data\{dataset_name}\{dataset_name}_with_graph_embeddings"
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and align word-level data to tokens.

    Every word of `examples["document"]` may be split into several sub-word
    tokens. The first token of a word receives the word's label and graph
    embedding; all other tokens (further sub-word pieces and special tokens)
    receive the label -100 and an all-zero graph embedding.

    Args:
        examples: Batch with the columns "document" (lists of words),
            "doc_bio_tags" (one label id per word) and "graph_embedding"
            (one vector per word).

    Returns:
        The tokenizer output extended with the columns "word_ids", "labels"
        and "token_graph_embeddings", each holding one entry per token.
    """
    # Tokenize the input examples without truncation and without padding
    tokenized_inputs = tokenizer(
        examples["document"], truncation=False, is_split_into_words=True, padding=False
    )

    # Fallback width of the zero vectors, used only for documents without any
    # word embedding; otherwise the width is read from the embeddings themselves
    # so that it always matches the graph model.
    default_graph_size = 768 // 4

    labels = []
    word_ids_list = []
    graph_embeddings = []

    # Iterate over each example of the batch
    for idx, (label, embedding) in enumerate(
        zip(examples["doc_bio_tags"], examples["graph_embedding"])
    ):
        # Word index of every token (None for special tokens)
        word_ids = tokenized_inputs.word_ids(batch_index=idx)

        graph_size = len(embedding[0]) if len(embedding) > 0 else default_graph_size

        previous_word_idx = None
        label_ids = []
        sample_graph_embeddings = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token or continuation of the previous word:
                # ignored label (-100) and zero graph embedding.
                # Float zeros keep the column homogeneous with the real embeddings.
                label_ids.append(-100)
                sample_graph_embeddings.append([0.0] * graph_size)
            else:
                # First token of a word: take label and embedding of that word
                label_ids.append(label[word_idx])
                sample_graph_embeddings.append(embedding[word_idx])
            previous_word_idx = word_idx

        labels.append(label_ids)
        word_ids_list.append(word_ids)
        graph_embeddings.append(sample_graph_embeddings)

    # Add the word IDs, labels, and graph embeddings to the tokenized inputs
    tokenized_inputs["word_ids"] = word_ids_list
    tokenized_inputs["labels"] = labels
    tokenized_inputs["token_graph_embeddings"] = graph_embeddings

    return tokenized_inputs

# Map the tokenize_and_align_labels function to the data in batches of 5 examples
data = data.map(tokenize_and_align_labels, batched=True, batch_size=5)

# Save the preprocessed data to disk; the file name contains the model name
# because the tokenization depends on the selected checkpoint
data.save_to_disk(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_with_token_graph_embeddings"
)

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Saving the dataset (0/7 shards):   0%|          | 0/144 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

## Split and pad Input IDs, Graph Embeddings and Labels to fit PLM

In [None]:
import pickle
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# Load Data
data = load_from_disk(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_with_token_graph_embeddings"
)

# Specify Model Maximum Number of input Token
max_token = 512

# Id used to pad the last chunk of every document. It is taken from the model's
# tokenizer because id 0 is not the padding token for every vocabulary (for
# RoBERTa-style vocabularies 0 is `<s>` and the pad id is 1).
pad_token_id = AutoTokenizer.from_pretrained(model_ckpt).pad_token_id
if pad_token_id is None:
    # Tokenizer without a dedicated padding token: fall back to 0
    pad_token_id = 0


def _split_into_chunks(sequence, chunk_size):
    """Cut `sequence` (list or array) into consecutive slices of at most `chunk_size` items."""
    return [
        sequence[start : start + chunk_size]
        for start in range(0, len(sequence), chunk_size)
    ]


def _save_pickle(obj, path):
    """Write `obj` to `path` with the highest pickle protocol."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Required Dicts (one list per split)
input_ids = {split: [] for split in data.keys()}
input_ids_splitted = {split: [] for split in data.keys()}
sentence_ids = {split: [] for split in data.keys()}
labels = {split: [] for split in data.keys()}
labels_splitted = {split: [] for split in data.keys()}
graph_embeddings = {split: [] for split in data.keys()}
graph_embeddings_splitted = {split: [] for split in data.keys()}

# Iterate over the splits in the data
for split in data.keys():
    input_ids[split] = data[split]["input_ids"]
    labels[split] = data[split]["labels"]

    # Iterate over each sample in the split
    for idx, sample in tqdm(enumerate(data[split]), total=len(data[split])):
        # Input Ids
        ## Split to fit PLM and pad the last chunk with the padding token
        id_chunks = _split_into_chunks(sample["input_ids"], max_token)
        id_chunks[-1] = id_chunks[-1] + [pad_token_id] * (
            max_token - len(id_chunks[-1])
        )
        input_ids_splitted[split] += id_chunks
        ## Remember which document every chunk belongs to
        sentence_ids[split] += [idx] * len(id_chunks)

        # Labels
        ## Split to fit PLM and pad the last chunk with -100 to make sure the
        ## padded tokens are not taken into account in the loss computation
        label_chunks = _split_into_chunks(sample["labels"], max_token)
        label_chunks[-1] = label_chunks[-1] + [-100] * (
            max_token - len(label_chunks[-1])
        )
        labels_splitted[split] += label_chunks

        # Graph Embeddings
        sample_graph_embedding = np.array(sample["token_graph_embeddings"])
        graph_embeddings[split].append(sample_graph_embedding)

        ## Split to fit PLM and pad the last chunk with embeddings of zeros;
        ## the padding width follows the embeddings instead of a fixed number
        embedding_chunks = _split_into_chunks(sample_graph_embedding, max_token)
        padding = np.zeros(
            (
                max_token - embedding_chunks[-1].shape[0],
                sample_graph_embedding.shape[1],
            )
        )
        embedding_chunks[-1] = np.concatenate((embedding_chunks[-1], padding))
        graph_embeddings_splitted[split] += embedding_chunks


# Save the dictionaries to disk using pickle
_save_pickle(
    input_ids,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_input_ids_dictionary.pickle",
)
_save_pickle(
    input_ids_splitted,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_input_ids_splitted_dictionary.pickle",
)
_save_pickle(
    sentence_ids,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_sentence_ids_dictionary.pickle",
)
_save_pickle(
    labels,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_labels_dictionary.pickle",
)
_save_pickle(
    labels_splitted,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_labels_splitted_dictionary.pickle",
)
_save_pickle(
    graph_embeddings,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_token_graph_embeddings_v3_dictionary.pickle",
)
_save_pickle(
    graph_embeddings_splitted,
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_token_graph_embeddings_splitted_v3_dictionary.pickle",
)

100%|██████████| 144/144 [06:28<00:00,  2.70s/it]
100%|██████████| 100/100 [03:38<00:00,  2.19s/it]


## Train Model

In [None]:
import copy
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from seqeval.metrics import accuracy_score, f1_score
from tqdm import tqdm
from transformers import AdamW, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput

### Dataset Class

In [None]:
class LongDataset(Dataset):
    """Dataset of fixed-length chunks cut from long tokenized documents.

    All four arguments are parallel sequences with one entry per chunk:
    token ids, label ids, the id of the document the chunk was cut from, and
    the per-token graph embeddings.
    """

    def __init__(self, input_ids, labels, sentence_ids, graph_embeddings):
        self.input_ids, self.labels = input_ids, labels
        self.sentence_ids = sentence_ids
        self.graph_embeddings = graph_embeddings

    def __len__(self):
        # One item per chunk
        return len(self.sentence_ids)

    def __getitem__(self, idx):
        """Return chunk `idx` as a dict of tensors plus its document id."""
        token_ids = torch.tensor(self.input_ids[idx]).long()
        label_ids = torch.tensor(self.labels[idx]).long()
        embeddings = torch.tensor(self.graph_embeddings[idx]).float()
        return {
            "input_ids": token_ids,
            "labels": label_ids,
            "sentence_ids": self.sentence_ids[idx],
            "graph_embeddings": embeddings,
        }

### Model Class

In [None]:
class SequenceTagger(nn.Module):
    """Token classifier combining a pretrained language model with graph embeddings.

    For every token, the PLM hidden state and the token's graph embedding are
    each passed through a linear re-projection, concatenated, regularized with
    dropout and classified into 3 classes by a final linear layer.
    """

    def __init__(self):
        super().__init__()

        # Initialize the model with a pretrained language model
        self.plm = AutoModel.from_pretrained(model_ckpt)

        # Fine-tune the language model together with the new layers
        for param in self.plm.parameters():
            param.requires_grad = True

        # Layer widths: the PLM width is read from its configuration so that the
        # tagger keeps working when `model_ckpt` points to a differently sized model
        plm_size = self.plm.config.hidden_size
        graph_size = 768 // 4  # width of the precomputed graph embeddings
        num_labels = 3

        # Linear re-projection of the PLM representation
        self.reproject_plm = torch.nn.Linear(plm_size, plm_size)
        self.init_linear_weights(self.reproject_plm)

        # Linear re-projection of the graph embeddings
        self.reproject_graph = torch.nn.Linear(graph_size, graph_size)
        self.init_linear_weights(self.reproject_graph)

        # Dropout layer for regularization
        self.dropout = torch.nn.Dropout(0.05)

        # Linear layer for classification
        self.linear = torch.nn.Linear(plm_size + graph_size, num_labels)
        self.init_linear_weights(self.linear)

    def init_linear_weights(self, m):
        """Initialize linear layer `m`: Xavier-uniform weights, zero bias."""
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0)

    def forward(
        self,
        input_ids=None,
        graph_embeddings=None,
        labels=None,
        attention_mask=None,
        **kwargs,
    ):
        """Compute token logits and, if `labels` is given, the loss.

        Args:
            input_ids: Token ids passed to the PLM.
            graph_embeddings: One graph embedding per token; concatenated with
                the PLM output along the last dimension.
            labels: Optional label ids. `nn.CrossEntropyLoss` is used with its
                default settings, so positions labelled -100 are ignored.
            attention_mask: Optional mask forwarded to the PLM. When omitted
                the PLM is called exactly as before (no mask supplied).
            **kwargs: Ignored; accepted so that extra batch fields do not fail.

        Returns:
            A `TokenClassifierOutput` with `loss` (or None) and `logits`.
        """
        # Pass the input_ids through the pretrained language model;
        # element 0 of its output is the per-token hidden state
        x_plm = self.plm(input_ids, attention_mask=attention_mask)[0]

        # Apply the linear re-projections
        x_plm = self.reproject_plm(x_plm)
        x_graph = self.reproject_graph(graph_embeddings)

        # Concatenate the PLM and graph representations along the feature dimension
        x = torch.cat((x_plm, x_graph), dim=2)

        # Apply dropout regularization
        sequence_output = self.dropout(x)

        # Perform classification using a linear layer
        logits = self.linear(sequence_output)

        loss = None
        if labels is not None:
            # Flatten tokens; the number of classes is read from the logits
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        # Return the model output as a `TokenClassifierOutput` object
        return TokenClassifierOutput(
            loss=loss, logits=logits, hidden_states=None, attentions=None
        )

### Load Preprocessed Data

In [None]:
import pickle

def _read_pickle(path):
    """Return the object stored in the pickle file at `path`.

    Only meant for the files written by the preprocessing section of this
    notebook; unpickling files from untrusted sources is unsafe.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Chunked token ids, document id of every chunk, chunked labels and chunked
# graph embeddings; each is a dictionary with one list per split
input_ids = _read_pickle(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_input_ids_splitted_dictionary.pickle"
)
sentence_ids = _read_pickle(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_sentence_ids_dictionary.pickle"
)
labels = _read_pickle(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_labels_splitted_dictionary.pickle"
)
graph_embeddings = _read_pickle(
    rf".\data\{dataset_name}\{dataset_name}_{model_name}_token_graph_embeddings_splitted_v3_dictionary.pickle"
)

# Wrap the "train" and "test" splits into datasets
training_data = LongDataset(
    input_ids["train"],
    labels["train"],
    sentence_ids["train"],
    graph_embeddings["train"],
)
test_data = LongDataset(
    input_ids["test"],
    labels["test"],
    sentence_ids["test"],
    graph_embeddings["test"],
)

### Test Function

In [None]:
def test_f1(device, model, test_loader):
    """Compute the document-level F1 score of `model` on `test_loader`.

    The loader yields chunks of documents. Predictions and gold labels of all
    chunks sharing a `sentence_ids` value are concatenated into one tag
    sequence per document; positions labelled -100 are skipped.

    Args:
        device: Device the batches are moved to.
        model: Model returning an object with a `logits` attribute.
        test_loader: Loader yielding dicts with "input_ids", "labels",
            "sentence_ids" and optionally "graph_embeddings".

    Returns:
        The F1 score computed from the gold and predicted tag sequences.
    """
    from collections import defaultdict

    # Mapping of tag index to tag label
    index2tag = {0: "B", 1: "I", 2: "O"}

    # Set the model in evaluation mode
    model.eval()

    # Tag sequences keyed by document id. A mapping (instead of a list sized by
    # the number of chunks) works for any set of ids and creates no empty
    # placeholder sequences.
    total_true = defaultdict(list)
    total_pred = defaultdict(list)

    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader)):
            input_ids = batch["input_ids"].to(device)
            true = batch["labels"].tolist()
            sentence_id = batch["sentence_ids"].tolist()

            # Move graph embeddings to the device (if available)
            if "graph_embeddings" in batch:
                graph_embeddings = batch["graph_embeddings"].to(device)
            else:
                graph_embeddings = None

            # Get the model's predicted tag scores
            tag_scores = model(
                input_ids=input_ids, graph_embeddings=graph_embeddings
            ).logits

            # Convert predicted tag scores to label ids
            output = torch.argmax(tag_scores, dim=2).cpu().tolist()

            # Append the tags of every chunk to its document, skipping the
            # positions whose gold label is the ignore index (-100)
            for sample_id, true_row, pred_row in zip(sentence_id, true, output):
                for true_id, pred_id in zip(true_row, pred_row):
                    if true_id != -100:
                        total_true[sample_id].append(index2tag[true_id])
                        total_pred[sample_id].append(index2tag[pred_id])

    # Compute the F1 score using the true and predicted labels
    sample_ids = sorted(total_true)
    return f1_score(
        [total_true[sample_id] for sample_id in sample_ids],
        [total_pred[sample_id] for sample_id in sample_ids],
    )

### Set Seed Function

In [None]:
def set_seed(seed):
    """Seed torch, CUDA, NumPy and `random` with `seed`.

    cuDNN is additionally switched to deterministic mode with benchmarking
    disabled.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seeder in seeders:
        seeder(seed)

    # Deterministic cuDNN kernels, no auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

### Training Loop

In [None]:
# Create the output folders if they do not exist yet.
# `exist_ok=True` replaces the former bare `except: pass`, which also hid real
# failures (e.g. missing permissions) until a later save crashed.
os.makedirs(rf".\results", exist_ok=True)
os.makedirs(rf".\data\{dataset_name}\{model_name}", exist_ok=True)

# Best model saving name
model_save_name = "Semeval-FineKBIR+Graph"

In [None]:
# Check if CUDA is available, otherwise use CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# List of seeds to iterate over
seed_list = [7, 117, 777]

# Initialize a DataFrame to store the results
# NOTE(review): the "val_f1" column is never filled below, and the best epoch is
# selected on the test set - a separate validation split would be needed to
# report an unbiased test score.
results_df = pd.DataFrame(columns=["Seed", "train_f1", "val_f1", "test_f1"])

# Iterate over the seeds
for i, seed in enumerate(seed_list):

    # Drop the models of the previous seed first, so that clearing the CUDA
    # cache can actually return their memory. Resetting `best_model` also
    # guarantees that a model of a previous seed is never saved for this seed.
    model = None
    best_model = None
    torch.cuda.empty_cache()

    # Set Seed
    set_seed(seed)

    # Create a new instance of the SequenceTagger model and move it to the device
    model = SequenceTagger()
    model.to(device)

    # Create data loaders for training and testing
    train_loader = DataLoader(training_data, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

    # Initialize an AdamW optimizer for training the model
    # NOTE(review): `transformers.AdamW` is reported as deprecated in favour of
    # `torch.optim.AdamW`, whose default hyper-parameters may differ - verify
    # before switching.
    optimizer = AdamW(model.parameters(), lr=5e-5)
    best_f1 = float("-inf")

    # Train the model for 10 epochs
    for epoch in range(10):
        model.train()
        for batch in tqdm(train_loader, total=len(train_loader)):
            optimizer.zero_grad()

            # Move input tensors and labels to the device. The `batch_` prefix
            # keeps the notebook-level dictionaries `input_ids`, `labels` and
            # `graph_embeddings` from being overwritten.
            batch_input_ids = batch["input_ids"].to(device)
            batch_labels = batch["labels"].to(device)

            # Move graph embeddings to the device (if available)
            if "graph_embeddings" in batch:
                batch_graph_embeddings = batch["graph_embeddings"].to(device)
            else:
                batch_graph_embeddings = None

            # Perform a forward pass through the model and compute the loss
            outputs = model(
                batch_input_ids,
                labels=batch_labels,
                graph_embeddings=batch_graph_embeddings,
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        # Evaluate the model's F1 score on the test set
        model.eval()
        current_f1 = test_f1(device, model, test_loader)

        # Update the best F1 score and save a copy of the best model.
        # The first epoch is always recorded, so `best_model` is defined even
        # when the F1 score stays at 0.
        print(f"Epoch: {epoch+1} - F1: {current_f1}")
        if best_model is None or current_f1 > best_f1:
            best_f1 = current_f1
            best_model = copy.deepcopy(model)

    # Save the best model with the current seed
    torch.save(
        best_model,
        rf".\data\{dataset_name}\{model_name}\{model_save_name}_seed_{seed}",
    )

    # Calculate the F1 score on the training and test sets with the best model
    results_df.loc[i, "Seed"] = seed
    results_df.loc[i, "train_f1"] = test_f1(device, best_model, train_loader)
    results_df.loc[i, "test_f1"] = test_f1(device, best_model, test_loader)

    # Save the results to a CSV file (rewritten after every seed)
    results_df.to_csv(
        rf".\results\{model_save_name}.csv",
        index=False,
    )