In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from seqeval.metrics import f1_score, accuracy_score
from seqeval.metrics import classification_report as seqeval_classification_report    
from collections import defaultdict
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
MAX_LEN = 174


In [4]:
# #Labels for EWT-LOTR
# labels_to_ids = {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
# ids_to_labels = {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

In [5]:
# Load the config.json file
config_path = 'curve_train/config.json'
with open(config_path, 'r') as f:
    config = json.load(f)

# Extract label2id and id2label and convert keys to integers
labels_to_ids = config['label2id']
ids_to_labels = {int(k): v for k, v in config['id2label'].items()}

# Print the extracted mappings (optional)
print("labels_to_ids:", labels_to_ids)
print("ids_to_labels:", ids_to_labels)


labels_to_ids: {'B-CHAR': 0, 'I-LOC': 1, 'I-CHAR': 2, 'O': 3, 'B-ORG': 4, 'B-LOC': 5}
ids_to_labels: {0: 'B-CHAR', 1: 'I-LOC', 2: 'I-CHAR', 3: 'O', 4: 'B-ORG', 5: 'B-LOC'}


In [6]:
len(ids_to_labels)

6

In [None]:
# Load the model with the extracted labels
model = BertForTokenClassification.from_pretrained('curve_train', num_labels=len(ids_to_labels))
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:
tokenizer = BertTokenizerFast.from_pretrained("final_tokenizer")

In [12]:
sentence = "Oooohhh Frodo Aaaahh Gimli My precious Wake up Wake up Wake up sleepies We must go yes we must go at once"
words = sentence.split()
inputs = tokenizer(words,
             is_split_into_words=True,
             return_offsets_mapping=True,
             padding='max_length',
             truncation=True,
             max_length=MAX_LEN,
             return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
print('ids', ids)
mask = inputs["attention_mask"].to(device)
print('mask', mask)
# forward pass
outputs = model(ids, attention_mask=mask)
print('outputs:', outputs)

logits = outputs[0]
print("logits", logits)
active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
print('active_logits', active_logits)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
print('flattened_predictions', flattened_predictions)
tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
print('tokens', tokens)

token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
print('token_predictions', token_predictions)

wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
print('wp_preds', wp_preds)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  #only predictions on first word pieces are important
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])
  else:
    continue

print(sentence.split())
print(prediction)

ids tensor([[  101,  1051,  9541, 11631, 23644, 10424,  7716,  2080, 13360,  4430,
          2232, 21025, 19968,  2072,  2026,  9062,  5256,  2039,  5256,  2039,
          5256,  2039,  3637,  3111,  2057,  2442,  2175,  2748,  2057,  2442,
          2175,  2012,  2320,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     

In [65]:
from collections import defaultdict

MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 7
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda



# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get the name and other details of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids[label] for label in word_labels]

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

def train_model(training_set, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss


test_tokens, test_tags = read_data("./all_test.iob2")

def replace_per_tags(tags_list):
    updated_tags_list = []
    for tags in tags_list:
        updated_tags = []
        for tag in tags:
            if tag == "B-PER":
                updated_tags.append("B-CHAR")
            elif tag == "I-PER":
                updated_tags.append("I-CHAR")
            else:
                updated_tags.append(tag)
        updated_tags_list.append(updated_tags)
    return updated_tags_list

test_tags = replace_per_tags(test_tags)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)

# Initialize a dictionary to hold the counts
tag_counts = defaultdict(int)

# Iterate through each list in test_tags and count the occurrences of each tag
for sentence in test_tags:
    for tag in sentence:
        tag_counts[tag] += 1

# Convert the defaultdict to a regular dictionary for easier printing
tag_counts = dict(tag_counts)

# Print the counts for each tag
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")
#ewt+lotr
# labels_to_ids = {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
# ids_to_labels = {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

# EWT
# labels_to_ids = {'I-LOC': 0, 'B-CHAR': 1, 'O': 2, 'B-LOC': 3, 'I-CHAR': 4, 'I-ORG': 5, 'B-ORG': 6}
# ids_to_labels = {0: 'I-LOC', 1: 'B-CHAR', 2: 'O', 3: 'B-LOC', 4: 'I-CHAR', 5: 'I-ORG', 6: 'B-ORG'}

# Create training and testing datasets
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

# Function to count tag occurrences
def count_tags(tags_list):
    tag_counts = defaultdict(int)
    for sentence in tags_list:
        for tag in sentence:
            tag_counts[tag] += 1
    return tag_counts

# Count initial tag occurrences in test_tags
initial_tag_counts = count_tags(test_tags)
print("Initial tag counts in test_tags:", dict(initial_tag_counts))

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1_score, report


# Train and evaluate the model on the entire dataset

model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)


# Evaluating the model
eval_loss, eval_accuracy, f1_score, eval_report = valid(model, testing_loader)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
print(eval_report)


# Display the evaluation metrics in a DataFrame
metrics = {
    "eval_loss": eval_loss,
    "accuracy": eval_accuracy,
    "f1_score": f1_score,
    "report": eval_report
}
metrics_df = pd.DataFrame([metrics])
print(metrics_df)

# Flatten the classification report for easier viewing
flat_reports = []
for label, scores in eval_report.items():
    flat_reports.append({
        "label": label,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1-score"],
        "support": scores["support"]
    })

reports_df = pd.DataFrame(flat_reports)
print(reports_df)


Number of GPUs available: 1
GPU 0: NVIDIA A100 80GB PCIe
  Memory Allocated: 2.55 GB
  Memory Cached: 2.91 GB
O: 48418
B-CHAR: 1269
I-CHAR: 328
B-LOC: 533
B-ORG: 324
I-LOC: 74
I-ORG: 276
Initial tag counts in test_tags: {'O': 48418, 'B-CHAR': 1269, 'I-CHAR': 328, 'B-LOC': 533, 'B-ORG': 324, 'I-LOC': 74, 'I-ORG': 276}
Validation loss per 100 evaluation steps: 0.18824134767055511
Validation Loss: 0.1471862779178012
Validation Accuracy: 0.976338292140096
F1 Score: 0.7369226843582548
{'CHAR': {'precision': 0.7928007023705005, 'recall': 0.7115839243498818, 'f1-score': 0.7500000000000001, 'support': 1269}, 'LOC': {'precision': 0.8571428571428571, 'recall': 0.8555347091932458, 'f1-score': 0.856338028169014, 'support': 533}, 'ORG': {'precision': 0.6967213114754098, 'recall': 0.2623456790123457, 'f1-score': 0.3811659192825112, 'support': 324}, 'micro avg': {'precision': 0.8053541550474066, 'recall': 0.6792097836312324, 'f1-score': 0.7369226843582548, 'support': 2126}, 'macro avg': {'precision':

In [52]:
# metrics_df.to_csv('data/LOTR_on_all_tags_metric.csv', index=False)

In [53]:
# reports_df.to_csv('data/LOTR_on_all_tags_report.csv', index=False)

In [47]:
from collections import defaultdict

MAX_LEN = 174
BATCH_SIZE = 64
EPOCHS = 7
MAX_GRAD_NORM = 5
MODEL_NAME = 'bert-base-uncased'
from torch import cuda



# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Get the name and other details of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
        print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
else:
    print("CUDA is not available. Running on CPU.")

class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        labels = [labels_to_ids.get(label, 0) for label in word_labels]  # Assign default value 0 for missing labels

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

def read_data(file_path):
    sentences, labels = [], []
    sentence, label = [], []
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            if line.startswith("#"):
                continue
            elif line == "\n":
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
            else:
                parts = line.strip().split("\t")
                sentence.append(parts[1].lower())  # Convert the token to lowercase before appending
                label.append(clean_tag(parts[2]))
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

def clean_tag(tag):
    if tag.count('-') > 1:
        prefix, entity = tag.split('-', 1)
        tag = f"{prefix}-{entity.replace('-', '')}"
    return tag

def train_model(training_set, model, optimizer):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    training_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs.loss
        tr_logits = outputs.logits

        tr_loss += loss.item()
        nb_tr_steps += 1

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    return epoch_loss


test_tokens, test_tags = read_data("./all_test.iob2")

def replace_per_tags(tags_list):
    updated_tags_list = []
    for tags in tags_list:
        updated_tags = []
        for tag in tags:
            if tag == "B-PER":
                updated_tags.append("B-CHAR")
            elif tag == "I-PER":
                updated_tags.append("I-CHAR")
            else:
                updated_tags.append(tag)
        updated_tags_list.append(updated_tags)
    return updated_tags_list

test_tags = replace_per_tags(test_tags)

data_test = {'sentence': [" ".join(sentence) for sentence in test_tokens],
             'word_labels': [",".join(tags) for tags in test_tags]}

df_test = pd.DataFrame(data_test)

# Initialize a dictionary to hold the counts
tag_counts = defaultdict(int)

# Iterate through each list in test_tags and count the occurrences of each tag
for sentence in test_tags:
    for tag in sentence:
        tag_counts[tag] += 1

# Convert the defaultdict to a regular dictionary for easier printing
tag_counts = dict(tag_counts)

# Print the counts for each tag
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")
#ewt+lotr
# labels_to_ids = {'B-CHAR': 0, 'O': 1, 'I-CHAR': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
# ids_to_labels = {0: 'B-CHAR', 1: 'O', 2: 'I-CHAR', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

# EWT
# labels_to_ids = {'I-LOC': 0, 'B-CHAR': 1, 'O': 2, 'B-LOC': 3, 'I-CHAR': 4, 'I-ORG': 5, 'B-ORG': 6}
# ids_to_labels = {0: 'I-LOC', 1: 'B-CHAR', 2: 'O', 3: 'B-LOC', 4: 'I-CHAR', 5: 'I-ORG', 6: 'B-ORG'}

# Create training and testing datasets
testing_set = dataset(df_test, tokenizer, MAX_LEN)

test_params = {'batch_size': BATCH_SIZE, 'shuffle': False, 'num_workers': 0}
testing_loader = DataLoader(testing_set, **test_params)

# Function to count tag occurrences
def count_tags(tags_list):
    tag_counts = defaultdict(int)
    for sentence in tags_list:
        for tag in sentence:
            tag_counts[tag] += 1
    return tag_counts

# Count initial tag occurrences in test_tags
initial_tag_counts = count_tags(test_tags)
print("Initial tag counts in test_tags:", dict(initial_tag_counts))

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            # Forward pass
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
            eval_logits = outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Compute evaluation accuracy
            active_logits = eval_logits.view(-1, model.config.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,)

            for i in range(labels.size(0)):
                label = labels[i]
                pred = flattened_predictions.view(labels.size(0), labels.size(1))[i]

                active_accuracy = label != -100  # shape (seq_len,)
                label = torch.masked_select(label, active_accuracy)
                pred = torch.masked_select(pred, active_accuracy)

                eval_labels.append([ids_to_labels[id.item()] for id in label])
                eval_preds.append([ids_to_labels[id.item()] for id in pred])

                tmp_eval_accuracy = accuracy_score(label.cpu().numpy(), pred.cpu().numpy())
                eval_accuracy += tmp_eval_accuracy

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = accuracy_score(eval_labels, eval_preds)
    F1_score = f1_score(eval_labels, eval_preds)
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"F1 Score: {F1_score}")
    report = seqeval_classification_report(eval_labels, eval_preds, output_dict=True)
    print(report)
    
    return eval_loss, eval_accuracy, F1_score, report


# Train and evaluate the model on the entire dataset

model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)


# Evaluating the model
eval_loss, eval_accuracy, f1_score, eval_report = valid(model, testing_loader)
print(f"Eval Loss: {eval_loss}, Eval Accuracy: {eval_accuracy}")
print(eval_report)


# Display the evaluation metrics in a DataFrame
metrics = {
    "eval_loss": eval_loss,
    "accuracy": eval_accuracy,
    "f1_score": f1_score,
    "report": eval_report
}
metrics_df = pd.DataFrame([metrics])
print(metrics_df)

# Flatten the classification report for easier viewing
flat_reports = []
for label, scores in eval_report.items():
    flat_reports.append({
        "label": label,
        "precision": scores["precision"],
        "recall": scores["recall"],
        "f1-score": scores["f1-score"],
        "support": scores["support"]
    })

reports_df = pd.DataFrame(flat_reports)
print(reports_df)


Number of GPUs available: 1
GPU 0: NVIDIA A100 80GB PCIe
  Memory Allocated: 1.74 GB
  Memory Cached: 2.53 GB
O: 48418
B-CHAR: 1269
I-CHAR: 328
B-LOC: 533
B-ORG: 324
I-LOC: 74
I-ORG: 276
Initial tag counts in test_tags: {'O': 48418, 'B-CHAR': 1269, 'I-CHAR': 328, 'B-LOC': 533, 'B-ORG': 324, 'I-LOC': 74, 'I-ORG': 276}
Validation loss per 100 evaluation steps: 0.19724011421203613
Validation Loss: 0.2985211599331636
Validation Accuracy: 0.9614618718519387
F1 Score: 0.5350760202720726
{'CHAR': {'precision': 0.7712820512820513, 'recall': 0.4867313915857605, 'f1-score': 0.5968253968253968, 'support': 1545}, 'LOC': {'precision': 0.6738544474393531, 'recall': 0.46904315196998125, 'f1-score': 0.5530973451327434, 'support': 533}, 'ORG': {'precision': 1.0, 'recall': 0.0030864197530864196, 'f1-score': 0.006153846153846154, 'support': 324}, 'micro avg': {'precision': 0.7446176688938382, 'recall': 0.41756869275603664, 'f1-score': 0.5350760202720726, 'support': 2402}, 'macro avg': {'precision': 0.815