In [None]:
pip install -U transformers datasets seqeval scikit-learn

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.3 MB/s[0m eta [36

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive
/content/gdrive/MyDrive


# Training Fresh on Synthetic Data

In [None]:
import json
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from collections import defaultdict

In [None]:
# Load data
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}

with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],  # correcting stop -> end
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")  # Normalize label
    })

In [None]:
# Step 4: Prepare examples (text + labels)
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Train/test split
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)


In [None]:
# Step 6: Load tokenizer
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 7: Create label list
label_list = list({label["label"].replace(" ", "_") for _, anns in examples for label in anns})
label_list = ["O"] + ["B-" + label for label in label_list] + ["I-" + label for label in label_list]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [None]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                if labels_aligned[i] == "O":
                    labels_aligned[i] = "B-" + label.replace(" ", "_")
                else:
                    labels_aligned[i] = "I-" + label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [None]:
# Step 9: Tokenize and align the full dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

#Batch size= 8, num_train_epochs=5, learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,0.6277
40,0.0374
60,0.0087
80,0.0059
100,0.0046
120,0.0037
140,0.0044
160,0.0042
180,0.0033
200,0.004


TrainOutput(global_step=500, training_loss=0.030130796872079372, metrics={'train_runtime': 404.4121, 'train_samples_per_second': 9.891, 'train_steps_per_second': 1.236, 'total_flos': 1045290971136000.0, 'train_loss': 0.030130796872079372, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.30it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9897483690587139
Recall: 0.9960140679953107
F1: 0.9928713334112422

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.84      0.96      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      0.99      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 409.43 seconds


#Batch size= 16, num_train_epochs=5, learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,0.6245
40,0.0375
60,0.0077
80,0.0057
100,0.0046
120,0.0041
140,0.0044
160,0.0046
180,0.0038
200,0.0034


TrainOutput(global_step=250, training_loss=0.05676954618096352, metrics={'train_runtime': 404.4293, 'train_samples_per_second': 9.89, 'train_steps_per_second': 0.618, 'total_flos': 1045290971136000.0, 'train_loss': 0.05676954618096352, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=16, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.18it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9892923649906891
Recall: 0.9964830011723329
F1: 0.9928746641747459

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.83      0.99      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 409.42 seconds


#Batch size= 32, num_train_epochs=5, learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
20,0.7469
40,0.0415
60,0.0095
80,0.0067
100,0.0054
120,0.0052


TrainOutput(global_step=125, training_loss=0.13065847209095954, metrics={'train_runtime': 410.2876, 'train_samples_per_second': 9.749, 'train_steps_per_second': 0.305, 'total_flos': 1045290971136000.0, 'train_loss': 0.13065847209095954, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99921875
Precision: 0.9892948568768909
Recall: 0.9967174677608441
F1: 0.9929922915206728

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.83      1.00      0.91        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 417.86 seconds


#Batch size = 32, num_train_epochs=7, learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=7,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,0.626
40,0.0378
60,0.0083
80,0.0059
100,0.0047
120,0.0044
140,0.0043
160,0.0042


TrainOutput(global_step=175, training_loss=0.07986676424741745, metrics={'train_runtime': 554.0296, 'train_samples_per_second': 10.108, 'train_steps_per_second': 0.316, 'total_flos': 1463407359590400.0, 'train_loss': 0.07986676424741745, 'epoch': 7.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.27it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99921875
Precision: 0.9892948568768909
Recall: 0.9967174677608441
F1: 0.9929922915206728

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.83      1.00      0.91        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 558.86 seconds


#Batch size = 32, num_train_epochs=10, learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,0.6225
40,0.0313
60,0.0078
80,0.0056
100,0.0044
120,0.0042
140,0.004
160,0.0039
180,0.0038
200,0.0039


TrainOutput(global_step=250, training_loss=0.05602096834778786, metrics={'train_runtime': 788.3096, 'train_samples_per_second': 10.148, 'train_steps_per_second': 0.317, 'total_flos': 2090581942272000.0, 'train_loss': 0.05602096834778786, 'epoch': 10.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99921875
Precision: 0.989979025868096
Recall: 0.9960140679953107
F1: 0.9929873772791025

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.85      0.96      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      0.99      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 793.80 seconds


#Batch size = 32, num_train_epochs=5, learning rate = 1e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,1.2402
40,0.1513
60,0.0497
80,0.0219
100,0.0144
120,0.0123


TrainOutput(global_step=125, training_loss=0.23884755876660346, metrics={'train_runtime': 397.4043, 'train_samples_per_second': 10.065, 'train_steps_per_second': 0.315, 'total_flos': 1045290971136000.0, 'train_loss': 0.23884755876660346, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9895202608290639
Recall: 0.9962485345838218
F1: 0.9928729991821476

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.84      0.97      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      0.99      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 404.18 seconds


#Batch size = 32, num_train_epochs=5, learning rate = 4e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=4e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
20,0.397
40,0.0084
60,0.0043
80,0.0042
100,0.0035
120,0.0036


TrainOutput(global_step=125, training_loss=0.06752571719884873, metrics={'train_runtime': 446.4969, 'train_samples_per_second': 8.959, 'train_steps_per_second': 0.28, 'total_flos': 1045290971136000.0, 'train_loss': 0.06752571719884873, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.26it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9892923649906891
Recall: 0.9964830011723329
F1: 0.9928746641747459

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.83      0.99      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.97      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 453.76 seconds


#Batch size = 32, num_train_epochs=5, learning rate = 2e-5, linear learning

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,1.9359
40,0.2287
60,0.024
80,0.0072
100,0.0052
120,0.0049


TrainOutput(global_step=125, training_loss=0.3531482963860035, metrics={'train_runtime': 414.2937, 'train_samples_per_second': 9.655, 'train_steps_per_second': 0.302, 'total_flos': 1045290971136000.0, 'train_loss': 0.3531482963860035, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.23it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9890646812470917
Recall: 0.9967174677608441
F1: 0.9928763283895832

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.82      1.00      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.96      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 419.45 seconds


#Batch size = 32, num_train_epochs=5, learning rate = 2e-5, cosine

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

import time

# Start timer
start_time = time.time()

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./bert_base_cased_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()



Step,Training Loss
20,1.9359
40,0.2287
60,0.0233
80,0.0067
100,0.005
120,0.0048


TrainOutput(global_step=125, training_loss=0.35291144736111163, metrics={'train_runtime': 422.9199, 'train_samples_per_second': 9.458, 'train_steps_per_second': 0.296, 'total_flos': 1045290971136000.0, 'train_loss': 0.35291144736111163, 'epoch': 5.0})

In [None]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.21it/s]


In [None]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.99919921875
Precision: 0.9890646812470917
Recall: 0.9967174677608441
F1: 0.9928763283895832

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.82      1.00      0.90        78
     CONTACT       1.00      1.00      1.00      1128
        DATE       0.99      0.99      0.99      1696
          ID       0.98      0.99      0.98       419
    LOCATION       1.00      1.00      1.00       476
        NAME       1.00      1.00      1.00       468

   micro avg       0.99      1.00      0.99      4265
   macro avg       0.96      1.00      0.98      4265
weighted avg       0.99      1.00      0.99      4265

Time taken: 428.43 seconds
