In [1]:
pip install -U transformers datasets seqeval scikit-learn

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.8 MB/s[0m eta [36

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive
/content/gdrive/MyDrive


In [3]:
import json
import random
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [4]:
# Step 1: Load discharge summaries
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}

# Step 2: Load annotations
with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

In [5]:
# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")
    })

# Step 4: Prepare examples
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Split train/test
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)

In [6]:
# Step 6: Load tokenizer
model_checkpoint = "KindLab/bert-deid"
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Step 7: Create flat label list
label_list = ["O", "AGE", "CONTACT", "DATE", "ID", "LOCATION", "NAME", "PROFESSION"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                labels_aligned[i] = label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [8]:
# Step 9: Tokenize dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


config.json:   0%|          | 0.00/983 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [9]:
!pip install evaluate
from transformers import TrainerCallback
from evaluate import load

# Load metrics
seqeval = load("seqeval")

# Custom compute_metrics function
def compute_metrics(predictions):
    preds, labels = predictions
    preds = preds.argmax(-1)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m103.7 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Batch size 8, Epochs 3, Learning rate = 2e-5

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.1368
20,0.0195
30,0.0074
40,0.0046
50,0.0023
60,0.0019
70,0.0013
80,0.0014
90,0.0013
100,0.0012




TrainOutput(global_step=300, training_loss=0.006454857971984893, metrics={'train_runtime': 273.6406, 'train_samples_per_second': 8.771, 'train_steps_per_second': 1.096, 'total_flos': 627146234265600.0, 'train_loss': 0.006454857971984893, 'epoch': 3.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.08it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99970703125
Precision: 0.9853768278965129
Recall: 0.9977220956719818
F1: 0.9915110356536503

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.97      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      1.00      0.99       878
weighted avg       0.99      1.00      0.99       878

Time taken: 288.30 seconds


# Batch size 16, Epochs 3, Learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.159
20,0.0243
30,0.0068
40,0.0028
50,0.0031
60,0.0017
70,0.0015
80,0.0017
90,0.0015
100,0.0011


TrainOutput(global_step=150, training_loss=0.013908469875653584, metrics={'train_runtime': 244.9448, 'train_samples_per_second': 9.798, 'train_steps_per_second': 0.612, 'total_flos': 627146234265600.0, 'train_loss': 0.013908469875653584, 'epoch': 3.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=16, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 7/7 [00:02<00:00,  2.35it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9996484375
Precision: 0.9809630459126539
Recall: 0.9977220956719818
F1: 0.9892715979672502

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       0.99      0.99      0.99       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.83      0.99      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.98      1.00      0.99       878

Time taken: 249.68 seconds


# Batch size 32, Epochs 3, Learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.147
20,0.0184
30,0.0072
40,0.0038
50,0.0027
60,0.002
70,0.002


TrainOutput(global_step=75, training_loss=0.02460856596628825, metrics={'train_runtime': 240.955, 'train_samples_per_second': 9.96, 'train_steps_per_second': 0.311, 'total_flos': 627146234265600.0, 'train_loss': 0.02460856596628825, 'epoch': 3.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=32, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 4/4 [00:02<00:00,  1.35it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9994921875
Precision: 0.9721603563474388
Recall: 0.9943052391799544
F1: 0.9831081081081081

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       0.97      0.97      0.97       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.80      1.00      0.89        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.97      0.99      0.98       878
   macro avg       0.96      1.00      0.98       878
weighted avg       0.98      0.99      0.98       878

Time taken: 245.63 seconds


# Batch size 16, Epochs 5, Learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.1586
20,0.0238
30,0.0065
40,0.0027
50,0.0029
60,0.0016
70,0.0014
80,0.0015
90,0.0012
100,0.0009




TrainOutput(global_step=250, training_loss=0.008512625098228454, metrics={'train_runtime': 395.5678, 'train_samples_per_second': 10.112, 'train_steps_per_second': 0.632, 'total_flos': 1045243723776000.0, 'train_loss': 0.008512625098228454, 'epoch': 5.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=16, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 7/7 [00:02<00:00,  2.34it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9996875
Precision: 0.9853603603603603
Recall: 0.9965831435079726
F1: 0.9909399773499433

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.96      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      0.99      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 400.13 seconds


# Batch size 16, Epochs 4, Learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.1588
20,0.024
30,0.0066
40,0.0027
50,0.003
60,0.0017
70,0.0014
80,0.0016
90,0.0013
100,0.001


TrainOutput(global_step=200, training_loss=0.010516795320436359, metrics={'train_runtime': 314.2294, 'train_samples_per_second': 10.184, 'train_steps_per_second': 0.636, 'total_flos': 836194979020800.0, 'train_loss': 0.010516795320436359, 'epoch': 4.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=16, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 7/7 [00:03<00:00,  2.31it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99966796875
Precision: 0.9820828667413214
Recall: 0.9988610478359908
F1: 0.9904009034443816

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.83      0.99      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.98      1.00      0.99       878

Time taken: 319.00 seconds


# Batch size 8, Epochs 5, Learning rate = 2e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.1366
20,0.0193
30,0.0073
40,0.0045
50,0.0022
60,0.0018
70,0.0012
80,0.0013
90,0.0012
100,0.0011




TrainOutput(global_step=500, training_loss=0.004088789828121662, metrics={'train_runtime': 399.4973, 'train_samples_per_second': 10.013, 'train_steps_per_second': 1.252, 'total_flos': 1045243723776000.0, 'train_loss': 0.004088789828121662, 'epoch': 5.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.44it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99970703125
Precision: 0.9853768278965129
Recall: 0.9977220956719818
F1: 0.9915110356536503

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.97      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      1.00      0.99       878
weighted avg       0.99      1.00      0.99       878

Time taken: 404.45 seconds


# Batch size 8, Epochs 5, Learning rate = 1e-5

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.1937
20,0.0644
30,0.0274
40,0.0132
50,0.0068
60,0.0052
70,0.0029
80,0.0025
90,0.0027
100,0.003




TrainOutput(global_step=500, training_loss=0.007224588507786393, metrics={'train_runtime': 405.0872, 'train_samples_per_second': 9.874, 'train_steps_per_second': 1.234, 'total_flos': 1045243723776000.0, 'train_loss': 0.007224588507786393, 'epoch': 5.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.39it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99966796875
Precision: 0.9831649831649831
Recall: 0.9977220956719818
F1: 0.9903900508762012

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.84      0.97      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 410.04 seconds


# Batch size 8, Epochs 5, Learning rate = 1e-6

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-6,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.3928
20,0.2836
30,0.2185
40,0.1749
50,0.1548
60,0.1351
70,0.1204
80,0.1151
90,0.1036
100,0.0965




TrainOutput(global_step=500, training_loss=0.05581866376101971, metrics={'train_runtime': 404.2408, 'train_samples_per_second': 9.895, 'train_steps_per_second': 1.237, 'total_flos': 1045243723776000.0, 'train_loss': 0.05581866376101971, 'epoch': 5.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.51it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99830078125
Precision: 0.9298813376483279
Recall: 0.9817767653758542
F1: 0.9551246537396121

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       0.87      0.93      0.90       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.71      1.00      0.83        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       0.95      0.98      0.97       100

   micro avg       0.93      0.98      0.96       878
   macro avg       0.92      0.98      0.95       878
weighted avg       0.94      0.98      0.96       878

Time taken: 408.77 seconds


# Phase 2 Experiments, 1e-5, 5 Epochs with dynamic learning rate

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.4272
20,0.3648
30,0.2522
40,0.1706
50,0.1262
60,0.0859
70,0.0411
80,0.0273
90,0.0171
100,0.0141




TrainOutput(global_step=500, training_loss=0.03182210679817945, metrics={'train_runtime': 488.5453, 'train_samples_per_second': 8.188, 'train_steps_per_second': 1.023, 'total_flos': 1045243723776000.0, 'train_loss': 0.03182210679817945, 'epoch': 5.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.21it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")

Metrics computed using seqeval:
Accuracy: 0.9996484375




Precision: 0.9820627802690582
Recall: 0.9977220956719818
F1: 0.9898305084745762

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.83      0.97      0.89        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.98      1.00      0.99       878

Time taken: 498.10 seconds


# Phase 2 Experiments, 1e-5, 7 Epochs with dynamic learning rate

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.429
20,0.3833
30,0.2971
40,0.1974
50,0.1533
60,0.1142
70,0.0769
80,0.0458
90,0.0297
100,0.0244




TrainOutput(global_step=700, training_loss=0.026434664853316334, metrics={'train_runtime': 573.3405, 'train_samples_per_second': 9.767, 'train_steps_per_second': 1.221, 'total_flos': 1463341213286400.0, 'train_loss': 0.026434664853316334, 'epoch': 7.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.43it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99966796875
Precision: 0.984251968503937
Recall: 0.9965831435079726
F1: 0.9903791737408036

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.84      0.96      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      0.99      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 577.81 seconds


# Phase 2 Experiments, 1e-5, 10 Epochs with dynamic learning rate

In [None]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [None]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    learning_rate=1e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.4303
20,0.3966
30,0.3441
40,0.2352
50,0.1807
60,0.142
70,0.112
80,0.0876
90,0.0564
100,0.0387




TrainOutput(global_step=1000, training_loss=0.02191794638428837, metrics={'train_runtime': 813.8262, 'train_samples_per_second': 9.83, 'train_steps_per_second': 1.229, 'total_flos': 2090487447552000.0, 'train_loss': 0.02191794638428837, 'epoch': 10.0})

In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.50it/s]


In [None]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99966796875
Precision: 0.9820828667413214
Recall: 0.9988610478359908
F1: 0.9904009034443816

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.83      0.99      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.98      1.00      0.99       878

Time taken: 818.18 seconds


# Phase 2 Experiments, 2e-5, 5 Epochs with dynamic learning rate

In [10]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [11]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.4205
20,0.2959
30,0.1873
40,0.125
50,0.0681
60,0.0288
70,0.0117
80,0.0072
90,0.0058
100,0.0048




TrainOutput(global_step=500, training_loss=0.023866761840879918, metrics={'train_runtime': 442.3822, 'train_samples_per_second': 9.042, 'train_steps_per_second': 1.13, 'total_flos': 1045243723776000.0, 'train_loss': 0.023866761840879918, 'epoch': 5.0})

In [12]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.26it/s]


In [13]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99970703125
Precision: 0.9853768278965129
Recall: 0.9977220956719818
F1: 0.9915110356536503

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.97      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      1.00      0.99       878
weighted avg       0.99      1.00      0.99       878

Time taken: 450.93 seconds


# Phase 2 Experiments, 2e-5, 7 Epochs with dynamic learning rate

In [14]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [15]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.4244
20,0.3359
30,0.2143
40,0.1475
50,0.0982
60,0.0505
70,0.0216
80,0.0132
90,0.0098
100,0.0079




TrainOutput(global_step=700, training_loss=0.01973340507463685, metrics={'train_runtime': 596.6076, 'train_samples_per_second': 9.386, 'train_steps_per_second': 1.173, 'total_flos': 1463341213286400.0, 'train_loss': 0.01973340507463685, 'epoch': 7.0})

In [16]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.28it/s]


In [17]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9996875
Precision: 0.9853603603603603
Recall: 0.9965831435079726
F1: 0.9909399773499433

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.96      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      0.99      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 601.73 seconds


# Phase 2 Experiments, 2e-5, 10 Epochs with dynamic learning rate

In [18]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [19]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.4272
20,0.3648
30,0.2522
40,0.1706
50,0.1262
60,0.0859
70,0.0411
80,0.0273
90,0.0171
100,0.0141




TrainOutput(global_step=1000, training_loss=0.016177289169514553, metrics={'train_runtime': 837.372, 'train_samples_per_second': 9.554, 'train_steps_per_second': 1.194, 'total_flos': 2090487447552000.0, 'train_loss': 0.016177289169514553, 'epoch': 10.0})

In [20]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.38it/s]


In [21]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9996875
Precision: 0.9831838565022422
Recall: 0.9988610478359908
F1: 0.9909604519774012

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.84      0.99      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 842.43 seconds


# Phase 2 Experiments, 2e-5, 5 Epochs with COSINE learning rate

In [10]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [11]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.4205
20,0.2959
30,0.1873
40,0.125
50,0.0681
60,0.0288
70,0.0117
80,0.0072
90,0.0058
100,0.0048




TrainOutput(global_step=500, training_loss=0.02386358405277133, metrics={'train_runtime': 431.6318, 'train_samples_per_second': 9.267, 'train_steps_per_second': 1.158, 'total_flos': 1045243723776000.0, 'train_loss': 0.02386358405277133, 'epoch': 5.0})

In [12]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.55it/s]


In [13]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.9996875
Precision: 0.9853603603603603
Recall: 0.9965831435079726
F1: 0.9909399773499433

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.96      0.90        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      0.99      0.98       878
weighted avg       0.99      1.00      0.99       878

Time taken: 440.89 seconds


# Phase 2 Experiments, 2e-5, 5 Epochs with fixed learning rate and weight decay

In [14]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

In [15]:
import time

# Start timer
start_time = time.time()

# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./kindlab_bert_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()




Step,Training Loss
10,0.1366
20,0.0193
30,0.0073
40,0.0045
50,0.0022
60,0.0018
70,0.0012
80,0.0013
90,0.0012
100,0.0011




TrainOutput(global_step=500, training_loss=0.004089230146259069, metrics={'train_runtime': 409.4137, 'train_samples_per_second': 9.77, 'train_steps_per_second': 1.221, 'total_flos': 1045243723776000.0, 'train_loss': 0.004089230146259069, 'epoch': 5.0})

In [16]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.47it/s]


In [17]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

# End timer
end_time = time.time()

# Compute elapsed time
elapsed_time = end_time - start_time

print(f"Time taken: {elapsed_time:.2f} seconds")



Metrics computed using seqeval:
Accuracy: 0.99970703125
Precision: 0.9853768278965129
Recall: 0.9977220956719818
F1: 0.9915110356536503

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.85      0.97      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.99      1.00      0.99       878
   macro avg       0.98      1.00      0.99       878
weighted avg       0.99      1.00      0.99       878

Time taken: 414.45 seconds
