In [1]:
pip install -U transformers datasets seqeval scikit-learn

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m823.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.7 MB/s[0m eta [3

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive
/content/gdrive/MyDrive


# Training Fresh on Synthetic Data with best set of hyperparameters

In [3]:
import json
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from collections import defaultdict

In [4]:
# Load data
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}

with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],  # correcting stop -> end
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")  # Normalize label
    })

In [16]:
# Step 4: Prepare examples (text + labels)
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Train/test split
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)


In [17]:
# Step 6: Load tokenizer
model_checkpoint = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 7: Create label list
label_list = list({label["label"].replace(" ", "_") for _, anns in examples for label in anns})
label_list = ["O"] + ["B-" + label for label in label_list] + ["I-" + label for label in label_list]



In [18]:
label_list

['O',
 'B-AGE',
 'B-NAME',
 'B-ID',
 'B-DATE',
 'B-LOCATION',
 'B-CONTACT',
 'I-AGE',
 'I-NAME',
 'I-ID',
 'I-DATE',
 'I-LOCATION',
 'I-CONTACT']

In [19]:
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [20]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                if labels_aligned[i] == "O":
                    labels_aligned[i] = "B-" + label.replace(" ", "_")
                else:
                    labels_aligned[i] = "I-" + label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [21]:
# Step 9: Tokenize and align the full dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [22]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./deberta_base_cased_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()

# Step 15: Save the model
trainer.save_model("./deberta_base_cased_finetuned")




Step,Training Loss
20,0.9529
40,0.0532
60,0.0145
80,0.0074
100,0.0053
120,0.0045
140,0.0048
160,0.0045
180,0.0034
200,0.0043


In [24]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:04<00:00,  2.71it/s]


In [25]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

Metrics computed using seqeval:
Accuracy: 0.99923828125
Precision: 0.9877539660450877
Recall: 0.9957912457912458
F1: 0.9917563224814867

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.83      0.99      0.90        78
     CONTACT       1.00      1.00      1.00       854
        DATE       0.99      0.99      0.99      1490
          ID       0.97      0.99      0.98       288
    LOCATION       1.00      1.00      1.00       442
        NAME       1.00      1.00      1.00       412

   micro avg       0.99      1.00      0.99      3564
   macro avg       0.96      0.99      0.98      3564
weighted avg       0.99      1.00      0.99      3564



In [26]:
from transformers import pipeline
# Prepare NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Compare predictions with annotations
for doc in summaries[:5]:  # Compare first 5 examples
    doc_id = doc["document_id"]
    text = doc["text"]
    print(f"\n================== {doc_id} ==================")
    print(f"Text Sample:\n{text[:400]}...\n")

    # Ground truth annotationsS
    print("Ground Truth Annotations:")
    for ann in annotations_by_doc.get(doc_id, []):
        snippet = text[ann['start']:ann['end']]
        print(f"{ann['label']}: '{snippet}' (start: {ann['start']}, end: {ann['end']})")

    # Model predictions
    print("\nModel Predictions:")
    preds = ner(text)
    for pred in preds:
        print(f"{pred['entity_group']}: '{pred['word']}' (start: {pred['start']}, end: {pred['end']})")

    print("="*60)

Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Text Sample:
Name: Ashley Wolfe    Unit No: 1110277
Admission Date: 23/08/2023    Discharge Date: 25/08/2023
Date of Birth: 13/07/2009    Age: 57    Sex: F
Service: Paediatrics
Attending: Paula Sutton

Chief Complaint: Patient presented with complaints relevant to paediatrics evaluation.

History of Present Illness:
Ashley Wolfe, a 57-year-old logistics and distribution manager from Robertburgh, KS, was admitt...

Ground Truth Annotations:
NAME: 'Ashley Wolfe' (start: 6, end: 18)
AGE: '57' (start: 130, end: 132)
DATE: '23/08/2023' (start: 55, end: 65)
DATE: '25/08/2023' (start: 85, end: 95)
DATE: '13/07/2009' (start: 111, end: 121)
ID: '1110277' (start: 31, end: 38)
LOCATION: 'Robertburgh, KS' (start: 373, end: 388)
CONTACT: '053-606-5681x5409' (start: 1665, end: 1682)
NAME: 'Paula Sutton' (start: 175, end: 187)

Model Predictions:
NAME: 'Ashley' (start: 5, end: 12)
NAME: 'Wolfe' (start: 12, end: 18)
ID: '11' (start: 30, end: 33)
ID: '10' (start: 33, end: 35)
ID: '277' (start: 35, end