In [1]:
pip install -U transformers datasets seqeval scikit-learn

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m3.8 MB/s[0m eta [36

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
%cd gdrive/MyDrive


Mounted at /content/gdrive
/content/gdrive/MyDrive


# Predict using pretrained KindLab Roberta-Deid

In [3]:
import json
import random
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline
import torch
from sklearn.metrics import accuracy_score


In [4]:
# Load KindLab model and tokenizer
model_name = "KindLab/roberta-deid"
tokenizer = AutoTokenizer.from_pretrained("roberta-base")  # since model has no tokenizer
model = AutoModelForTokenClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [5]:
# Load data
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}
with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [6]:
# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")
    })

# Step 4: Prepare examples
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Split train/test
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)

In [7]:
# Step 7: Create flat label list
label_list = ["O", "AGE", "CONTACT", "DATE", "ID", "LOCATION", "NAME", "PROFESSION"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [8]:
label_list
label2id

{'O': 0,
 'AGE': 1,
 'CONTACT': 2,
 'DATE': 3,
 'ID': 4,
 'LOCATION': 5,
 'NAME': 6,
 'PROFESSION': 7}

In [9]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                labels_aligned[i] = label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [10]:
# Step 9: Tokenize and align the full dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [11]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)


Evaluating:   0%|          | 0/13 [00:00<?, ?it/s][A
Evaluating:   8%|▊         | 1/13 [00:00<00:08,  1.35it/s][A
Evaluating:  15%|█▌        | 2/13 [00:00<00:04,  2.35it/s][A
Evaluating:  23%|██▎       | 3/13 [00:01<00:03,  2.97it/s][A
Evaluating:  31%|███       | 4/13 [00:01<00:02,  3.49it/s][A
Evaluating:  38%|███▊      | 5/13 [00:01<00:02,  3.90it/s][A
Evaluating:  46%|████▌     | 6/13 [00:01<00:01,  4.20it/s][A
Evaluating:  54%|█████▍    | 7/13 [00:01<00:01,  4.42it/s][A
Evaluating:  62%|██████▏   | 8/13 [00:02<00:01,  4.51it/s][A
Evaluating:  69%|██████▉   | 9/13 [00:02<00:00,  4.65it/s][A
Evaluating:  77%|███████▋  | 10/13 [00:02<00:00,  4.74it/s][A
Evaluating:  85%|████████▍ | 11/13 [00:02<00:00,  4.82it/s][A
Evaluating:  92%|█████████▏| 12/13 [00:03<00:00,  4.85it/s][A
Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.15it/s]


In [12]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))



Metrics computed using seqeval:
Accuracy: 0.9671484375
Precision: 0.5512715340442986
Recall: 0.765375854214123
F1: 0.6409155937052933

Detailed Classification Report:



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         AME       0.64      0.96      0.77       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.39      1.00      0.57        78
     OCATION       0.00      0.00      0.00       100
      ONTACT       1.00      0.02      0.04       100
   ROFESSION       0.00      0.00      0.00         0

   micro avg       0.55      0.77      0.64       878
   macro avg       0.58      0.57      0.48       878
weighted avg       0.75      0.77      0.69       878



In [13]:
# Prepare NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


Device set to use cuda:0


In [14]:
# Compare predictions with annotations
for doc in summaries[:5]:  # Compare first 5 examples
    doc_id = doc["document_id"]
    text = doc["text"]
    print(f"\n================== {doc_id} ==================")
    print(f"Text Sample:\n{text[:400]}...\n")

    # Ground truth annotations
    print("Ground Truth Annotations:")
    for ann in annotations_by_doc.get(doc_id, []):
        snippet = text[ann['start']:ann['end']]
        print(f"{ann['label']}: '{snippet}' (start: {ann['start']}, end: {ann['end']})")

    # Model predictions
    print("\nModel Predictions:")
    preds = ner(text)
    for pred in preds:
        print(f"{pred['entity_group']}: '{pred['word']}' (start: {pred['start']}, end: {pred['end']})")

    print("="*60)


Text Sample:
Name: Ashley Wolfe    Unit No: 1110277
Admission Date: 23/08/2023    Discharge Date: 25/08/2023
Date of Birth: 13/07/2009    Age: 57    Sex: F
Service: Paediatrics
Attending: Paula Sutton

Chief Complaint: Patient presented with complaints relevant to paediatrics evaluation.

History of Present Illness:
Ashley Wolfe, a 57-year-old logistics and distribution manager from Robertburgh, KS, was admitt...

Ground Truth Annotations:
NAME: 'Ashley Wolfe' (start: 6, end: 18)
AGE: '57' (start: 130, end: 132)
DATE: '23/08/2023' (start: 55, end: 65)
DATE: '25/08/2023' (start: 85, end: 95)
DATE: '13/07/2009' (start: 111, end: 121)
ID: '1110277' (start: 31, end: 38)
LOCATION: 'Robertburgh, KS' (start: 373, end: 388)
CONTACT: '053-606-5681x5409' (start: 1665, end: 1682)
NAME: 'Paula Sutton' (start: 175, end: 187)

Model Predictions:
NAME: ' Ashley Wolfe' (start: 6, end: 18)
ID: ' 1110277' (start: 31, end: 38)
DATE: ' 23/08/2023' (start: 55, end: 65)
DATE: ' 25/08/2023' (start: 85, end:

# Finetune KindLab with Synthetic Data with best set of hyperparameters

In [15]:
import json
import random
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline
from transformers import DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [16]:
# Step 1: Load discharge summaries
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}

# Step 2: Load annotations
with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

In [17]:
# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")
    })

# Step 4: Prepare examples
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Split train/test
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)

In [18]:
# Step 6: Load tokenizer
model_checkpoint = "KindLab/roberta-deid"
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Step 7: Create flat label list
label_list = ["O", "AGE", "CONTACT", "DATE", "ID", "LOCATION", "NAME", "PROFESSION"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [19]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                labels_aligned[i] = label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [20]:
# Step 9: Tokenize dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


In [21]:
!pip install evaluate
from transformers import TrainerCallback
from evaluate import load

# Load metrics
seqeval = load("seqeval")

# Custom compute_metrics function
def compute_metrics(predictions):
    preds, labels = predictions
    preds = preds.argmax(-1)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(preds, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [22]:
# Step 11: Setup Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

# Step 12: Define training arguments (best hyperparameters based on tuning)
training_args = TrainingArguments(
    output_dir="./kindlab_roberta_deid_finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=200,
    save_total_limit=2,
    remove_unused_columns=False
)

# Step 13: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Step 14: Train
trainer.train()

# Step 15: Save model
trainer.save_model("./kindlab_roberta_deid_finetuned")



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msamyakjainuiuc[0m ([33msamyakjainuiuc-university-of-illionis-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,0.2996
20,0.1196
30,0.0839
40,0.0258
50,0.0051
60,0.0034
70,0.0014
80,0.0014
90,0.0011
100,0.0012




In [23]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:03<00:00,  4.27it/s]


In [24]:
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))




Metrics computed using seqeval:
Accuracy: 0.9996875
Precision: 0.9831838565022422
Recall: 0.9988610478359908
F1: 0.9909604519774012

Detailed Classification Report:

              precision    recall  f1-score   support

         AME       1.00      1.00      1.00       200
         ATE       1.00      1.00      1.00       300
           D       1.00      1.00      1.00       100
          GE       0.84      0.99      0.91        78
     OCATION       1.00      1.00      1.00       100
      ONTACT       1.00      1.00      1.00       100

   micro avg       0.98      1.00      0.99       878
   macro avg       0.97      1.00      0.98       878
weighted avg       0.99      1.00      0.99       878



In [25]:
# Prepare NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

Device set to use cuda:0


In [26]:
# Compare predictions with annotations
for doc in summaries[:5]:  # Compare first 5 examples
    doc_id = doc["document_id"]
    text = doc["text"]
    print(f"\n================== {doc_id} ==================")
    print(f"Text Sample:\n{text[:400]}...\n")

    # Ground truth annotations
    print("Ground Truth Annotations:")
    for ann in annotations_by_doc.get(doc_id, []):
        snippet = text[ann['start']:ann['end']]
        print(f"{ann['label']}: '{snippet}' (start: {ann['start']}, end: {ann['end']})")

    # Model predictions
    print("\nModel Predictions:")
    preds = ner(text)
    for pred in preds:
        print(f"{pred['entity_group']}: '{pred['word']}' (start: {pred['start']}, end: {pred['end']})")

    print("="*60)


Text Sample:
Name: Ashley Wolfe    Unit No: 1110277
Admission Date: 23/08/2023    Discharge Date: 25/08/2023
Date of Birth: 13/07/2009    Age: 57    Sex: F
Service: Paediatrics
Attending: Paula Sutton

Chief Complaint: Patient presented with complaints relevant to paediatrics evaluation.

History of Present Illness:
Ashley Wolfe, a 57-year-old logistics and distribution manager from Robertburgh, KS, was admitt...

Ground Truth Annotations:
NAME: 'Ashley Wolfe' (start: 6, end: 18)
AGE: '57' (start: 130, end: 132)
DATE: '23/08/2023' (start: 55, end: 65)
DATE: '25/08/2023' (start: 85, end: 95)
DATE: '13/07/2009' (start: 111, end: 121)
ID: '1110277' (start: 31, end: 38)
LOCATION: 'Robertburgh, KS' (start: 373, end: 388)
CONTACT: '053-606-5681x5409' (start: 1665, end: 1682)
NAME: 'Paula Sutton' (start: 175, end: 187)

Model Predictions:
NAME: ' Ashley Wolfe' (start: 6, end: 18)
ID: ' 1110277' (start: 31, end: 38)
DATE: ' 23/08/2023' (start: 55, end: 65)
DATE: ' 25/08/2023' (start: 85, end:

# Training Fresh on Synthetic Data with best set of hyperparameters

In [27]:
import json
import random
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from collections import defaultdict

In [28]:
# Load data
import json
from collections import defaultdict
datapath = '/content/gdrive/MyDrive/full_discharge_summaries_and_annotations/'
with open(datapath+"discharge_summaries.json", "r") as f:
    summaries = json.load(f)
text_by_docid = {item["document_id"]: item["text"] for item in summaries}

with open(datapath+"annotations.json", "r") as f:
    annotations = json.load(f)

# Step 3: Group annotations by document_id
annotations_by_doc = defaultdict(list)
for ann in annotations:
    doc_id = ann["document_id"]
    annotations_by_doc[doc_id].append({
        "start": ann["start"],
        "end": ann["stop"],  # correcting stop -> end
        "label": (ann["entity_type"].replace("IDNUM", "ID")).replace("PHONE", "CONTACT")  # Normalize label
    })

In [29]:
# Step 4: Prepare examples (text + labels)
examples = []
for doc_id, labels in annotations_by_doc.items():
    if doc_id not in text_by_docid:
        continue
    text = text_by_docid[doc_id]
    examples.append((text, labels))

# Step 5: Train/test split
train_data, test_data = train_test_split(examples, test_size=0.2, random_state=42)


In [30]:
# Step 6: Load tokenizer
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 7: Create label list
label_list = list({label["label"].replace(" ", "_") for _, anns in examples for label in anns})
label_list = ["O"] + ["B-" + label for label in label_list] + ["I-" + label for label in label_list]

In [31]:
label_list

['O',
 'B-ID',
 'B-DATE',
 'B-CONTACT',
 'B-LOCATION',
 'B-AGE',
 'B-NAME',
 'I-ID',
 'I-DATE',
 'I-CONTACT',
 'I-LOCATION',
 'I-AGE',
 'I-NAME']

In [32]:
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [33]:
# Step 8: Align labels with tokens
def align_labels_with_tokens(text, labels, tokenizer):
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_offsets_mapping=True
    )
    labels_aligned = ["O"] * len(tokenized_inputs["input_ids"])
    offset_mapping = tokenized_inputs.pop("offset_mapping")

    for entity in labels:
        start, end, label = entity["start"], entity["end"], entity["label"]
        for i, (offset_start, offset_end) in enumerate(offset_mapping):
            if offset_start >= end:
                break
            if offset_end > start and offset_start < end:
                if labels_aligned[i] == "O":
                    labels_aligned[i] = "B-" + label.replace(" ", "_")
                else:
                    labels_aligned[i] = "I-" + label.replace(" ", "_")

    return {
        "input_ids": tokenized_inputs["input_ids"],
        "attention_mask": tokenized_inputs["attention_mask"],
        "labels": [label2id.get(lbl, 0) for lbl in labels_aligned]
    }

In [34]:
# Step 9: Tokenize and align the full dataset
train_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in train_data])
test_dataset = Dataset.from_list([align_labels_with_tokens(x[0], x[1], tokenizer) for x in test_data])

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [35]:
# Step 10: Load model
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Step 11: Set up Data Collator
data_collator = DataCollatorForTokenClassification(
    tokenizer,
    padding=True,
    max_length=512,
    return_tensors="pt"
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Step 12: Define training arguments
training_args = TrainingArguments(
    output_dir="./roberta_base_finetuned",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    remove_unused_columns=False
)

# Step 13: Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator
)

# Step 14: Train the model
trainer.train()

# Step 15: Save the model
trainer.save_model("./roberta_base_finetuned")




Step,Training Loss
20,0.8496
40,0.0719
60,0.0135
80,0.0074
100,0.0056
120,0.0053


In [37]:
#Evaluate Metrics
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

small_test_dataset = dataset["test"].select(range(100))
# Set up DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True, return_tensors="pt")
test_dataloader = DataLoader(small_test_dataset, batch_size=8, collate_fn=data_collator)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

all_preds = []
all_labels = []

# Evaluate
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        all_preds.append(predictions.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Stack all batches
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Decode predictions
true_predictions = []
true_labels = []

for prediction, label in zip(all_preds, all_labels):
    temp_pred = []
    temp_label = []
    for p, l in zip(prediction, label):
        if l == -100:
            continue
        temp_pred.append(id2label[p])
        temp_label.append(id2label[l])
    true_predictions.append(temp_pred)
    true_labels.append(temp_label)

Evaluating: 100%|██████████| 13/13 [00:02<00:00,  4.52it/s]


In [38]:
# Compute Metrics
print("Metrics computed using seqeval:")
print("Accuracy:", accuracy_score(true_labels, true_predictions))
print("Precision:", precision_score(true_labels, true_predictions))
print("Recall:", recall_score(true_labels, true_predictions))
print("F1:", f1_score(true_labels, true_predictions))
print("\nDetailed Classification Report:\n")
print(classification_report(true_labels, true_predictions))

Metrics computed using seqeval:
Accuracy: 0.9991796875
Precision: 0.9873449131513647
Recall: 0.9959949937421777
F1: 0.9916510903426792

Detailed Classification Report:

              precision    recall  f1-score   support

         AGE       0.80      1.00      0.89        78
     CONTACT       1.00      1.00      1.00       942
        DATE       0.99      0.99      0.99      1727
          ID       0.96      0.98      0.97       316
    LOCATION       1.00      1.00      1.00       461
        NAME       1.00      1.00      1.00       471

   micro avg       0.99      1.00      0.99      3995
   macro avg       0.96      1.00      0.98      3995
weighted avg       0.99      1.00      0.99      3995



In [39]:
from transformers import pipeline
# Prepare NER pipeline
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Compare predictions with annotations
for doc in summaries[:5]:  # Compare first 5 examples
    doc_id = doc["document_id"]
    text = doc["text"]
    print(f"\n================== {doc_id} ==================")
    print(f"Text Sample:\n{text[:400]}...\n")

    # Ground truth annotationsS
    print("Ground Truth Annotations:")
    for ann in annotations_by_doc.get(doc_id, []):
        snippet = text[ann['start']:ann['end']]
        print(f"{ann['label']}: '{snippet}' (start: {ann['start']}, end: {ann['end']})")

    # Model predictions
    print("\nModel Predictions:")
    preds = ner(text)
    for pred in preds:
        print(f"{pred['entity_group']}: '{pred['word']}' (start: {pred['start']}, end: {pred['end']})")

    print("="*60)

Device set to use cuda:0



Text Sample:
Name: Ashley Wolfe    Unit No: 1110277
Admission Date: 23/08/2023    Discharge Date: 25/08/2023
Date of Birth: 13/07/2009    Age: 57    Sex: F
Service: Paediatrics
Attending: Paula Sutton

Chief Complaint: Patient presented with complaints relevant to paediatrics evaluation.

History of Present Illness:
Ashley Wolfe, a 57-year-old logistics and distribution manager from Robertburgh, KS, was admitt...

Ground Truth Annotations:
NAME: 'Ashley Wolfe' (start: 6, end: 18)
AGE: '57' (start: 130, end: 132)
DATE: '23/08/2023' (start: 55, end: 65)
DATE: '25/08/2023' (start: 85, end: 95)
DATE: '13/07/2009' (start: 111, end: 121)
ID: '1110277' (start: 31, end: 38)
LOCATION: 'Robertburgh, KS' (start: 373, end: 388)
CONTACT: '053-606-5681x5409' (start: 1665, end: 1682)
NAME: 'Paula Sutton' (start: 175, end: 187)

Model Predictions:
NAME: ' Ashley' (start: 6, end: 12)
NAME: ' Wolfe' (start: 13, end: 18)
ID: ' 11' (start: 31, end: 33)
ID: '10' (start: 33, end: 35)
ID: '277' (start: 35, 