## Mount Google Drive (only when running in a Colab notebook)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/NER_Model_for_i2b2/TakeHome_Assignment2/notebooks

/content/drive/MyDrive/NER_Model_for_i2b2/TakeHome_Assignment2/notebooks


In [3]:

%pwd

'/content/drive/MyDrive/NER_Model_for_i2b2/TakeHome_Assignment2/notebooks'

## Load the Dataset

In [32]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [33]:
def load_i2b2(path, separator="_custom_note_separator_", encoding="utf-8"):
    """Read a token-per-line i2b2 file and group its lines into notes.

    Every line is stripped first. A line equal to ``separator`` closes the
    current note (empty notes are never emitted). Any other line is split on
    whitespace and kept only if it has exactly two fields, ``<token> <tag>``;
    lines with a different number of fields (including blank lines) are skipped.

    Args:
        path: Path of the modified raw data file (train, dev or test split).
        separator: Marker line that ends a note. Defaults to the marker used
            by the modified i2b2 files.
        encoding: Text encoding used to open the file. Given explicitly so the
            result does not depend on the platform's locale encoding.

    Returns:
        A ``(notes, labels)`` tuple of parallel lists: ``notes[i]`` is the list
        of tokens of the i-th note and ``labels[i]`` the matching list of tags.
    """
    notes, labels = [], []
    tokens, tags = [], []

    with open(path, "r", encoding=encoding) as f:
        for line in f:
            line = line.strip()

            if line == separator:
                # End of a note; consecutive separators must not create empty notes.
                if tokens:
                    notes.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
                continue

            parts = line.split()
            if len(parts) == 2:
                token, tag = parts
                tokens.append(token)
                tags.append(tag)

    # Flush the last note when the file does not end with a separator.
    if tokens:
        notes.append(tokens)
        labels.append(tags)

    return notes, labels


# Load the three splits; paths are relative to the notebook's working directory.
# NOTE(review): the train file is spelled "train__modified.txt" (double underscore)
# while dev/test use a single underscore — confirm this matches the file on disk.
train_tokens, train_labels = load_i2b2("../i2b2/train__modified.txt")
dev_tokens, dev_labels = load_i2b2("../i2b2/dev_modified.txt")
test_tokens, test_labels = load_i2b2("../i2b2/test_modified.txt")

In [34]:
# Wrap parallel token/label lists in a `datasets.Dataset`
def to_dataset(tokens_list, labels_list):
    """Build a ``Dataset`` with a "tokens" column and a "ner_tags" column.

    ``tokens_list`` and ``labels_list`` are parallel lists with one entry per
    example (as returned by ``load_i2b2``).
    """
    columns = dict(tokens=tokens_list, ner_tags=labels_list)
    return Dataset.from_dict(columns)

# Bundle the three splits under the names used later by the Trainer cells
# ("train" / "validation" / "test"); the dev file becomes the "validation" split.
dataset = DatasetDict({
    "train": to_dataset(train_tokens, train_labels),
    "validation": to_dataset(dev_tokens, dev_labels),
    "test": to_dataset(test_tokens, test_labels)
})

# Show the split names, column names and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 27625
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 2447
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 50
    })
})


In [35]:
dataset["train"][10]

{'tokens': ['Unsigned'], 'ner_tags': ['O']}

In [36]:
# Build the tag vocabulary from all three splits, then derive both lookup tables.
# Sorting keeps the tag -> id assignment stable between runs.
unique_labels = sorted({
    tag
    for split_labels in (train_labels, dev_labels, test_labels)
    for seq in split_labels
    for tag in seq
})
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

print("Label2id:", label2id)
print("id2label:", id2label)


Label2id: {'B-problem': 0, 'B-test': 1, 'B-treatment': 2, 'I-problem': 3, 'I-test': 4, 'I-treatment': 5, 'O': 6}
id2label: {0: 'B-problem', 1: 'B-test', 2: 'B-treatment', 3: 'I-problem', 4: 'I-test', 5: 'I-treatment', 6: 'O'}


In [37]:
def encode_labels(example):
    """Replace the string tags in ``example["ner_tags"]`` with their integer ids.

    Looks each tag up in the module-level ``label2id`` mapping, updates the
    example in place and returns it (the shape expected by ``Dataset.map``).
    """
    encoded = []
    for tag in example["ner_tags"]:
        encoded.append(label2id[tag])
    example["ner_tags"] = encoded
    return example

dataset = dataset.map(encode_labels)

Map:   0%|          | 0/27625 [00:00<?, ? examples/s]

Map:   0%|          | 0/2447 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [38]:
dataset["train"][6]

{'tokens': ['Discharge', 'Summary'], 'ner_tags': [6, 6]}

## Model Training

**Model Justification**

I used `emilyalsentzer/Bio_ClinicalBERT` because it is pretrained on large-scale clinical notes and therefore captures domain-specific medical language, which should make it more accurate on this task than a general-purpose BERT model.

In [39]:
from transformers import AutoTokenizer

model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [40]:
# Create tokenized dataset
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split words and build one label per word piece.

    Every sequence is truncated/padded to 128 word pieces.

    - The first word piece of each word receives that word's label id.
    - Later word pieces of the same word, special tokens ([CLS], [SEP]) and
      padding positions receive `-100`.

    Why `-100`?
    The Trainer ignores `-100` during loss computation, so only the first
    word piece of each word contributes to the training loss.
    """
    ignore_id = -100

    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    def align(word_ids, word_labels):
        # word_ids maps each word piece to its source word index (None for
        # special/padding positions); a change of index marks a new word.
        aligned = []
        previous = None
        for current in word_ids:
            starts_word = current is not None and current != previous
            aligned.append(word_labels[current] if starts_word else ignore_id)
            previous = current
        return aligned

    encoded["labels"] = [
        align(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

# example = dataset["train"][2]
# print(example)
# tokenize_and_align_labels(example)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/27625 [00:00<?, ? examples/s]

Map:   0%|          | 0/2447 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [19]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head that has one
# output per tag; the id<->tag maps are handed to from_pretrained alongside it.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
%pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=054285ed6204f7edddc97a1aada0b564bad9f51c273beca082b9263d4744322a
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [21]:
# Defining Metrics to Compute for evaluation
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(p):
    """Compute seqeval precision, recall and F1 for the Trainer.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-position
            class scores (argmax is taken over axis 2); ``labels`` holds the
            gold label ids, with ``-100`` at positions to ignore.

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    gold_tags, pred_tags = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions that carry a real label (drop the -100 ones).
        kept = [(g, q) for q, g in zip(pred_row, gold_row) if g != -100]
        gold_tags.append([id2label[g] for g, _ in kept])
        pred_tags.append([id2label[q] for _, q in kept])

    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    return {name: scorer(gold_tags, pred_tags) for name, scorer in scorers}


In [23]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

# Collator used to batch the tokenized examples for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluate and save a checkpoint at the end of every epoch; checkpoints go to
# ../models and logs to ../logs. External experiment trackers are disabled.
training_args = TrainingArguments(
    output_dir="../models",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="../logs",
    logging_steps=50,
    report_to="none"
)

# NOTE(review): the cell output shows a warning pointing at this `Trainer(` call —
# possibly a deprecation of the `tokenizer=` argument; confirm against the
# installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1084,0.13308,0.833942,0.869234,0.851222
2,0.0553,0.145806,0.838886,0.87399,0.856078
3,0.0381,0.162938,0.839726,0.874465,0.856744


TrainOutput(global_step=5181, training_loss=0.09425786116297608, metrics={'train_runtime': 1810.1403, 'train_samples_per_second': 45.784, 'train_steps_per_second': 2.862, 'total_flos': 5413986905184000.0, 'train_loss': 0.09425786116297608, 'epoch': 3.0})

## Overall Evaluation

In [25]:
# Overall
trainer.evaluate()


{'eval_loss': 0.16293832659721375,
 'eval_precision': 0.8397260273972603,
 'eval_recall': 0.8744650499286734,
 'eval_f1': 0.856743535988819,
 'eval_runtime': 18.3512,
 'eval_samples_per_second': 133.343,
 'eval_steps_per_second': 8.337,
 'epoch': 3.0}

## Entity Level Metrics Evaluation

In [26]:
import numpy as np
def get_true_label_true_preds(predictions_output):
    """Convert a prediction output into parallel lists of gold and predicted tags.

    Reads ``predictions_output.predictions`` (argmax over axis 2) and
    ``predictions_output.label_ids``. Positions whose gold id is ``-100`` are
    dropped; the remaining ids are mapped to tag strings through ``id2label``.

    Returns:
        ``(true_labels, true_predictions)``: two lists with one tag list per
        sequence, aligned position by position.
    """
    pred_ids = np.argmax(predictions_output.predictions, axis=2)
    gold_ids = predictions_output.label_ids

    true_labels, true_predictions = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Only positions with a real gold label are evaluated.
        kept = [(g, q) for q, g in zip(pred_row, gold_row) if g != -100]
        true_labels.append([id2label[g] for g, _ in kept])
        true_predictions.append([id2label[q] for _, q in kept])

    return true_labels, true_predictions


In [None]:
# Run prediction on validation dataset
# (`predictions_output` therefore holds VALIDATION-split predictions)
predictions_output = trainer.predict(tokenized_dataset["validation"])

# Drop ignored (-100) positions and map ids back to tag strings
true_labels, true_predictions = get_true_label_true_preds(predictions_output)

# Per-entity-type precision / recall / F1 table from seqeval
report = classification_report(true_labels, true_predictions)
print(report)

              precision    recall  f1-score   support

     problem       0.84      0.86      0.85       988
        test       0.86      0.90      0.88       504
   treatment       0.82      0.87      0.85       611

   micro avg       0.84      0.87      0.86      2103
   macro avg       0.84      0.88      0.86      2103
weighted avg       0.84      0.87      0.86      2103



## Load and Test Model

In [58]:
from transformers import BertForTokenClassification, BertTokenizerFast, Trainer

# path where Trainer saved your model (change to your actual path)
# NOTE(review): training ran 5181 steps over 3 epochs, so checkpoint-3454 is
# presumably the end-of-epoch-2 save rather than the final weights — confirm
# this is the checkpoint intended for testing.
model_path = "../models/checkpoint-3454"

# reload tokenizer and model
loaded_tokenizer = BertTokenizerFast.from_pretrained(model_path)
loaded_model = BertForTokenClassification.from_pretrained(model_path)

# reinitialize trainer for evaluation
# (no compute_metrics / data_collator are passed; it is only used for .predict())
loaded_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer, args=TrainingArguments(report_to="none"))


  loaded_trainer = Trainer(model=loaded_model, tokenizer=loaded_tokenizer, args=TrainingArguments(report_to="none"))


In [59]:
# Predict on the TEST split with the reloaded checkpoint.
# Despite its name, `metrics` is the full output of .predict(); it is passed to
# get_true_label_true_preds, which reads its .predictions and .label_ids.
metrics = loaded_trainer.predict(tokenized_dataset["test"])
true_labels, true_predictions = get_true_label_true_preds(metrics)

print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

     problem       0.79      0.86      0.82        43
        test       0.96      0.83      0.89        53
   treatment       1.00      1.00      1.00         2

   micro avg       0.87      0.85      0.86        98
   macro avg       0.91      0.90      0.90        98
weighted avg       0.88      0.85      0.86        98



In [105]:
import torch

def get_entities(prediction):
    """Group token-level BIO predictions into entity strings.

    Args:
        prediction: Iterable of ``(token, label)`` pairs. ``token`` is a
            WordPiece token (continuation pieces start with ``##``) and
            ``label`` is ``"O"`` or a ``"B-<type>"`` / ``"I-<type>"`` tag.

    Returns:
        dict with the keys ``"PROBLEM"``, ``"TREATMENT"`` and ``"TEST"``. Each
        value lists the entity strings of that type in order of appearance,
        with the words of an entity joined by single spaces.

    Decoding rules:
        - ``[CLS]``, ``[SEP]`` and ``[PAD]`` are skipped.
        - A ``##`` piece is glued onto the previous piece of the open entity
          and its own label is ignored: during training only the first piece
          of a word is labelled, so the label predicted for later pieces is
          not meaningful. A ``##`` piece of a word that is not inside an
          entity is dropped instead of starting a fragment entity.
        - ``B-<type>`` always opens a new entity, even when an entity of the
          same type is still open (adjacent entities are kept separate).
        - ``I-<type>`` extends an open entity of the same type; otherwise it
          leniently opens a new one.
        - Any other label closes the open entity.
    """
    entities = {"PROBLEM": [], "TREATMENT": [], "TEST": []}
    special_tokens = ("[CLS]", "[SEP]", "[PAD]")

    current_entity = []   # words of the entity being built
    current_label = None  # its type, e.g. "PROBLEM"

    for token, label in prediction:
        if token in special_tokens:
            continue

        if token.startswith("##"):
            # Continuation piece: it belongs to the previous word, whatever
            # label the model happened to emit for it.
            if current_entity:
                current_entity[-1] += token[2:]
            continue

        entity = label.split("-")[-1].upper()
        begins_entity = label.upper().startswith("B-")

        if entity in entities and entity == current_label and not begins_entity:
            # I-<type> continuing the open entity.
            current_entity.append(token)
            continue

        # Everything else ends the open entity (if any) ...
        if current_entity and current_label:
            entities[current_label].append(" ".join(current_entity))
        current_entity, current_label = [], None

        # ... and an entity tag immediately opens the next one.
        if entity in entities:
            current_entity = [token]
            current_label = entity

    # Save the entity still open at the end of the sequence.
    if current_entity and current_label:
        entities[current_label].append(" ".join(current_entity))

    return entities

def predict_entities(text):
    """Tag a raw text string with the reloaded model.

    The text is split on whitespace, tokenized as pre-split words, run through
    ``loaded_model`` without gradient tracking, and each word piece is given
    the tag with the highest logit.

    Returns:
        ``(results, entities)``: ``results`` is a list of
        ``(word_piece, predicted_tag)`` pairs with [CLS]/[SEP]/[PAD] removed;
        ``entities`` is what ``get_entities`` builds from those pairs.
    """
    words = text.split()
    encoding = loaded_tokenizer(
        words,
        truncation=True,
        is_split_into_words=True,
        return_tensors="pt",
    )

    # The inputs must live on the same device as the model weights.
    target_device = loaded_model.device
    model_inputs = {name: tensor.to(target_device) for name, tensor in encoding.items()}

    with torch.no_grad():
        logits = loaded_model(**model_inputs).logits

    # Single-sequence batch: take row 0 of both the predictions and the ids.
    pred_ids = torch.argmax(logits, dim=2)[0]
    word_pieces = loaded_tokenizer.convert_ids_to_tokens(model_inputs["input_ids"][0])

    skipped = ["[CLS]", "[SEP]", "[PAD]"]
    results = []
    for piece, pred_id in zip(word_pieces, pred_ids):
        if piece in skipped:
            continue
        results.append((piece, id2label[pred_id.item()]))

    return results, get_entities(results)

# Examples
# note = "Patient was prescribed aspirin for chest pain."
# note = "Patient was prescribed aspirin for chest pain and headache."
# note = "The patient was given ibuprofen for fever and sore throat."
# Free-text example to tag (swap in one of the commented notes above to try others)
note = "CT scan of the chest showed signs of pneumonia."

prediction, entities = predict_entities(note)

# Entity Level Results
print("Problems:", entities["PROBLEM"])
print("Treatments:", entities["TREATMENT"])
print("Tests:", entities["TEST"])
print()

# Token Level Results
# (one row per word piece; "##" marks a continuation piece of the previous word)
for token, label in prediction:
    print(f"{token:<15} {label}")

Problems: ['pneumonia']
Treatments: []
Tests: ['ct scan of the chest']

c               B-test
##t             I-test
scan            I-test
of              I-test
the             I-test
chest           I-test
showed          O
signs           O
of              O
pneumonia       B-problem
.               O


## Error Analysis

In [102]:
def build_error_dataframe(predictions_output, tokenized_dataset, id2label, tokenizer):
    """Collect every labelled position where the predicted tag differs from gold.

    Args:
        predictions_output: Object exposing ``.predictions`` (per-position class
            scores, argmax taken over axis 2) and ``.label_ids`` (gold ids,
            ``-100`` at positions to ignore).
        tokenized_dataset: The tokenized split the predictions were computed
            on; only its ``"input_ids"`` column is read. It must be the same
            split, in the same order, as ``predictions_output``.
        id2label: Mapping from label id to tag string.
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``.

    Returns:
        ``pandas.DataFrame`` with the columns ``sample_id``, ``token``,
        ``true_label`` and ``pred_label`` — one row per mismatch. The columns
        are present even when there are no mismatches.
    """
    import warnings

    columns = ["sample_id", "token", "true_label", "pred_label"]

    preds = np.argmax(predictions_output.predictions, axis=2)
    labels = predictions_output.label_ids
    all_input_ids = tokenized_dataset["input_ids"]

    # zip() below would silently pair predictions with the wrong sentences if
    # the predictions came from a different split; make that visible.
    if len(preds) != len(all_input_ids):
        warnings.warn(
            f"predictions cover {len(preds)} sequences but the dataset has "
            f"{len(all_input_ids)}; they are probably from different splits"
        )

    mismatches = []

    for i, (pred, lab, input_ids) in enumerate(zip(preds, labels, all_input_ids)):
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        for tok, p, l in zip(tokens, pred, lab):
            if l == -100:
                # Position without a gold label: not evaluated.
                continue
            gold = id2label[l]
            pred_lab = id2label[p]

            if gold != pred_lab:
                mismatches.append({
                    "sample_id": i,
                    "token": tok,
                    "true_label": gold,
                    "pred_label": pred_lab
                })

    # Passing `columns` keeps the schema stable when `mismatches` is empty.
    return pd.DataFrame(mismatches, columns=columns)


# Get mismatches dataframe
# The predictions must come from the same split as the tokens they are compared
# with. `predictions_output` holds validation predictions, so predict on the
# test split explicitly here instead of reusing it.
test_predictions_output = loaded_trainer.predict(tokenized_dataset["test"])
df_errors = build_error_dataframe(test_predictions_output, tokenized_dataset["test"], id2label, loaded_tokenizer)
df_errors.head()

Unnamed: 0,sample_id,token,true_label,pred_label
0,4,and,I-problem,O
1,4,numb,I-problem,B-problem
2,16,g,O,B-test
3,17,",",I-problem,O
4,21,diminished,O,B-problem


In [103]:
# Checking a sample id
# (sample 21 of the test split appears in the error table above)
print(" ".join(tokenized_dataset["test"][21]["tokens"]))

# Gold tags of that sample, mapped back from ids to tag strings
lbls = tokenized_dataset["test"][21]["ner_tags"]
[id2label[l] for l in lbls]

V1 through V3 , diminished light touch and pinprick in the left side .


['O',
 'O',
 'O',
 'O',
 'O',
 'B-test',
 'I-test',
 'O',
 'B-test',
 'O',
 'O',
 'O',
 'O',
 'O']

In [106]:
# Model Output
# Same sentence as test sample 21, retyped as free text.
# NOTE(review): here punctuation is attached to the words ("V3,", "side.") whereas
# the dataset tokens have it separated — presumably harmless since the tokenizer
# splits punctuation itself; confirm.
note = "V1 through V3, diminished light touch and pinprick in the left side."
prediction, entities = predict_entities(note)

# Entity Level Results
print("Problems:", entities["PROBLEM"])
print("Treatments:", entities["TREATMENT"])
print("Tests:", entities["TEST"])
print()

# Token Level Results
for token, label in prediction:
    print(f"{token:<15} {label}")

Problems: ['diminished light touch', 'pinprick in the left side']
Treatments: []
Tests: []

v               O
##1             O
through         O
v               O
##3             O
,               O
diminished      B-problem
light           I-problem
touch           I-problem
and             O
pin             B-problem
##p             I-problem
##rick          I-problem
in              I-problem
the             I-problem
left            I-problem
side            I-problem
.               O


**Probable reason for the error**

- The i2b2 dataset often labels neurological exam items such as "light touch" and "pinprick" as tests (they are examination findings).

- Semantically, however, phrases like "diminished light touch" also look like problems/symptoms, so the model tags them as problems.

**Solution that might help**
- Data augmentation of the error cases, followed by re-training.