In [None]:
!pip install transformers datasets seqeval -q

In [None]:
import pandas as pd
import ast
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import numpy as np
from seqeval.metrics import classification_report

In [None]:
from datasets import load_dataset

# Load the MACCROBAT biomedical NER dataset by its Hugging Face dataset id
raw_dataset = load_dataset("singh-aditya/MACCROBAT_biomedical_ner")

# Print dataset structure (splits, feature names and row counts)
print(raw_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['full_text', 'ner_info', 'tokens', 'ner_labels'],
        num_rows: 200
    })
})


In [None]:
# Read the label vocabulary from the dataset schema and build both lookup directions
features = raw_dataset["train"].features
label_list = features["ner_labels"].feature.names

label2id = {}
for position, name in enumerate(label_list):
    label2id[name] = position

id2label = {position: name for name, position in label2id.items()}

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Uses the notebook-level ``tokenizer``. For every sentence, the first
    sub-token of each word receives that word's entry from
    ``examples["ner_labels"]``; positions whose word id is ``None`` and the
    remaining sub-tokens of a word receive ``-100``.

    Args:
        examples: Batch mapping with ``"tokens"`` (a list of words per
            sentence) and ``"ner_labels"`` (a list of label ids per sentence).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per sentence.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True
    )

    def align(sentence_idx):
        # Label only the position where a new word starts; everything else is -100.
        aligned = []
        previous_word = None
        for current_word in encoded.word_ids(batch_index=sentence_idx):
            starts_new_word = current_word is not None and current_word != previous_word
            if starts_new_word:
                aligned.append(examples["ner_labels"][sentence_idx][current_word])
            else:
                aligned.append(-100)
            previous_word = current_word
        return aligned

    encoded["labels"] = [align(idx) for idx in range(len(examples["tokens"]))]
    return encoded

In [None]:
# Checkpoint name passed to AutoTokenizer below
model_name = "bert-base-uncased"

# Tokenization: load the tokenizer for `model_name`; tokenize_and_align_labels
# reads it through the notebook-level name `tokenizer`
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Split dataset with a fixed seed so the same 80/20 partition is produced on
# every run (without a seed the train/test membership changes each time).
train_test = raw_dataset["train"].train_test_split(test_size=0.2, seed=42)

# Tokenize each split, then drop the original columns so only the tokenizer
# outputs and the aligned "labels" remain.
train_dataset = train_test["train"].map(tokenize_and_align_labels, batched=True)
train_dataset = train_dataset.remove_columns(raw_dataset["train"].column_names)
test_dataset = train_test["test"].map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.remove_columns(raw_dataset["train"].column_names)


print("Train samples:", len(train_dataset))
print("Test samples:", len(test_dataset))

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Train samples: 160
Test samples: 40


In [None]:
!unzip /content/bert-med-ner-model.zip -d /content/bert-med-ner-model

Archive:  /content/bert-med-ner-model.zip
replace /content/bert-med-ner-model/special_tokens_map.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Directory the model archive was extracted into; model and tokenizer are both loaded from it
NER_MODEL_DIR = "/content/bert-med-ner-model"

ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_DIR)
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_DIR)

In [None]:
!pip install gradio



In [None]:
import gradio as gr
import torch
import json
import os

# Function to merge subtokens into readable words
def merge_subtokens(subtokens):
    """Join word-piece tokens into one readable string.

    A token starting with ``"##"`` is glued onto the text built so far with
    the marker removed. Any other token is appended after a single space,
    unless nothing has been built yet, in which case it becomes the text.

    Args:
        subtokens: Iterable of token strings.

    Returns:
        The merged string ("" for an empty input).
    """
    text = ""
    for piece in subtokens:
        if piece.startswith("##"):
            text = text + piece[2:]
            continue
        text = f"{text} {piece}" if text else piece
    return text

# Convert display string to dictionary
def parse_display_str_to_dict(display_str):
    """Parse ``"KEY: a, b"`` lines into ``{"KEY": ["a", "b"]}``.

    Each line is split at its first ``": "``; the part after it is split on
    commas and every piece is stripped. Lines without ``": "`` are ignored.

    Args:
        display_str: Newline-separated text, one ``KEY: values`` entry per line.

    Returns:
        Dict mapping each key to its list of values, in line order.
    """
    parsed = {}
    for row in display_str.strip().split("\n"):
        label, separator, remainder = row.partition(": ")
        if not separator:
            continue
        parsed[label] = [chunk.strip() for chunk in remainder.split(",")]
    return parsed

# Save dictionary to a JSON file
def save_dict_to_json(output_dict, filename="ner_output_pretty.json"):
    """Write ``output_dict`` as JSON with one key per line and inline value lists.

    The layout is ``{``, then one two-space-indented ``"key": value`` entry
    per line separated by commas, then ``}``. An empty dict is written as an
    empty JSON object, so the file can always be read back with ``json.load``
    (previously nothing was written and the file was left empty).

    Args:
        output_dict: Mapping to serialise; keys and values must be JSON-serialisable.
        filename: Destination path; an existing file is overwritten.
    """
    # Serialise each pair on its own and drop the surrounding braces of the one-item object.
    entries = [
        json.dumps({key: value}, separators=(", ", ": "))[1:-1]
        for key, value in output_dict.items()
    ]
    with open(filename, "w") as f:
        if not entries:
            f.write("{\n}")
            return
        f.write("{\n" + ",\n".join(f"  {entry}" for entry in entries) + "\n}")

# Main inference + backend JSON saving
def ner_inference_with_json(text):
    """Run the NER model on ``text``, save the entities as JSON and return a summary.

    Relies on the notebook-level ``ner_tokenizer``, ``ner_model`` and
    ``id2label``. Consecutive tokens predicted with the same entity type are
    grouped into one entity. The grouped result is written to
    ``ner_outputs/ner_output_pretty.json`` with ``save_dict_to_json``.

    Args:
        text: Raw input text.

    Returns:
        One ``"<TYPE>: <entity>, <entity>"`` line per entity type, joined by
        newlines ("" when no entity was found).
    """
    # truncation=True keeps long notes within the tokenizer's configured maximum
    # length; presumably the model cannot accept longer sequences.
    inputs = ner_tokenizer(text, return_tensors="pt", truncation=True)
    inputs = {k: v.to(ner_model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = ner_model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().numpy()
    tokens = ner_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    structured_output = {}
    current_entity_tokens = []
    current_entity_type = None

    for token, pred in zip(tokens, predictions):
        if token in ("[CLS]", "[SEP]"):
            continue

        if token.startswith("##"):
            # A continuation piece may only extend an entity that is already open.
            # When none is open the piece belongs to an "O" word and is skipped;
            # opening an entity here would store a fragment under the key None.
            if current_entity_tokens:
                current_entity_tokens.append(token)
            continue

        label = id2label.get(int(pred), "O")
        new_entity_type = label.split("-")[-1] if label != "O" else None

        # Same type as the open entity: keep extending it.
        if current_entity_tokens and new_entity_type == current_entity_type:
            current_entity_tokens.append(token)
            continue

        # Type changed (or "O"): close the open entity, then maybe start a new one.
        if current_entity_tokens:
            entity_text = merge_subtokens(current_entity_tokens)
            structured_output.setdefault(current_entity_type, []).append(entity_text)
        current_entity_tokens = [token] if new_entity_type is not None else []
        current_entity_type = new_entity_type

    if current_entity_tokens and current_entity_type:
        entity_text = merge_subtokens(current_entity_tokens)
        structured_output.setdefault(current_entity_type, []).append(entity_text)

    # Format for display
    display_str = "".join(f"{k}: {', '.join(v)}\n" for k, v in structured_output.items())

    # --- BACKEND LOGIC: save to JSON
    # The structured dict is saved directly. Re-parsing the display string would
    # split any entity that itself contains a comma into several entries.
    os.makedirs("ner_outputs", exist_ok=True)
    save_dict_to_json(structured_output, "ner_outputs/ner_output_pretty.json")

    return display_str.strip()

In [None]:
# Gradio UI
# Multi-line text box that feeds the NER function
text_input = gr.Textbox(lines=5, placeholder="Enter clinical/medical text here...")

demo = gr.Interface(
    fn=ner_inference_with_json,
    inputs=text_input,
    outputs="text",
    title="Medical NER - BERT",
    description="Enter medical sentences to extract entities using the fine-tuned BERT model.",
)

# debug=True keeps the cell running so errors and logs stay visible
demo.launch(debug=True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2d567730a60f9c384b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2d567730a60f9c384b.gradio.live




In [None]:
# Show the JSON file written by the NER step
with open("/content/ner_outputs/ner_output_pretty.json", "r") as f:
    saved_ner_json = f.read()
print(saved_ner_json)

{
  "AGE": ["60 - year"],
  "SEX": ["old male"],
  "DISEASE_DISORDER": ["hx", "ckd", "sob", "lvh"],
  "DIAGNOSTIC_PROCEDURE": ["cad", "ecg", "on exam", "there", "jvd", "s3", "labs", "bnp", "ef", "he", "tte", "mibi", "assess perfusion", "function"],
  "CLINICAL_EVENT": ["presented", "given", "started", "scheduled"],
  "SIGN_SYMPTOM": ["orthopnea", "rales"],
  "LAB_VALUE": ["+", "elevated", "low"],
  "DETAILED_DESCRIPTION": ["bibasilar"],
  "MEDICATION": ["furosemide iv", "acei"]
}


In [None]:
!pip install pinecone-client
!pip install pinecone sentence-transformers datasets google-generativeai pandas tqdm



In [None]:
from pinecone import Pinecone, ServerlessSpec
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from datasets import load_dataset
from google.colab import drive


def _read_key_file(path):
    """Return the contents of the text file at ``path`` with surrounding whitespace removed."""
    with open(path, "r") as key_file:
        return key_file.read().strip()


drive.mount('/content/drive')

# API keys are kept as text files on the mounted Drive
pinecone_key_path = "/content/drive/My Drive/keys/pinecone_key.txt"
gemini_key_path = "/content/drive/My Drive/keys/gemini_key.txt"

PINECONE_API_KEY = _read_key_file(pinecone_key_path)
GEMINI_API_KEY = _read_key_file(gemini_key_path)

# Handle to the Pinecone index queried by the RAG enhancer
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "umls-ner-rag"
index = pc.Index(index_name)

genai.configure(api_key=GEMINI_API_KEY)

# Sentence encoder used to embed entities before querying the index
model = SentenceTransformer("all-MiniLM-L6-v2")



# Gemini 1.5 Flash and Gemini 1.0

In [None]:
from google.colab import drive
import google.generativeai as genai

drive.mount('/content/drive')

# Gemini API key stored as a text file on the mounted Drive
key_path = "/content/drive/My Drive/GenAI_keys/gemini_api.txt"

with open(key_path, "r") as f:
    GEMINI_API_KEY = f.read().strip()

genai.configure(api_key=GEMINI_API_KEY)

# The enhancer functions call `gemini_1_5_flash.generate_content(...)`, so the
# model handle must be bound to that name; without it they raise NameError on a
# fresh kernel. `gemini` is kept as an alias for any code using the old name.
gemini_1_5_flash = genai.GenerativeModel("gemini-1.5-flash")
gemini = gemini_1_5_flash



# Flan T5

In [None]:
!pip install gradio torch transformers sentence-transformers pinecone-client google-generativeai anthropic



In [None]:
import json

# Read the structured NER output saved earlier and decode it into a dict
ner_json_path = "/content/ner_outputs/ner_output_pretty.json"
with open(ner_json_path, "r") as f:
    ner_input_dict = json.loads(f.read())

In [None]:
# Print the raw JSON text that was just loaded
with open("/content/ner_outputs/ner_output_pretty.json", "r") as f:
    ner_json_text = f.read()
print(ner_json_text)

{
  "AGE": ["60 - year"],
  "SEX": ["old male"],
  "DISEASE_DISORDER": ["hx", "ckd", "sob", "lvh"],
  "DIAGNOSTIC_PROCEDURE": ["cad", "ecg", "on exam", "there", "jvd", "s3", "labs", "bnp", "ef", "he", "tte", "mibi", "assess perfusion", "function"],
  "CLINICAL_EVENT": ["presented", "given", "started", "scheduled"],
  "SIGN_SYMPTOM": ["orthopnea", "rales"],
  "LAB_VALUE": ["+", "elevated", "low"],
  "DETAILED_DESCRIPTION": ["bibasilar"],
  "MEDICATION": ["furosemide iv", "acei"]
}


In [None]:
import re
def enhance_ner_output_without_RAG(ner_input_dict):
    """Ask Gemini to clean up NER entities using only the entity list.

    Every ``(label, entity)`` pair is rendered as ``"label: entity"``; the
    pairs are joined with commas and embedded in a fixed instruction prompt
    that contains no retrieved context. The prompt is sent through the
    notebook-level ``gemini_1_5_flash`` model handle.

    Args:
        ner_input_dict: Mapping of entity label to a list of entity strings.

    Returns:
        The ``text`` of the model response, unparsed.
    """
    # Flatten {label: [entities]} into "label: entity" fragments, keeping input order
    labelled_entities = [
        f"{label}: {entity}"
        for label, entities in ner_input_dict.items()
        for entity in entities
    ]
    entity_string = ", ".join(labelled_entities)

    prompt = f"""
You are a clinical AI assistant.

You are given noisy NER output from a medical note with categories and entities.
Some entities may be abbreviated, misspelled, or incomplete.

No context is provided to you. Do not hallucinate or use general knowledge.

Entities:
{entity_string}

Instructions:
1. For each entity:
  - Enhance the entity using the provided context (if it's available) (expand abbreviations, correct misspellings, normalize names based on context).
  - Then, if your own medical knowledge allows you to further improve or clarify the entity beyond what the context provided, make that additional improvement.
  - extract the abbreviations and its fullform. Replace the abbrevation with fullform, For example ASA is Aspirin
  - If no context is available for that entity, rely on your own medical knowledge to enhance the entity.
  - Expand abbreviations where possible.
  - Correct obvious misspellings.
  - Normalize drug, symptom, or disease names as best as possible.
  - Most important, Do not hallucinate or use general knowledge.
2. Keep the original entity categories.
3. Do not invent new entities that were not in the input.
4. Return your output as valid JSON.

Return JSON like:
{{
  "Symptoms": ["Shortness of breath", "Fever"],
  "Diagnosis": ["Aspirin"]
}}
"""

    return gemini_1_5_flash.generate_content(prompt).text


In [None]:
# Call the Gemini-only enhancer (its prompt is built without any retrieved context)
enhanced_result_without_RAG = enhance_ner_output_without_RAG(ner_input_dict)

In [None]:
import json
import re

# Step 1: Isolate the JSON object in the model reply. Taking everything from
# the first "{" to the last "}" handles replies wrapped in ```json fences, in
# bare ``` fences, or surrounded by extra prose.
raw_reply = enhanced_result_without_RAG.strip()
json_span = re.search(r"\{.*\}", raw_reply, flags=re.DOTALL)
if json_span is None:
    raise ValueError("Model reply does not contain a JSON object")
cleaned_result = json_span.group(0)

# Step 2: Load as dict
parsed = json.loads(cleaned_result)

# Step 3: Save as prettified format (one label per line, list of values inline).
# The braces are always written, so an empty dict still produces valid JSON.
entries = [json.dumps({k: v}, separators=(", ", ": "))[1:-1] for k, v in parsed.items()]
with open("prettified_enhanced_ner_without_RAG.json", "w") as f:
    f.write("{\n" + ",\n".join(f"  {entry}" for entry in entries) + "\n}")


In [None]:
# Print the saved no-RAG result
with open("prettified_enhanced_ner_without_RAG.json", "r") as f:
    without_rag_text = f.read()
print(without_rag_text)

{
  "AGE": ["60 years old"],
  "SEX": ["Male"],
  "DISEASE_DISORDER": ["History (hx)", "Chronic kidney disease (CKD)", "Shortness of breath (SOB)", "Left ventricular hypertrophy (LVH)"],
  "DIAGNOSTIC_PROCEDURE": ["Coronary artery disease (CAD)", "Electrocardiogram (ECG)", "Physical examination (on exam)", "Jugular venous distention (JVD)", "S3 heart sound (S3)", "Laboratory tests (labs)", "B-type natriuretic peptide (BNP)", "Ejection fraction (EF)", "Echocardiogram (TTE)", "MIBI scan (Mibi)", "Assess perfusion", "Assess function"],
  "CLINICAL_EVENT": ["Presented", "Given", "Started", "Scheduled"],
  "SIGN_SYMPTOM": ["Orthopnea", "Rales"],
  "LAB_VALUE": ["Positive (+)", "Elevated", "Low"],
  "DETAILED_DESCRIPTION": ["Bibasilar"],
  "MEDICATION": ["Furosemide IV", "ACE inhibitor (ACEI)"]
}


# With RAG Context

In [None]:
import re
def enhance_ner_output_RAG(ner_input_dict, top_k=100):
    """Ask Gemini to clean up NER entities, with context retrieved from Pinecone.

    Each entity is embedded with the notebook-level ``model`` and used to
    query the notebook-level ``index`` (namespace ``"umls"``). Matches whose
    metadata text shares at least one word with the entity are kept, ordered
    by the number of shared words, and added to the prompt as context. The
    prompt is sent through the notebook-level ``gemini_1_5_flash`` handle.

    Args:
        ner_input_dict: Mapping of entity label to a list of entity strings.
        top_k: Number of matches requested from the index per entity.

    Returns:
        The ``text`` of the model response, unparsed.
    """
    word_pattern = r'\w+'
    all_context_blocks = []
    entity_list = []

    for label, entities in ner_input_dict.items():
        for entity in entities:
            entity_list.append((label, entity))

            # Embed + query Pinecone
            emb = model.encode([entity])[0].tolist()
            response = index.query(
                vector=emb,
                top_k=top_k,
                namespace="umls",
                include_metadata=True
            )
            matches = response["matches"]

            # Keyword filter: score each match once by word overlap with the entity
            query_tokens = set(re.findall(word_pattern, entity.lower()))
            scored_texts = []
            for match in matches:
                txt = match["metadata"]["text"]
                overlap = len(set(re.findall(word_pattern, txt.lower())) & query_tokens)
                scored_texts.append((overlap, txt))

            # Stable sort: equal scores keep the order returned by the index
            scored_texts.sort(key=lambda pair: pair[0], reverse=True)
            context = "\n".join(txt for overlap, txt in scored_texts if overlap > 0)

            if context:
                all_context_blocks.append(f"Context for '{entity}' in category '{label}':\n{context}\n")

    # Gemini Prompt
    full_context = "\n\n".join(all_context_blocks)
    entity_string = ", ".join(f"{label}: {ent}" for label, ent in entity_list)

    prompt = f"""
You are a clinical AI assistant.

You are given noisy NER output from a medical note with categories and entities.
Some entities may be abbreviated, misspelled, or incomplete.

You are also provided with additional retrieved knowledge context, which may help clarify abbreviations, correct misspellings, and normalize entity names.

Entities:
{entity_string}

Knowledge Context:
{full_context}

Instructions:
1. For each entity:
  - Enhance the entity using the provided context (if it's available) (expand abbreviations, correct misspellings, normalize names based on context).
  - Then, if your own medical knowledge allows you to further improve or clarify the entity beyond what the context provided, make that additional improvement.
  - extract the abbreviations and its fullform. Replace the abbrevation with fullform, For example ASA is Aspirin
  - If no context is available for that entity, rely on your own medical knowledge to enhance the entity.
  - Expand abbreviations where possible.
  - Correct obvious misspellings.
  - Normalize drug, symptom, or disease names as best as possible.
2. Keep the original entity categories.
3. Do not invent new entities that were not in the input.
4. Return your output as valid JSON.

Return JSON like:
{{
  "Symptoms": ["Shortness of breath", "Fever"],
  "Diagnosis": ["Aspirin"]
}}
"""

    return gemini_1_5_flash.generate_content(prompt).text

In [None]:
# Call the RAG + Gemini enhancer (top_k is left at its default of 100 matches per entity)
enhanced_result_RAG = enhance_ner_output_RAG(ner_input_dict)

In [None]:
import json
import re

# Step 1: Isolate the JSON object in the model reply. Taking everything from
# the first "{" to the last "}" handles replies wrapped in ```json fences, in
# bare ``` fences, or surrounded by extra prose.
raw_reply = enhanced_result_RAG.strip()
json_span = re.search(r"\{.*\}", raw_reply, flags=re.DOTALL)
if json_span is None:
    raise ValueError("Model reply does not contain a JSON object")
cleaned_result = json_span.group(0)

# Step 2: Load as dict
parsed = json.loads(cleaned_result)

# Step 3: Save as prettified format (one label per line, list of values inline).
# The braces are always written, so an empty dict still produces valid JSON.
entries = [json.dumps({k: v}, separators=(", ", ": "))[1:-1] for k, v in parsed.items()]
with open("prettified_enhanced_ner_RAG.json", "w") as f:
    f.write("{\n" + ",\n".join(f"  {entry}" for entry in entries) + "\n}")


In [None]:
# Print the saved RAG-enhanced result
with open("prettified_enhanced_ner_RAG.json", "r") as f:
    rag_text = f.read()
print(rag_text)

{
  "AGE": ["60 years old"],
  "SEX": ["Male"],
  "DISEASE_DISORDER": ["History of", "Chronic kidney disease", "Shortness of breath", "Left ventricular hypertrophy"],
  "DIAGNOSTIC_PROCEDURE": ["Coronary artery disease testing", "Electrocardiogram", "Physical examination", "Jugular venous distention", "S3 heart sound", "Laboratory tests", "B-type natriuretic peptide (BNP) level", "Ejection fraction", "Transthoracic echocardiogram", "Myocardial perfusion imaging with sestamibi (MIBI scan)", "Assessment of myocardial perfusion", "Assessment of left ventricular function"],
  "CLINICAL_EVENT": ["Presented", "Given", "Started", "Scheduled"],
  "SIGN_SYMPTOM": ["Orthopnea", "Rales (crackles)"],
  "LAB_VALUE": ["Positive", "Elevated", "Low"],
  "DETAILED_DESCRIPTION": ["Bibasilar"],
  "MEDICATION": ["Intravenous furosemide", "ACE inhibitor"]
}


In [2]:
# Re-point `origin` at the GitHub repository. <YOUR_GITHUB_TOKEN> is a placeholder.
# NOTE(review): a real token written into this URL is saved with the notebook
# and in the repository's git config -- confirm it is never committed.
!git remote remove origin
!git remote add origin https://<YOUR_GITHUB_TOKEN>@github.com/Saqib-Chy/Enhancing-Clinical-Named-Entity-Recognition.git
# Rename the current branch to `main`, then force-push it. --force replaces
# whatever history the remote `main` branch currently has.
!git branch -M main
!git push origin main --force



[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/.git/
[master (root-commit) 526d86b] Clean repo, removed secrets
 21 files changed, 51025 insertions(+)
 create mode 100644 .config/.last_opt_in_prompt.yaml
 create mode 100644 .config/.last_survey_prompt.yaml
 create mode 100644 .config/.last_update_check.json
 create mode 100644 .config/active_config
 create mode 100644 .config/config_sentinel
 create mode 100644 .config/configurations/config_default
 create mode 100644 .config/default_configs.db
 create mode 1006