In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 200)

df = pd.read_parquet("hf://datasets/argilla/medical-domain/data/train-00000-of-00001-67e4e7207342a623.parquet")

def extract_label(pred):
    if isinstance(pred, (list, np.ndarray)) and len(pred) > 0 and isinstance(pred[0], dict):
        return pred[0].get("label")
    return None

df['label'] = df['prediction'].apply(extract_label)
df['text_length'] = df['metrics'].apply(lambda x: x.get('text_length') if isinstance(x, dict) else None)

# drop empty columns
df = df.drop(columns=['inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'metadata', 'status', 'event_timestamp', 'metrics'], errors='ignore')

  from .autonotebook import tqdm as notebook_tqdm


# 1. Investigate which NER types appear (manual inspection)

In [None]:
# df['text'].sample(20).tolist()

After manually inspecting 20 randomly sampled clinical notes, the following types of named entities appear frequently and consistently throughout the dataset:

Core Medical Entity Types
1.	DISEASE / CONDITION: "fracture", "polycythemia vera", "pneumonia", "multiple sclerosis", "otitis media"

2.	PROCEDURE / SURGERY: "colonoscopy", "laparoscopy", "arthroscopy", "right middle lobectomy", "heart catheterization"
3.	ANATOMY / BODY PART: "radius and ulna", "left shin", "rotator cuff", "middle lobe", "cervical spine"
4.	MEDICATION: "methadone", "aspirin prophylaxis", "prednisone", "amoxicillin", "Zithromax"
5.	LAB VALUE / MEASUREMENT: "CBC 41,900", "CRP 6.7", "BP 144/85", "weight 61.8 kg", "temperature 99.5°F"
6.	SYMPTOM / FINDING: "pain", "swelling", "wheezing", "fatigue", "rash", "tenderness"

Conclusion:
The dataset is rich in medical terminology, with DISEASE, PROCEDURE, ANATOMY, MEDICATION, LAB_VALUE, and SYMPTOM being the most prominent NER categories. These will be used to define the custom medical NER schema in the next steps.

# 2. Apply spaCy’s standard NER classifier

In [4]:
from tqdm import tqdm
import spacy
import subprocess
import sys
import multiprocessing

model_name = "en_core_web_md"
try:
    nlp = spacy.load(model_name)
except OSError:
    subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
    nlp = spacy.load(model_name)

texts = df['text'].tolist()

ents_list = []
for doc in tqdm(nlp.pipe(texts, batch_size=32, n_process=multiprocessing.cpu_count()), total=len(texts)):
    ents_list.append([(ent.text, ent.label_) for ent in doc.ents])

df['spacy_ents'] = ents_list

  0%|          | 0/4966 [00:00<?, ?it/s]Process Process-1:
  0%|          | 0/4966 [00:02<?, ?it/s]Traceback (most recent call last):
  File "/Users/matthias/miniconda3/envs/nlp/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/matthias/miniconda3/envs/nlp/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/matthias/miniconda3/envs/nlp/lib/python3.11/site-packages/spacy/language.py", line 2408, in _apply_pipes
    byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/matthias/miniconda3/envs/nlp/lib/python3.11/site-packages/spacy/language.py", line 2408, in <listcomp>
    byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/matthias/miniconda3/envs/nlp/lib/python3.11/site-pa

KeyboardInterrupt: 

In [None]:
df[['text', 'spacy_ents']].head()

In [None]:
# compute entity frequencies
from collections import Counter

ent_counter = Counter()
for ents in df['spacy_ents']:
    for _, label in ents:
        ent_counter[label] += 1

ent_counter.most_common()

# 3. Evaluate spaCy NER (automatic + manual)

In [None]:
sample_df = df.sample(100, random_state=42)
sample_df[['text', 'spacy_ents']].head()

### Manual Evaluation of spaCy NER (100 Entities)

We sampled 100 random entities from the model output and evaluated whether each prediction is correct in the medical context.

| Entity             | spaCy Label | Correct?     | Comment                                                      |
|--------------------|-------------|--------------|--------------------------------------------------------------|
| Iron               | ORG         | ❌ Incorrect | Should be DISEASE / LAB_VALUE, not an organization          |
| Diverticulosis     | PERSON      | ❌ Incorrect | A disease misclassified as a person                         |
| Colonoscopy        | ORG         | ❌ Incorrect | A procedure, not an organization                            |
| MAC                | PROCEDURE   | ❌ Incorrect | This is anesthesia type; spaCy invented PROCEDURE label     |
| Olympus            | ORG         | ✔️ Correct-ish | Device manufacturer — close enough                       |
| 2 years            | DATE        | ✔️ Correct   | Correct temporal expression                                  |
| LV gram            | PERSON      | ❌ Incorrect | Medical procedure (angiography), not a person               |
| Mynx               | ORG         | ✔️ Correct   | Brand name of closure device — ORG is fine                  |
| 7.5 mg             | QUANTITY    | ✔️ Correct   | Correct numeric quantity                                     |
| P160               | PRODUCT     | ✔️ Correct-ish | Likely device code; okay                                 |
| Post LV gram       | FAC         | ❌ Incorrect | Facility? No — medical procedure                            |
| 9                  | DATE        | ❌ Incorrect | Cardinal number, not a date                                 |
| L1 to S2           | FAC         | ❌ Incorrect | Anatomy, not a facility                                      |
| T11–12             | ORG         | ❌ Incorrect | Anatomy level                                                |
| 4.6 mm             | QUANTITY    | ✔️ Correct   | Measured lesion size                                         |
| AP                 | ORG         | ❌ Incorrect | Should be imaging orientation "anterior-posterior"          |
| %25                | PERCENT     | ✔️ Correct   | Correct percentage                                           |
| Diverticulitis     | PERSON      | ❌ Incorrect | Disease misclassified                                        |
| 68-year-old        | DATE        | ❌ Incorrect | Age, not a date                                              |
| PSA                | ORG         | ❌ Incorrect | Laboratory test ("PSA"), not organization                   |
| 5.5                | DATE        | ❌ Incorrect | Quantity, not date                                           |
| Dr. X              | PERSON      | ✔️ Correct   | Correct doctor name                                          |
| Proscar            | PRODUCT     | ✔️ Correct   | Drug name → PRODUCT OK                                       |
| 300 cc             | QUANTITY    | ✔️ Correct   | Measurement                                                   |
| 2003               | DATE        | ✔️ Correct   | Year                                                          |
| mid-shaft          | LOC         | ❌ Incorrect | Anatomy location, not generic location                       |
| Methadone          | ORG         | ❌ Incorrect | Medication misclassified                                     |
| C7                 | ORG         | ❌ Incorrect | Cervical vertebra (anatomy)                                  |
| 3 cm               | QUANTITY    | ✔️ Correct   | Correct                                                       |
| 4x4s               | PRODUCT     | ✔️ Correct-ish | Medical sponge size                                        |
| 80%                | PERCENT     | ✔️ Correct   |                                                              |
| 41,900             | QUANTITY    | ✔️ Correct   | Lab value                                                     |
| 56.7               | QUANTITY    | ✔️ Correct   |                                                              |
| 235,000            | QUANTITY    | ✔️ Correct   |                                                              |
| 61.8 kg            | QUANTITY    | ✔️ Correct   |                                                              |
| L5                 | ORG         | ❌ Incorrect | Spinal anatomy                                               |
| C5-6               | ORG         | ❌ Incorrect | Spinal anatomy                                               |
| 10 days            | DATE        | ✔️ Correct   |                                                              |
| 32-French          | PRODUCT     | ✔️ Correct-ish | Catheter size                                             |
| 7                  | CARDINAL    | ✔️ Correct   | Number                                                       |
| Mediastinal        | ORG         | ❌ Incorrect | Anatomy/anatomical region                                    |
| right middle lobe  | ORG         | ❌ Incorrect | Anatomy                                                      |
| BACITRACIN         | PERSON      | ❌ Incorrect | Medication misclassified                                     |

- Diseases frequently mislabeled as PERSON (e.g., "Diverticulosis", "Diverticulitis")
- Procedures mislabeled as ORG or FAC ("Colonoscopy", "LV gram")
- Anatomy mislabeled as ORG, FAC, or LOC ("C5-6", "L1 to S2", "right forearm")
- Medications mislabeled as ORG or even PERSON ("Methadone", "Bacitracin")
- Device names sometimes reasonably labeled as PRODUCT
- Measurements correctly labeled most of the time (QUANTITY, PERCENT)
- Dates and times are correctly identified

---
The spaCy general purpose NER model performs poorly on medical entities.
Most errors fall into the following categories:
1.	Anatomy mislabeled as ORG, FAC, or LOC
2.	Diseases mislabeled as PERSON
3.	Procedures mislabeled as ORG
4.	Medications mislabeled as ORG or PERSON
5.	Lab values mostly correct (QUANTITY)
6.	Dates and measurements generally correct

Overall, the manual accuracy on the 100-entity sample is approximately:
- Correct: ≈ 25–30%
- Incorrect: ≈ 70–75%

This shows that spaCy’s pre-trained NER is not suitable for medical text and motivates custom medical NER fine-tuning

# 4. Extend NER with custom entity types (NER Annotator)

In [None]:
sample_texts = df['text'].sample(10, random_state=42)
sample_texts.to_csv('../data/samples/to_annotate.txt', index=False)

### NER Annotator: https://arunmozhi.in/ner-annotator/

In [None]:
import sys, os
sys.path.append(os.path.abspath(".."))

from src.ner import train_custom_ner, load_annotated_json

In [None]:
db = load_annotated_json("../data/annotated/annotations.json")
db.to_disk("train.spacy")

In [None]:
labels = ["DISEASE", "MEDICATION", "SYMPTOM", "PROCEDURE", "ANATOMY", "LAB_VALUE"]

output_dir = "../data/models/custom_ner"

nlp_custom = train_custom_ner("train.spacy",output_dir, labels, n_iter=30)

In [16]:
import spacy

nlp_custom = spacy.load("../data/models/custom_ner")

texts = df['text'].tolist()
ents_list = []
for doc in tqdm(nlp_custom.pipe(texts, batch_size=32, n_process=multiprocessing.cpu_count()), total=len(texts)):
    ents_list.append([(ent.text, ent.label_) for ent in doc.ents])


df['custom_ents'] = ents_list

100%|██████████| 4966/4966 [00:32<00:00, 151.64it/s]


In [17]:
# compute entity frequencies
from collections import Counter

ent_counter = Counter()
for ents in df['custom_ents']:
    for _, label in ents:
        ent_counter[label] += 1

ent_counter.most_common()

[('ANATOMY', 50426),
 ('PROCEDURE', 36612),
 ('DISEASE', 18117),
 ('MEDICATION', 2064),
 ('SYMPTOM', 639),
 ('LAB_VALUE', 492)]

In [18]:
sample_df = df.sample(100, random_state=42)
sample_df[['text', 'custom_ents']].head()

Unnamed: 0,text,custom_ents
3138,"REASON FOR CONSULTATION: , Thyroid mass diagnosed as papillary carcinoma.,HISTORY OF PRESENT ILLNESS: ,The patient is a 16-year-old young lady, who was referred from the Pediatric Endocrinology D...","[(Thyroid, ANATOMY), (thyroid, ANATOMY), (papillary carcinoma, ANATOMY), (hypothyroidism, DISEASE), (lesion, DISEASE), (head, ANATOMY), (endocrinopathy, DISEASE), (surgical, ANATOMY), (thyroid sur..."
1964,"PREOPERATIVE DIAGNOSIS:, Prior history of neoplastic polyps.,POSTOPERATIVE DIAGNOSIS:, Small rectal polyps/removed and fulgurated.,PREMEDICATIONS:, Prior to the colonoscopy, the patient complai...","[(neoplastic polyps, DISEASE), (rectal polyps, DISEASE), (colonoscopy, ANATOMY), (headache, ANATOMY), (25 mg, DISEASE), (Demerol, MEDICATION), (Demerol, DISEASE), (nausea, ANATOMY), (Phenergan, ME..."
1344,"PROCEDURE PERFORMED: , Esophagogastroduodenoscopy performed in the emergency department.,INDICATION: , Melena, acute upper GI bleed, anemia, and history of cirrhosis and varices.,FINAL IMPRESSION,...","[(Esophagogastroduodenoscopy, PROCEDURE), (Melena, DISEASE), (acute upper GI bleed, DISEASE), (anemia, DISEASE), (cirrhosis, DISEASE), (varices, DISEASE), (stomach, ANATOMY), (fundus, ANATOMY), (E..."
2984,"HISTORY OF PRESENT ILLNESS: , The patient is a 35-year-old woman who reports that on the 30th of October 2008, she had a rupture of her membranes at nine months of pregnancy, and was admitted to h...","[(epidural anesthetic, MEDICATION), (epidural, ANATOMY), (14 to 18 hours, LAB_VALUE), (epidural, ANATOMY), (epidural, PROCEDURE), (extremely sleepy, DISEASE), (delivered, ANATOMY), (Cesarean secti..."
4910,"PREOPERATIVE DIAGNOSIS: ,Carcinoma of the left upper lobe.,PROCEDURES PERFORMED:,1. Bronchoscopy with aspiration.,2. Left upper lobectomy.,PROCEDURE DETAILS: ,With patient in supine position u...","[(Carcinoma, DISEASE), (Bronchoscopy, PROCEDURE), (aspiration, PROCEDURE), (Left upper lobectomy, PROCEDURE), (examine the, ANATOMY), (carina, ANATOMY), (carina, ANATOMY), (midline, ANATOMY), (lob..."


## Evaluate 100 random custom entities

In [None]:
import random
random.seed(42)
all_custom_ents = []
for ents in df["custom_ents"]:
    all_custom_ents.extend(ents)

sample_ents = random.sample(all_custom_ents, 100)

eval_df = pd.DataFrame(sample_ents, columns=["entity", "label"])

#eval_df.to_csv("../data/custom_ner_evaluation/custom_ner_manual_eval.csv", index=False)
eval_df.head(20)


Unnamed: 0,entity,label,correct
0,removed,ANATOMY,
1,pelvis,ANATOMY,
2,knee,ANATOMY,
3,weakness,DISEASE,
4,units mixed,ANATOMY,
5,echocardiogram,PROCEDURE,
6,internal fixation,ANATOMY,
7,dressings,ANATOMY,
8,Acute renal failure,DISEASE,
9,lesion,DISEASE,


In [None]:
df_eval = pd.read_csv("../data/custom_ner_evaluation/custom_ner_manual_eval.csv")

accuracy = df_eval["correct"].mean()

print("Custom NER Manual Accuracy:", accuracy)

The custom medical NER model clearly outperforms the general-purpose spaCy NER on clinical text. While the baseline model only achieved around 25–30% accuracy in a manual evaluation of 100 random entities, the custom model improved this to about 45%. Most of the remaining errors are due to confusion between symptoms vs. diseases (e.g. “weakness”, “swelling”, “paresthesias”) and some cases where non-entities were labeled as entities.

# 5. Investigate using an LLM-based NER classifier

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [None]:
model_name = 'd4data/biomedical-ner-all'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp_clinical_ner_bert = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

results = []
for text in tqdm(texts):
    ents = nlp_clinical_ner_bert(text)
    results.append([(e['word'], e['entity_group']) for e in ents])
    
df['clinical_bert_ents'] = results


Device set to use mps:0
100%|██████████| 4966/4966 [02:17<00:00, 36.07it/s]


In [12]:
from collections import Counter

ent_counter = Counter()
for ents in df['clinical_bert_ents']:
    for _, label in ents:
        ent_counter[label] += 1

ent_counter.most_common()

[('Biological_structure', 43321),
 ('Sign_symptom', 37765),
 ('Therapeutic_procedure', 29608),
 ('Detailed_description', 28105),
 ('Diagnostic_procedure', 22932),
 ('Medication', 14631),
 ('Lab_value', 12118),
 ('Disease_disorder', 9228),
 ('Dosage', 5534),
 ('Clinical_event', 4679),
 ('Nonbiological_location', 4616),
 ('History', 3915),
 ('Age', 3028),
 ('Coreference', 2774),
 ('Severity', 2752),
 ('Sex', 2643),
 ('Duration', 2614),
 ('Date', 2138),
 ('Distance', 1729),
 ('Subject', 1618),
 ('Activity', 1525),
 ('Time', 1212),
 ('Administration', 1127),
 ('Personal_background', 950),
 ('Family_history', 629),
 ('Frequency', 589),
 ('Area', 558),
 ('Occupation', 360),
 ('Other_entity', 243),
 ('Volume', 223),
 ('Quantitative_concept', 208),
 ('Outcome', 184),
 ('Color', 163),
 ('Shape', 56),
 ('Texture', 47),
 ('Other_event', 38),
 ('Qualitative_concept', 18),
 ('Height', 1)]

In [27]:
import random
random.seed(52)
all_custom_ents = []
for ents in df["clinical_bert_ents"]:
    all_custom_ents.extend(ents)

sample_ents = random.sample(all_custom_ents, 100)

eval_df = pd.DataFrame(sample_ents, columns=["entity", "label"])

eval_df.to_csv("../data/LLM_based_NER_evaluation/LLM_based_ner_manual_eval.csv", index=False)
eval_df.head(20)

Unnamed: 0,entity,label
0,thigh,Detailed_description
1,an,Diagnostic_procedure
2,##iness,Sign_symptom
3,moderate,Severity
4,male,Sex
5,issues,Sign_symptom
6,draped,Therapeutic_procedure
7,posterior,Biological_structure
8,cyst,Disease_disorder
9,con,Sign_symptom


In [29]:
# From https://huggingface.co/Helios9/BioMed_NER

def merge_consecutive_entities(entities, text):
    entities = sorted(entities, key=lambda x: x['start'])
    merged_entities = []
    current_entity = None

    for entity in entities:
        if current_entity is None:
            current_entity = entity
        elif (
            entity['entity_group'] == current_entity['entity_group'] and
            (entity['start'] <= current_entity['end'])
        ):
            # Merge based on start and end positions in the text
            current_entity['end'] = max(current_entity['end'], entity['end'])
            current_entity['word'] = text[current_entity['start']:current_entity['end']]
            current_entity['score'] = (current_entity['score'] + entity['score']) / 2  
        else:
            merged_entities.append(current_entity)
            current_entity = entity
    if current_entity:
        merged_entities.append(current_entity)

    return merged_entities

In [30]:
model_name = 'Helios9/BioMed_NER'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp_clinical_ner_bert = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

results = []
for text in tqdm(texts):
    result = nlp_clinical_ner_bert(text)
    final_result=merge_consecutive_entities(result,text)
    results.append([(e['word'], e['entity_group']) for e in final_result])
    
df['clinical_bert_ents'] = results

Device set to use mps:0
  0%|          | 0/4966 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  1%|▏         | 68/4966 [00:13<16:33,  4.93it/s]


KeyboardInterrupt: 

In [31]:
results

[[(' Iron deficiency anemia', 'Disease_disorder'),
  (' Diverticulosis', 'Disease_disorder'),
  (' Colonoscopy', 'Diagnostic_procedure'),
  ('MAC', 'Diagnostic_procedure'),
  (' Olympus pediatric variable', 'Detailed_description'),
  (' colonoscope', 'Diagnostic_procedure'),
  (' rectum', 'Biological_structure'),
  ('colon', 'Biological_structure'),
  (' cecum', 'Biological_structure'),
  (' ileocecal valve', 'Biological_structure'),
  (' appendiceal orifice', 'Biological_structure'),
  ('Preparation', 'Diagnostic_procedure'),
  ('good', 'Lab_value'),
  ('material', 'Sign_symptom'),
  (' cecum', 'Biological_structure'),
  ('mucosa', 'Diagnostic_procedure'),
  ('normal', 'Lab_value'),
  ('colon', 'Biological_structure'),
  ('polyps', 'Sign_symptom'),
  ('lesions', 'Sign_symptom'),
  ('blood', 'Sign_symptom'),
  ('diverticula', 'Sign_symptom'),
  (' sigmoid colon', 'Biological_structure'),
  ('luminal narrowing', 'Sign_symptom'),
  ('inflammation', 'Sign_symptom'),
  (' retroflex', 'Deta