In [1]:
import pandas as pd
import numpy as np
# Show up to 200 characters per cell so the clinical notes are readable in rendered tables.
pd.set_option('display.max_colwidth', 200)

# Parquet shard of the argilla/medical-domain dataset, addressed through the hf:// filesystem.
DATASET_URL = (
    "hf://datasets/argilla/medical-domain/data/"
    "train-00000-of-00001-67e4e7207342a623.parquet"
)
df = pd.read_parquet(DATASET_URL)

def extract_label(pred):
    """Return the ``"label"`` value of the first prediction entry, if any.

    Args:
        pred: Value of one ``prediction`` cell. Only a non-empty ``list`` or
            ``np.ndarray`` whose first element is a ``dict`` is inspected.

    Returns:
        ``pred[0].get("label")`` for inputs of the shape described above
        (which is ``None`` when the key is absent); ``None`` for anything else.
    """
    # Anything that is not a list/array cannot carry predictions.
    if not isinstance(pred, (list, np.ndarray)):
        return None
    # Empty containers have no first prediction to read.
    if len(pred) == 0:
        return None
    first = pred[0]
    if not isinstance(first, dict):
        return None
    return first.get("label")

# Columns removed after flattening. 'prediction' and 'metrics' are listed too:
# their useful parts are copied into 'label' and 'text_length' first.
COLUMNS_TO_DROP = ['inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'metadata', 'status', 'event_timestamp', 'metrics']

df = (
    df.assign(
        # First predicted label of each record (None when there is no usable prediction).
        label=df['prediction'].apply(extract_label),
        # 'text_length' entry of the metrics dict (None when metrics is not a dict).
        text_length=df['metrics'].apply(lambda x: x.get('text_length') if isinstance(x, dict) else None),
    )
    # errors='ignore' skips any listed column that is not present in the frame.
    .drop(columns=COLUMNS_TO_DROP, errors='ignore')
)

# 1. Investigate which NER types appear (manual inspection)

In [2]:
# df['text'].sample(20).tolist()

After manually inspecting 20 randomly sampled clinical notes, the following types of named entities appear frequently and consistently throughout the dataset:

Core Medical Entity Types
1.	DISEASE / CONDITION: "fracture", "polycythemia vera", "pneumonia", "multiple sclerosis", "otitis media"

2.	PROCEDURE / SURGERY: "colonoscopy", "laparoscopy", "arthroscopy", "right middle lobectomy", "heart catheterization"
3.	ANATOMY / BODY PART: "radius and ulna", "left shin", "rotator cuff", "middle lobe", "cervical spine"
4.	MEDICATION: "methadone", "aspirin prophylaxis", "prednisone", "amoxicillin", "Zithromax"
5.	LAB VALUE / MEASUREMENT: "CBC 41,900", "CRP 6.7", "BP 144/85", "weight 61.8 kg", "temperature 99.5°F"
6.	SYMPTOM / FINDING: "pain", "swelling", "wheezing", "fatigue", "rash", "tenderness"

Conclusion:
The dataset is rich in medical terminology, with DISEASE, PROCEDURE, ANATOMY, MEDICATION, LAB_VALUE, and SYMPTOM being the most prominent NER categories. These will be used to define the custom medical NER schema in the next steps.

In [3]:
# Quantitative support to manual inspection
import re
from collections import Counter

keywords = [
    "fracture", "pneumonia", "colonoscopy", "laparoscopy", "methadone",
    "aspirin", "pain", "swelling", "biopsy", "ultrasound",
    "mri", "ct scan", "x-ray", "tumor", "cancer",
]

# For each keyword, count the notes whose text contains it (case-insensitive match).
keyword_doc_counts = Counter()
for kw in keywords:
    keyword_doc_counts[kw] = df['text'].str.contains(kw, case=False).sum()

keyword_doc_counts

Counter({'pain': np.int64(2059),
         'x-ray': np.int64(698),
         'mri': np.int64(529),
         'cancer': np.int64(464),
         'biopsy': np.int64(437),
         'swelling': np.int64(429),
         'fracture': np.int64(404),
         'ultrasound': np.int64(354),
         'ct scan': np.int64(326),
         'tumor': np.int64(325),
         'aspirin': np.int64(315),
         'pneumonia': np.int64(223),
         'colonoscopy': np.int64(156),
         'laparoscopy': np.int64(76),
         'methadone': np.int64(32)})

# 2. Apply spaCy’s standard NER classifier

In [4]:
from tqdm import tqdm
import spacy
import subprocess
import sys
import multiprocessing
import time

# Load spaCy's general-purpose English pipeline. If loading fails with OSError
# (e.g. the model package is not installed), download it once and load again.
model_name = "en_core_web_md"
try:
    nlp = spacy.load(model_name)
except OSError:
    download_cmd = [sys.executable, "-m", "spacy", "download", model_name]
    subprocess.check_call(download_cmd)
    nlp = spacy.load(model_name)

texts = df['text'].tolist()

# Stream every note through the pipeline with one worker process per CPU core,
# keeping only (entity text, entity label) pairs for each document.
start_time = time.time()
doc_stream = nlp.pipe(texts, batch_size=32, n_process=multiprocessing.cpu_count())
ents_list = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in tqdm(doc_stream, total=len(texts))
]
end_time = time.time()
print(f"NER processing took {end_time - start_time:.2f} seconds")

df['spacy_ents'] = ents_list

100%|██████████| 4966/4966 [01:47<00:00, 46.04it/s] 

NER processing took 107.90 seconds





In [5]:
# Preview the first notes side by side with the entities spaCy extracted from them.
df[['text', 'spacy_ents']].head()

Unnamed: 0,text,spacy_ents
0,"PREOPERATIVE DIAGNOSIS:, Iron deficiency anemia.,POSTOPERATIVE DIAGNOSIS:, Diverticulosis.,PROCEDURE:, Colonoscopy.,MEDICATIONS: , MAC.,PROCEDURE: , The Olympus pediatric variable colonoscope w...","[(Iron, ORG), (Diverticulosis, PERSON), (Colonoscopy, ORG), (MAC.,PROCEDURE, GPE), (Olympus, ORG), (retroflex, NORP), (Diverticulosis, PERSON), (2 years, DATE)]"
1,"CLINICAL INDICATION: ,Normal stress test.,PROCEDURES PERFORMED:,1. Left heart cath.,2. Selective coronary angiography.,3. LV gram.,4. Right femoral arteriogram.,5. Mynx closure device.,PROCE...","[(LV gram, PERSON), (Mynx, ORG), (2%, PERCENT), (6-French, QUANTITY), (6-French JL4, MONEY), (6-French 3DRC, QUANTITY), (6-French, QUANTITY), (Post LV gram, FAC), (Mynx, ORG), (LVEDP, ORG), (9, DA..."
2,"FINDINGS:,Axial scans were performed from L1 to S2 and reformatted images were obtained in the sagittal and coronal planes.,Preliminary scout film demonstrates anterior end plate spondylosis at T1...","[(L1 to S2, FAC), (T11-12, ORG), (T12-L1.,L1-2, ORG), (4.6mm, QUANTITY), (AP, ORG), (#25).,L4-5, MONEY)]"
3,"PREOPERATIVE DIAGNOSIS: , Blood loss anemia.,POSTOPERATIVE DIAGNOSES:,1. Diverticulosis coli.,2. Internal hemorrhoids.,3. Poor prep.,PROCEDURE PERFORMED:, Colonoscopy with photos.,ANESTHESIA: ...","[(DIAGNOSES:,1, ORG), (Diverticulosis, PERSON), (Conscious, ORG), (Anesthesia, PERSON), (85-year-old, DATE), (EGD, ORG), (the Endoscopy Suite, ORG), (the Anesthesia Department, ORG)]"
4,"REASON FOR VISIT: ,Elevated PSA with nocturia and occasional daytime frequency.,HISTORY: , A 68-year-old male with a history of frequency and some outlet obstructive issues along with irritative ...","[(nocturia, ORG), (68-year-old, DATE), (PSA, ORG), (PSA, ORG), (2004, DATE), (5.5, DATE), (2003, DATE), (Dr. X, PERSON), (1.6, CARDINAL), (Proscar, PERSON), (Proscar, PERSON), (greater than five y..."


**Note**:

spaCy's general English model is not specialized for clinical NER tasks. For example, "Iron" -> ORG, "Diverticulosis" -> PERSON, "Colonoscopy" -> ORG.

In [6]:
# compute entity frequencies
from collections import Counter

# Tally how often each spaCy label was predicted across all documents.
ent_counter = Counter(
    label
    for doc_ents in df['spacy_ents']
    for _, label in doc_ents
)

ent_counter.most_common()

[('ORG', 31211),
 ('CARDINAL', 28793),
 ('DATE', 19170),
 ('PERSON', 15137),
 ('QUANTITY', 10040),
 ('GPE', 4631),
 ('TIME', 4325),
 ('PRODUCT', 3739),
 ('ORDINAL', 3462),
 ('PERCENT', 3192),
 ('NORP', 2729),
 ('MONEY', 2283),
 ('LOC', 595),
 ('FAC', 540),
 ('LAW', 371),
 ('EVENT', 308),
 ('WORK_OF_ART', 194),
 ('LANGUAGE', 84)]

# 3. Evaluate spaCy NER (automatic + manual)

### 3.1 Manual Evaluation of spaCy NER (100 Entities)

In [7]:
# Fixed-seed sample of 100 documents; their spaCy entities feed the manual review.
sample_df = df.sample(100, random_state=42)
sample_df[['spacy_ents']].head()

Unnamed: 0,spacy_ents
3138,"[(16-year-old, DATE), (the Pediatric Endocrinology Department, ORG), (first, ORDINAL), (about 2004, DATE), (the Pediatric Endocrinology Department, ORG), (zero, CARDINAL), (Tijuana, GPE), (Mexico,..."
1964,"[(25 mg, QUANTITY), (Demerol, ORG), (the IV Demerol, ORG), (25 mg, QUANTITY), (Phenergan IV, GPE), (7.5 mg, QUANTITY), (Digital, ORG), (P160, PRODUCT), (30 cm, QUANTITY), (five, CARDINAL), (One, C..."
1344,"[(Esophagogastroduodenoscopy, ORG), (Melena, PERSON), (GI, ORG), (IMPRESSION,1, ORG), (Repeat EGD, PERSON), (tomorrow, DATE), (morning, TIME), (ICU, ORG), (100, CARDINAL), (EGD, ORG), (An addition..."
2984,"[(35-year-old, DATE), (the 30th of October 2008, DATE), (nine months, DATE), (approximately 14 to 18 hours, DATE), (the 31st of October, DATE), (Foley, PERSON), (the 1st of November 2008, DATE), (..."
4910,"[(Bronchoscopy, ORG), (Foley, PERSON), (Betadine, NORP), (Hemostasis, PERSON), (sixth, ORDINAL), (sixth, ORDINAL), (3 cm, QUANTITY), (#00, MONEY), (Potts, PERSON), (Direction, FAC), (000, MONEY), ..."


We sampled 100 random entities from the model output and evaluated whether each prediction is correct in the medical context.

| Entity | spaCy Label | Correct? | Expected Medical Category | Comment |
|--------|-------------|----------|----------------------------|---------|
| Esophagogastroduodenoscopy | ORG | ❌ | PROCEDURE | Misread as organization |
| Melena | PERSON | ❌ | SYMPTOM / FINDING | Symptom labeled as person |
| Demerol | ORG | ❌ | MEDICATION | Drug interpreted as an organization |
| Betadine | NORP | ❌ | MEDICATION / ANTISEPTIC | Not a nationality/group |
| CT Abdomen & Pelvis | ORG | ❌ | IMAGING PROCEDURE | Imaging test mislabeled |
| Foley | PERSON | ❌ | DEVICE / CATHETER | Mistaken as a person |
| L4–L5 | ORG | ❌ | ANATOMY | Vertebral level mislabeled |
| 3.4 cm | QUANTITY | ✔️ | MEASUREMENT | Correct |
| 12-mm trocar | QUANTITY | ✔️ | DEVICE SIZE | Acceptable |
| 05/26/1999 | DATE | ✔️ | DATE | Correct |
| Pediatric Endocrinology Dept | ORG | ✔️ | DEPARTMENT | Acceptable |
| Tijuana | GPE | ✔️ | LOCATION | Correct |
| Isovue-300 | LOC | ❌ | CONTRAST AGENT | Incorrect category |
| ICU | ORG | ❌ | LOCATION / UNIT | Hospital units ≠ organizations |
| Bronchoscopy | ORG | ❌ | PROCEDURE | Misread as organization |
| Hemostasis | PERSON | ❌ | PROCEDURE / ACTION | Labeled as a person |
| Potts | PERSON | ✔️ | SURGICAL INSTRUMENT | Label acceptable but misleading |
| 100 mL | QUANTITY | ✔️ | DOSAGE/VOLUME | Correct |
| 2.5 cm | QUANTITY | ✔️ | MEASUREMENT | Correct |
| hemiscrotum | ORG | ❌ | ANATOMY | Body part labeled as ORG |
| Electrocautery | PERSON | ❌ | PROCEDURE / DEVICE | Not a person |
| 1.72 | CARDINAL | ✔️ | NUMERIC VALUE | OK |
| Soft | PERSON | ❌ | EXAM FINDING | Not a person |
| Hydrochlorothiazide | ORG | ❌ | MEDICATION | Drug mislabeled |
| Lisinopril | ORG | ❌ | MEDICATION | Drug mislabeled |
| Percocet | ORG | ❌ | MEDICATION | Drug mislabeled |
| Nontender | PERSON | ❌ | EXAM FINDING | Not a person |
| Marcaine | PERSON | ❌ | MEDICATION | Not a person |
| Veress | PRODUCT | ✔️ | SURGICAL DEVICE | Acceptable |
| 12-mm VersaStep | QUANTITY | ✔️ | DEVICE SIZE | OK |
| Appendix | NORP | ❌ | ANATOMY | Not a nationality |
| Endocatch | ORG | ❌ | SURGICAL DEVICE | Incorrect |
| ABCD General Hospital | ORG | ✔️ | FACILITY | Correct |
| X. | PERSON | ✔️ | PERSON | Correct |
| 1% | PERCENT | ✔️ | MEASUREMENT | Correct |
| Glenn | PERSON | ✔️ | PERSON/PROCEDURE | Acceptable |
| Fontan | PERSON | ❌ | PROCEDURE | Misinterpreted as person |
| Benadryl | ORG | ❌ | MEDICATION | Drug mislabeled |
| 124 pounds | QUANTITY | ✔️ | WEIGHT | Correct |
| 96/54 | CARDINAL | ✔️ | VITAL SIGN | spaCy lacks medical category |
| Phacoemulsification | ORG | ❌ | PROCEDURE | Incorrect |
| Silicone | ORG | ❌ | MATERIAL | Incorrect |
| ABC Laboratories | ORG | ✔️ | COMPANY | Correct |
| EKG | ORG | ❌ | PROCEDURE | Misclassified |
| Jun. | PERSON | ❌ | DATE | Misread as a person |
| Laparoscopic Roux-en-Y | PERSON | ❌ | PROCEDURE | Wrong label |
| EEA | ORG | ❌ | SURGICAL DEVICE | Incorrect |
| Ziac | PERSON | ❌ | MEDICATION | Wrong label |
| Remeron | PERSON | ❌ | MEDICATION | Wrong label |
| Salt Lake City | GPE | ✔️ | LOCATION | Correct |
| Noncontrast CT | ORG | ❌ | IMAGING PROCEDURE | Incorrect |
| Levaquin | PERSON | ❌ | MEDICATION | Incorrect |
| Reglan | FAC | ❌ | MEDICATION | Incorrect |
| Streptococcal | ORG | ❌ | DISEASE / ORGANISM | Incorrect |
| thromboplastin | NORP | ❌ | LAB / PROTEIN | Not nationality |
| Foley catheter | PERSON | ❌ | DEVICE | Wrong label |
| Veress needle | PRODUCT | ✔️ | DEVICE | Correct |
| 16-French | QUANTITY | ✔️ | DEVICE SIZE | Correct |
| concha bullosa | PERSON | ❌ | ANATOMICAL FINDING | Wrong label |
| ENT | ORG | ✔️ | SPECIALTY | Acceptable |
| CT Abdomen & Pelvis W/WO | ORG | ❌ | IMAGING | Incorrect |
| Powerade | PERSON | ❌ | SUBSTANCE | Not a person |
| CPK | ORG | ❌ | LAB TEST | Incorrect |
| Kawasaki | PERSON | ❌ | DISEASE | Wrong |
| CRP | ORG | ❌ | LAB TEST | Wrong |
| ESR | ORG | ❌ | LAB TEST | Wrong |
| IVIG | ORG | ❌ | MEDICATION / IMMUNOTHERAPY | Wrong |
| Echocardiogram | ORG | ❌ | IMAGING PROCEDURE | Wrong |
| SMK | ORG | ✔️ | COMPANY / BRAND | Acceptable |
| Depo-Medrol | PRODUCT | ✔️ | MEDICATION | Acceptable |
| Coagulation | ORG | ❌ | LAB / PHYSIOLOGY | Wrong |
| ERCP | ORG | ❌ | PROCEDURE | Wrong |
| Medical Oncology | ORG | ✔️ | DEPARTMENT | Acceptable |
| T7–T8 | PRODUCT | ❌ | ANATOMY | Wrong category |
| Medtronic | ORG | ✔️ | DEVICE COMPANY | Correct |
| Fluoroscopy | PERSON | ❌ | IMAGING PROCEDURE | Wrong |
| T2–L2 | NORP | ❌ | ANATOMY | Not a nationality |
| Jackson | PERSON | ✔️ | PERSON | Correct |
| Thecal sac | ORG | ❌ | ANATOMY | Wrong |
| 5.5 | CARDINAL | ✔️ | VALUE | Correct |
| 20 mg | QUANTITY | ✔️ | DOSAGE | Correct |
| 30 cm | QUANTITY | ✔️ | MEASUREMENT | Correct |
| One | CARDINAL | ✔️ | NUMBER | Correct |
| 000 | MONEY | ❌ | NONE | Misinterpreted as money |
| Direction | FAC | ❌ | NONE | Wrong label |
| Soft tissue | PERSON | ❌ | ANATOMY | Misinterpreted |
| Renal | ORG | ❌ | ANATOMY / ADJECTIVE | Incorrect |
| 47-year-old | DATE | ❌ | AGE | Age ≠ Date |
| daily | DATE | ❌ | FREQUENCY | Not a date |
| five days | DATE | ✔️ | TEMPORAL | Acceptable |
| Appendectomy | ORG | ❌ | PROCEDURE | Wrong |
| Benign tumor | ORG | ❌ | DIAGNOSIS | Wrong |
| L5-S1 | ORG | ❌ | ANATOMY | Wrong |
| Marcaine injection | PERSON | ❌ | PROCEDURE | Wrong |
| Hemostasis achieved | PERSON | ❌ | PROCEDURE STEP | Wrong |
| 10 cm incision | QUANTITY | ✔️ | MEASUREMENT | Correct |
| 200 mg | QUANTITY | ✔️ | DOSAGE | Correct |
| 50-year-old | DATE | ❌ | AGE | Wrong label |
| Many years | DATE | ✔️ | TEMPORAL | Correct |
| 14 months | DATE | ✔️ | AGE | Acceptable |

**What spaCy Does Well**
- Dates and temporal expressions (e.g., “05/26/1999”, “five days”)
- Quantities and measurements (cm, mg, %, weights)
- Person names (e.g., "Jackson")
- Some organizations and locations

**What spaCy Does Poorly**
- Medications often -> ORG or PERSON  
- Procedures often -> ORG  
- Anatomy -> ORG, NORP, or PERSON  
- Devices -> PERSON  
- Lab tests -> ORG  
- Diseases -> PERSON  
- No medical categories (MEDICATION, PROCEDURE, ANATOMY, LAB, etc.)

**Overall Quality (100 entities)**
- Correct: ~30%
- Incorrect: ~70%

spaCy’s general English model performs poorly on clinical notes.
It lacks domain knowledge and mislabels most medical entities.

### 3.2 Automatic Evaluation

Since we do not have gold standard NER annotations, we cannot do automatic evaluation like precision/recall/F1. Custom annotations will be created in the next steps for proper evaluation.

# 4. Extend NER with custom entity types (NER Annotator)

To overcome the limitations of spaCy’s general-purpose NER model on clinical text, we created a custom medical NER system using six domain-specific labels:

DISEASE, MEDICATION, SYMPTOM, PROCEDURE, ANATOMY, LAB_VALUE

### 4.1 Manual Annotation Using NER Annotator

We manually annotated clinical reports using the online NER Annotator tool. https://arunmozhi.in/ner-annotator/

The resulting annotations.json file contains:
- ~12 annotated documents
- Several thousand labeled entities across all classes
- Clean character-based spans compatible with spaCy

Example annotation:
```json
{
  "classes": ["DISEASE", "MEDICATION", "SYMPTOM", ...],
  "annotations": [
    ["<text 1>", {"entities": [[start, end, label], ...]}],
    ["<text 2>", {"entities": [...]}],
    ...
  ]
}
```

In [8]:
from pathlib import Path

# Export a reproducible (fixed-seed) sample of 10 notes for manual labelling.
annotation_sample_path = Path('../data/samples/to_annotate.txt')
# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise this cell fails on a fresh checkout.
annotation_sample_path.parent.mkdir(parents=True, exist_ok=True)

sample_texts = df['text'].sample(10, random_state=42)
sample_texts.to_csv(annotation_sample_path, index=False)

### 4.2 Converting JSON to spaCy’s DocBin Format

In [9]:
import sys, os
# Add the notebook's parent directory to the import path so the `src` package resolves.
sys.path.append(os.path.abspath(".."))

from src.ner import train_custom_ner, load_annotated_json

In [10]:
# Convert this JSON into spaCy’s DocBin format:
# load_annotated_json is a project helper from src.ner; presumably it returns a
# spaCy DocBin built from the annotation file (TODO confirm against src/ner.py).
db = load_annotated_json("../data/annotated/annotations.json")
# Persist the converted annotations; the training cell reads "train.spacy".
db.to_disk("train.spacy")

### 4.3 Training the Custom NER Model

In [11]:
# Define the 6 custom labels and train a blank English model from scratch
labels = ["DISEASE", "MEDICATION", "SYMPTOM", "PROCEDURE", "ANATOMY", "LAB_VALUE"]

# Directory for the trained pipeline; the same path is loaded again when the model is applied.
output_dir = "../data/models/custom_ner"

# train_custom_ner is a project helper (src.ner); n_iter=30 presumably sets the
# number of training iterations reported in the loss log (TODO confirm).
nlp_custom = train_custom_ner("train.spacy",output_dir, labels, n_iter=30)

Iteration 1, Losses: {'ner': np.float32(7577.8975)}
Iteration 2, Losses: {'ner': np.float32(7022.537)}
Iteration 3, Losses: {'ner': np.float32(3943.5583)}
Iteration 4, Losses: {'ner': np.float32(1503.7443)}
Iteration 5, Losses: {'ner': np.float32(1403.2487)}
Iteration 6, Losses: {'ner': np.float32(1320.5245)}
Iteration 7, Losses: {'ner': np.float32(1162.0874)}
Iteration 8, Losses: {'ner': np.float32(1173.6781)}
Iteration 9, Losses: {'ner': np.float32(1084.098)}
Iteration 10, Losses: {'ner': np.float32(1077.1304)}
Iteration 11, Losses: {'ner': np.float32(1012.0186)}
Iteration 12, Losses: {'ner': np.float32(969.8941)}
Iteration 13, Losses: {'ner': np.float32(942.35364)}
Iteration 14, Losses: {'ner': np.float32(900.80725)}
Iteration 15, Losses: {'ner': np.float32(848.2076)}
Iteration 16, Losses: {'ner': np.float32(799.437)}
Iteration 17, Losses: {'ner': np.float32(794.9763)}
Iteration 18, Losses: {'ner': np.float32(756.0616)}
Iteration 19, Losses: {'ner': np.float32(736.13043)}
Iteration 

### 4.4 Applying the Custom Model to All Documents

In [12]:
import spacy
import time

# Reload the trained pipeline from disk.
nlp_custom = spacy.load("../data/models/custom_ner")

texts = df['text'].tolist()

# Run the custom model over every note with one worker process per CPU core,
# collecting (entity text, entity label) pairs per document.
start_time = time.time()
custom_doc_stream = nlp_custom.pipe(texts, batch_size=32, n_process=multiprocessing.cpu_count())
ents_list = [
    [(ent.text, ent.label_) for ent in doc.ents]
    for doc in tqdm(custom_doc_stream, total=len(texts))
]
end_time = time.time()

print(f"NER processing time: {end_time - start_time} seconds")
df['custom_ents'] = ents_list

100%|██████████| 4966/4966 [00:41<00:00, 120.26it/s]

NER processing time: 41.29432010650635 seconds





In [13]:
# compute entity frequencies
from collections import Counter

# Tally how often each custom label was predicted across all documents.
ent_counter = Counter(
    label
    for doc_ents in df['custom_ents']
    for _, label in doc_ents
)

ent_counter.most_common()

[('ANATOMY', 68662),
 ('PROCEDURE', 48211),
 ('DISEASE', 28924),
 ('MEDICATION', 3168),
 ('LAB_VALUE', 2904),
 ('SYMPTOM', 1861)]

**Note**
- surgical and radiology notes have a lot of ANATOMY and PROCEDURE entities.
- MEDICATION is comparatively rare.

In [14]:
# Fixed-seed sample of 100 documents (random_state=42), shown with the custom model's entities.
sample_df = df.sample(100, random_state=42)
sample_df[['text', 'custom_ents']].head()

Unnamed: 0,text,custom_ents
3138,"REASON FOR CONSULTATION: , Thyroid mass diagnosed as papillary carcinoma.,HISTORY OF PRESENT ILLNESS: ,The patient is a 16-year-old young lady, who was referred from the Pediatric Endocrinology D...","[(Thyroid, ANATOMY), (papillary carcinoma, DISEASE), (thyroid, ANATOMY), (papillary carcinoma, ANATOMY), (thyroid, ANATOMY), (papillary thyroid, ANATOMY), (hypothyroidism, DISEASE), (thyroid cance..."
1964,"PREOPERATIVE DIAGNOSIS:, Prior history of neoplastic polyps.,POSTOPERATIVE DIAGNOSIS:, Small rectal polyps/removed and fulgurated.,PREMEDICATIONS:, Prior to the colonoscopy, the patient complai...","[(neoplastic polyps, DISEASE), (rectal polyps, DISEASE), (colonoscopy, ANATOMY), (headache, DISEASE), (25 mg, LAB_VALUE), (Demerol, MEDICATION), (Demerol, MEDICATION), (nausea, MEDICATION), (Phene..."
1344,"PROCEDURE PERFORMED: , Esophagogastroduodenoscopy performed in the emergency department.,INDICATION: , Melena, acute upper GI bleed, anemia, and history of cirrhosis and varices.,FINAL IMPRESSION,...","[(Esophagogastroduodenoscopy, ANATOMY), (Melena, DISEASE), (acute upper GI bleed, ANATOMY), (anemia, DISEASE), (cirrhosis, DISEASE), (varices, DISEASE), (stomach, ANATOMY), (fundus, ANATOMY), (End..."
2984,"HISTORY OF PRESENT ILLNESS: , The patient is a 35-year-old woman who reports that on the 30th of October 2008, she had a rupture of her membranes at nine months of pregnancy, and was admitted to h...","[(epidural anesthetic, MEDICATION), (epidural, ANATOMY), (14 to 18 hours, LAB_VALUE), (epidural, ANATOMY), (epidural, ANATOMY), (delivered, ANATOMY), (Cesarean section, PROCEDURE), (failed, MEDICA..."
4910,"PREOPERATIVE DIAGNOSIS: ,Carcinoma of the left upper lobe.,PROCEDURES PERFORMED:,1. Bronchoscopy with aspiration.,2. Left upper lobectomy.,PROCEDURE DETAILS: ,With patient in supine position u...","[(Carcinoma, ANATOMY), (left upper, ANATOMY), (Bronchoscopy, ANATOMY), (aspiration, PROCEDURE), (Left upper lobectomy, PROCEDURE), (placed, ANATOMY), (examine, ANATOMY), (carina, ANATOMY), (carina..."


### 4.5 Evaluate 100 random custom entities

In [None]:
import random
# Seed Python's RNG so the same 100 entities are drawn on every run.
random.seed(42)

# Pool the (entity text, label) pairs of all documents into one flat list.
all_custom_ents = [pair for doc_ents in df["custom_ents"] for pair in doc_ents]

# Draw 100 distinct predictions for manual scoring.
sample_ents = random.sample(all_custom_ents, 100)
eval_df = pd.DataFrame(sample_ents, columns=["entity", "label"])

# To save for manual evaluation
# eval_df.to_csv("../data/custom_ner_evaluation/custom_ner_manual_to_eval.csv", index=False)
# eval_df.head(20)


Unnamed: 0,entity,label
0,dissection,PROCEDURE
1,exchange,ANATOMY
2,heart,ANATOMY
3,bleeding,PROCEDURE
4,knee arthroplasty,ANATOMY
5,Behavioral Health,PROCEDURE
6,removed,PROCEDURE
7,legs,ANATOMY
8,subcutaneous tissue,ANATOMY
9,11 days,LAB_VALUE


In [None]:
# To annotate entities for evaluation
# from src.ner import annotate_entities
# annotate_entities()


--------------------------------------------
Entity: dissection
Label : PROCEDURE

--------------------------------------------
Entity: exchange
Label : ANATOMY

--------------------------------------------
Entity: heart
Label : ANATOMY

--------------------------------------------
Entity: bleeding
Label : PROCEDURE

--------------------------------------------
Entity: knee arthroplasty
Label : ANATOMY

--------------------------------------------
Entity: Behavioral Health
Label : PROCEDURE

--------------------------------------------
Entity: removed
Label : PROCEDURE

--------------------------------------------
Entity: legs
Label : ANATOMY

--------------------------------------------
Entity: subcutaneous tissue
Label : ANATOMY

--------------------------------------------
Entity: 11 days
Label : LAB_VALUE

--------------------------------------------
Entity: layers
Label : ANATOMY

--------------------------------------------
Entity: edema
Label : DISEASE

------------------------

Unnamed: 0,entity,label,correct
0,dissection,PROCEDURE,1
1,exchange,ANATOMY,0
2,heart,ANATOMY,1
3,bleeding,PROCEDURE,0
4,knee arthroplasty,ANATOMY,1
...,...,...,...
95,AST,SYMPTOM,1
96,rhythm,PROCEDURE,1
97,placed,PROCEDURE,1
98,lateral pedicles,ANATOMY,1


In [23]:
# Load the hand-scored sample of custom-model predictions.
df_eval = pd.read_csv("../data/custom_ner_evaluation/custom_ner_manual_evaluated.csv")

# `correct` appears to hold 1 (right) / 0 (wrong) per entity, so its mean is
# the share of sampled predictions judged correct.
accuracy = df_eval["correct"].mean()

print("Custom NER Manual Accuracy:", accuracy)

Custom NER Manual Accuracy: 0.43


The custom medical NER model clearly outperforms the general-purpose spaCy NER on clinical text. While the baseline model only achieved around 30% accuracy in a manual evaluation of 100 random entities, the custom model improved this to about 43%. Most of the remaining errors are due to confusion between symptoms vs. diseases (e.g. “weakness”, “swelling”, “paresthesias”) and some cases where non-entities were labeled as entities.

# 5. Investigate using an LLM-based NER classifier

In [42]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

local_path = "../models/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForTokenClassification.from_pretrained(local_path)

# Use Apple's Metal (MPS) backend when PyTorch reports it as built and available;
# otherwise fall back to the CPU.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"

model.to(device)

# Class index -> tag name from the model config, so the results carry readable
# labels instead of raw argmax indices.
id2label = model.config.id2label

results = []
for text in tqdm(texts):
    # NOTE: truncation=True cuts each note at the tokenizer's maximum length,
    # so tokens beyond that limit receive no prediction.
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    ).to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**encoding).logits

    # Highest-scoring class per token for the single sequence in the batch.
    predictions = torch.argmax(logits, dim=2)[0].cpu().numpy()
    tokens = encoding.tokens()

    # One (token, label name) pair per token of the encoded input.
    results.append([(token, id2label[int(pred)]) for token, pred in zip(tokens, predictions)])

KeyboardInterrupt: 

In [43]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Local copy of the token-classification checkpoint used for the transformer-based NER.
local_path = "../models/biomedical-ner-all"

tokenizer = AutoTokenizer.from_pretrained(local_path)
model = AutoModelForTokenClassification.from_pretrained(local_path)

# Wrap model + tokenizer in a Hugging Face "ner" pipeline. With an aggregation
# strategy set, each result exposes 'word' and 'entity_group' (the keys read by
# the cells that consume this pipeline's output).
nlp_clinical_ner_bert = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

Device set to use mps:0


In [37]:
from tqdm.auto import tqdm
import time

# Run the transformer pipeline one document at a time and keep
# (entity text, entity group) pairs per document.
results = []
start_time = time.time()
for text in tqdm(texts):
    # NOTE(review): the pipeline receives a single string per call, so
    # batch_size=16 presumably brings little or no batching benefit here — confirm.
    ents = nlp_clinical_ner_bert(text, batch_size=16)
    results.append([(e['word'], e['entity_group']) for e in ents])
end_time = time.time()
print(f"NER processing time: {end_time - start_time} seconds")
df['clinical_bert_ents'] = results

KeyboardInterrupt: 

In [44]:
import time
from tqdm.auto import tqdm

texts = df["text"].tolist()

# Group the notes into lists of `batch_size` so each pipeline call receives
# several documents at once.
batch_size = 16
batches = [texts[lo:lo + batch_size] for lo in range(0, len(texts), batch_size)]

results = []
start = time.time()
for batch in tqdm(batches):
    # The pipeline output is iterated per document; keep (word, entity group) pairs.
    results.extend(
        [(e["word"], e["entity_group"]) for e in output]
        for output in nlp_clinical_ner_bert(batch)
    )
end = time.time()

elapsed = end - start
df["clinical_bert_ents"] = results
print("Total time:", elapsed)
print("Average per document:", elapsed / len(texts))

KeyboardInterrupt: 

In [None]:
from collections import Counter

# Tally how often each entity group was predicted by the transformer pipeline.
ent_counter = Counter(
    label
    for doc_ents in df['clinical_bert_ents']
    for _, label in doc_ents
)

ent_counter.most_common()

In [None]:
import random
# Seed Python's RNG so the same 100 entities are drawn on every run.
random.seed(52)

# Pool the (entity text, entity group) pairs of all documents into one flat list.
all_custom_ents = [pair for doc_ents in df["clinical_bert_ents"] for pair in doc_ents]

# Draw 100 distinct predictions and store them for manual scoring.
sample_ents = random.sample(all_custom_ents, 100)
eval_df = pd.DataFrame(sample_ents, columns=["entity", "label"])

eval_df.to_csv("../data/LLM_based_NER_evaluation/LLM_based_ner_manual_eval.csv", index=False)
eval_df.head(20)

In [None]:
# From https://huggingface.co/Helios9/BioMed_NER

def merge_consecutive_entities(entities, text):
    """Merge touching or overlapping entities of the same group into one span.

    Entities are processed in order of their ``'start'`` offset. An entity is
    folded into the current one when both share the same ``'entity_group'``
    and it starts at or before the current entity's ``'end'``.

    Args:
        entities: Iterable of dicts with at least the keys ``'entity_group'``,
            ``'start'``, ``'end'``, ``'word'`` and ``'score'``.
        text: The string the offsets refer to; the merged ``'word'`` is
            re-sliced from it as ``text[start:end]``.

    Returns:
        A new list of entity dicts ordered by ``'start'``. The input dicts are
        left untouched: every returned entity is a copy.
    """
    merged_entities = []
    current_entity = None

    for entity in sorted(entities, key=lambda x: x['start']):
        if (
            current_entity is not None
            and entity['entity_group'] == current_entity['entity_group']
            and entity['start'] <= current_entity['end']
        ):
            # Merge based on start and end positions in the text
            current_entity['end'] = max(current_entity['end'], entity['end'])
            current_entity['word'] = text[current_entity['start']:current_entity['end']]
            # Running pairwise average, kept as in the upstream snippet; it is
            # not the arithmetic mean over all merged pieces.
            current_entity['score'] = (current_entity['score'] + entity['score']) / 2
        else:
            if current_entity is not None:
                merged_entities.append(current_entity)
            # Work on a copy so merging never rewrites the caller's dicts
            # (e.g. the raw pipeline output).
            current_entity = dict(entity)

    if current_entity is not None:
        merged_entities.append(current_entity)

    return merged_entities

In [None]:
# Build a second "ner" pipeline from the Helios9/BioMed_NER checkpoint.
model_name = 'Helios9/BioMed_NER'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
nlp_clinical_ner_bert = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Tag each note, merge touching same-group spans, and keep
# (entity text, entity group) pairs per document.
results = []
for text in tqdm(texts):
    merged = merge_consecutive_entities(nlp_clinical_ner_bert(text), text)
    results.append([(e['word'], e['entity_group']) for e in merged])

df['clinical_bert_ents'] = results

In [None]:
# Preview the first few documents only; displaying the full list would dump the
# entities of every note into the notebook output.
results[:5]