In [None]:
!pip install transformers datasets torch gradio




In [None]:
from google.colab import files
uploaded = files.upload()


Saving best_model.zip to best_model.zip


In [None]:
import zipfile, os
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Match the uploaded filename
zip_path = "best_model.zip"   # <-- notice the (1)
extract_dir = "best_model"

# Extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Load
model_path = extract_dir
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

id2label = model.config.id2label
print("Loaded labels:", id2label)


Loaded labels: {0: 'B-Activity', 1: 'B-Administration', 2: 'B-Age', 3: 'B-Area', 4: 'B-Biological_structure', 5: 'B-Clinical_event', 6: 'B-Color', 7: 'B-Coreference', 8: 'B-Date', 9: 'B-Detailed_description', 10: 'B-Diagnostic_procedure', 11: 'B-Disease_disorder', 12: 'B-Distance', 13: 'B-Dosage', 14: 'B-Duration', 15: 'B-Family_history', 16: 'B-Frequency', 17: 'B-Height', 18: 'B-History', 19: 'B-Lab_value', 20: 'B-Medication', 21: 'B-Nonbiological_location', 22: 'B-Occupation', 23: 'B-Other_entity', 24: 'B-Other_event', 25: 'B-Outcome', 26: 'B-Personal_background', 27: 'B-Qualitative_concept', 28: 'B-Quantitative_concept', 29: 'B-Severity', 30: 'B-Sex', 31: 'B-Shape', 32: 'B-Sign_symptom', 33: 'B-Subject', 34: 'B-Texture', 35: 'B-Therapeutic_procedure', 36: 'B-Time', 37: 'B-Volume', 38: 'I-Activity', 39: 'I-Administration', 40: 'I-Age', 41: 'I-Area', 42: 'I-Biological_structure', 43: 'I-Clinical_event', 44: 'I-Color', 45: 'I-Coreference', 46: 'I-Date', 47: 'I-Detailed_description', 48

In [None]:
from datasets import load_dataset

# load full dataset
raw = load_dataset("AGBonnet/augmented-clinical-notes")

# sample exactly 200 notes
sample = raw["train"].shuffle(seed=42).select(range(200))

# split 100 / 100
train_notes = sample.select(range(100))
test_notes  = sample.select(range(100, 200))

print(len(train_notes), len(test_notes))  # should print (100, 100)


100 100


In [None]:
import torch
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def _is_special(tok: str) -> bool:
    return tok in ("[CLS]", "[SEP]", "[PAD]")

def _merge_wordpieces(tokens):
    """return a list of (word, token_idxs) where word merges ##pieces."""
    words, curr, idxs = [], "", []
    for i, t in enumerate(tokens):
        if _is_special(t):
            # flush current if any
            if curr:
                words.append((curr, idxs))
                curr, idxs = "", []
            continue
        if t.startswith("##"):
            curr += t[2:]
            idxs.append(i)
        else:
            if curr:
                words.append((curr, idxs))
            curr, idxs = t, [i]
    if curr:
        words.append((curr, idxs))
    return words

def _bio_to_spans(words, word_labels):
    """
    words: list[str] (merged words)
    word_labels: list[str] same length as words
    returns list of {'label','text','start','end'}
    """
    spans = []
    curr_label, curr_tokens = None, []
    for w, lab in zip(words, word_labels):
        if lab == "O":
            if curr_label:
                spans.append({"label": curr_label,
                              "text": " ".join(curr_tokens),
                              "start": None, "end": None})
                curr_label, curr_tokens = None, []
            continue
        # lab like B-Disease_disorder / I-Disease_disorder
        tag, ent = lab.split("-", 1)
        if tag == "B" or (curr_label and ent != curr_label):
            if curr_label:
                spans.append({"label": curr_label,
                              "text": " ".join(curr_tokens),
                              "start": None, "end": None})
            curr_label = ent
            curr_tokens = [w]
        else:  # I- same entity
            if curr_label is None:
                curr_label = ent
                curr_tokens = [w]
            else:
                curr_tokens.append(w)
    if curr_label:
        spans.append({"label": curr_label,
                      "text": " ".join(curr_tokens),
                      "start": None, "end": None})
    return spans

def predict_entities(text: str):
    """
    returns:
      tokens_labels: list of (token, label) at subword level
      word_spans: list of dicts with 'label' and 'text' (merged)
    """
    # tokenize
    enc = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    with torch.no_grad():
        logits = model(**enc).logits
        pred_ids = torch.argmax(logits, dim=2)[0].tolist()

    token_ids = enc["input_ids"][0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    # map ids -> BIO labels
    labels = [id2label.get(i, "O") for i in pred_ids]

    # subword-level pairs (for debugging)
    tokens_labels = [(t, l) for t, l in zip(tokens, labels) if not _is_special(t)]

    # merge to word level and carry the majority label per word
    merged = _merge_wordpieces(tokens)
    words = [w for w, idxs in merged]

    # choose the first non-O label among pieces for each word (simple and robust)
    word_labels = []
    for _, idxs in merged:
        labs = [labels[i] for i in idxs if labels[i] != "O"]
        word_labels.append(labs[0] if labs else "O")

    # BIO → spans
    spans = _bio_to_spans(words, word_labels)
    return tokens_labels, spans


In [None]:
for i in range(2):
    text = test_notes[i]["note"]
    toks, ents = predict_entities(text[:800])  # trim view
    print(f"\n=== NOTE {i} ===")
    print("Entities:")
    for e in ents[:20]:
        print(f" - [{e['label']}] {e['text']}")



=== NOTE 0 ===
Entities:
 - [Age] 15
 - [Age] -
 - [Date] year
 - [Age] -
 - [Sex] old
 - [Sex] boy
 - [Clinical_event] admitted
 - [Nonbiological_location] emergency department
 - [Detailed_description] blunt
 - [Biological_structure] abdominal
 - [Sign_symptom] trauma
 - [Sign_symptom] hit
 - [Therapeutic_procedure] intubated
 - [Sign_symptom] haemodynamically
 - [Diagnostic_procedure] blood pressure
 - [Diagnostic_procedure] laboratory tests
 - [Diagnostic_procedure] ct
 - [Diagnostic_procedure] computed tomographic
 - [Biological_structure] thorax
 - [Biological_structure] abdomen

=== NOTE 1 ===
Entities:
 - [Age] 21
 - [Age] -
 - [Date] year
 - [Sex] -
 - [Sex] old
 - [Sex] male
 - [Biological_structure] right
 - [Biological_structure] femur
 - [Biological_structure] splenic
 - [Sign_symptom] injury
 - [Diagnostic_procedure] laparotomy
 - [Therapeutic_procedure] splenectomy
 - [Detailed_description] distal
 - [Biological_structure] femoral
 - [Therapeutic_procedure] traction
 - 

In [None]:
import json, os

os.makedirs("ner_outputs", exist_ok=True)
out_path = "ner_outputs/agbonnet_ner_test100.jsonl"

with open(out_path, "w", encoding="utf-8") as f:
    for row in test_notes:
        text = row["note"]
        _, spans = predict_entities(text)
        rec = {"idx": row.get("idx", None),
               "note": text,
               "entities": spans}
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

out_path


'ner_outputs/agbonnet_ner_test100.jsonl'

In [None]:
import gradio as gr

def gradio_predict(text):
    _, ents = predict_entities(text)
    # present as label → list of strings
    out = {}
    for e in ents:
        out.setdefault(e["label"], []).append(e["text"])
    return out

gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(lines=6, placeholder="Paste clinical text..."),
    outputs="json",
    title="AGBonnet NER (your fine-tuned model)"
).launch(debug=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fc7c44ab7ce098d56e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://fc7c44ab7ce098d56e.gradio.live




In [None]:
!pip install -q "datasets>=2.20.0" "sentence-transformers>=2.6.1" "pinecone>=3.0.0"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/240.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/65.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset

ds = load_dataset("adlbh/umls-concepts")
print(ds)
print(ds["train"][0])   # fields are typically: ENTITY, NAME, ALIASES, DEFINITION


README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/52.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/474872 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ENTITY', 'DEFINITION', 'ALIASES', 'NAME'],
        num_rows: 474872
    })
})
{'ENTITY': 'C0003725', 'DEFINITION': 'Arthropod-borne viruses. A non-taxonomic designation for viruses that can replicate in both vertebrate hosts and arthropod vectors. Included are some members of the following families: ARENAVIRIDAE; BUNYAVIRIDAE; REOVIRIDAE; TOGAVIRIDAE; and FLAVIVIRIDAE. (From Dictionary of Microbiology and Molecular Biology, 2nd ed)', 'ALIASES': 'Arbovirus (navigational concept)|arbovirus|Arboviruses|Arthropod-borne Virus|Viruses, Arthropod-Borne|ARBOVIRUS|arboviruses|Arbovirus|Arthropod Borne Viruses|Virus, Arthropod-Borne|Arbovirus, NOS|Arthropod-Borne Viruses|Arthropod-Borne Virus', 'NAME': 'Arboviruses'}


In [None]:
# Build a list of unique concepts; keep metadata (definition, aliases)
rows_raw = ds["train"]

seen = set()
rows = []
for r in rows_raw:
    cui = r["ENTITY"]
    if cui in seen:
        continue
    seen.add(cui)
    rows.append({
        "cui": cui,
        "name": r["NAME"],
        "definition": r.get("DEFINITION", "") or "",
        "aliases": r.get("ALIASES", []) or []
    })

len(rows), rows[0]


(297086,
 {'cui': 'C0003725',
  'name': 'Arboviruses',
  'definition': 'Arthropod-borne viruses. A non-taxonomic designation for viruses that can replicate in both vertebrate hosts and arthropod vectors. Included are some members of the following families: ARENAVIRIDAE; BUNYAVIRIDAE; REOVIRIDAE; TOGAVIRIDAE; and FLAVIVIRIDAE. (From Dictionary of Microbiology and Molecular Biology, 2nd ed)',
  'aliases': 'Arbovirus (navigational concept)|arbovirus|Arboviruses|Arthropod-borne Virus|Viruses, Arthropod-Borne|ARBOVIRUS|arboviruses|Arbovirus|Arthropod Borne Viruses|Virus, Arthropod-Borne|Arbovirus, NOS|Arthropod-Borne Viruses|Arthropod-Borne Virus'})

In [None]:
from sentence_transformers import SentenceTransformer
import torch, numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"

MODEL_CANDIDATES = [
    "cambridgeltl/SapBERT-from-PubMedBERT-fulltext",  # ✅ main
    "sentence-transformers/all-MiniLM-L6-v2"          # fallback if SapBERT unavailable
]

encoder = None
last_err = None
for mid in MODEL_CANDIDATES:
    try:
        print(f"Loading: {mid}")
        encoder = SentenceTransformer(mid, device=device)
        break
    except Exception as e:
        last_err = e
        print(f"Failed to load {mid}: {e}\nTrying fallback...\n")

if encoder is None:
    raise RuntimeError(f"Could not load any embedding model. Last error:\n{last_err}")


Loading: cambridgeltl/SapBERT-from-PubMedBERT-fulltext




config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
names = [r["name"] for r in rows]

embeddings = encoder.encode(
    names,
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,   # cosine-ready
    show_progress_bar=True
)

embeddings.shape  # (N, dim)


Batches:   0%|          | 0/4642 [00:00<?, ?it/s]

(297086, 768)

In [6]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: packaging, pinecone-plugin-assi

In [2]:
from google.colab import drive
drive.mount('/content/drive')

# Path to your key file in Drive
key_path = "/content/drive/My Drive/GenAI_keys/pinecone_api.txt"

# Read key securely
with open(key_path, "r") as f:
    PINECONE_API_KEY = f.read().strip()

print("Pinecone API key loaded successfully.")

from pinecone import Pinecone, ServerlessSpec

CLOUD, REGION = "aws", "us-east-1"
pc = Pinecone(api_key=PINECONE_API_KEY)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Pinecone API key loaded successfully.


In [None]:
from pinecone import Pinecone, ServerlessSpec
import os


assert "PINECONE_API_KEY" in globals() and PINECONE_API_KEY, \
    "API key not found. Please load it from Google Drive first."

CLOUD, REGION = "aws", "us-east-1"


pc = Pinecone(api_key=PINECONE_API_KEY)


index_name = "umls-concepts-sapbert"
dims = embeddings.shape[1]


existing = [idx.name for idx in pc.list_indexes()]
if index_name not in existing:
    pc.create_index(
        name=index_name,
        dimension=dims,
        metric="cosine",
        spec=ServerlessSpec(cloud=CLOUD, region=REGION),
    )


index = pc.Index(index_name)
index.describe_index_stats()


{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'umls-v1': {'vector_count': 297086}},
 'total_vector_count': 297086,
 'vector_type': 'dense'}

In [None]:
NAMESPACE = "umls-v1"

In [None]:
def search_umls(query: str, k: int = 5):
    q_vec = encoder.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(vector=q_vec.tolist(), top_k=k, include_metadata=True, namespace=NAMESPACE)
    return [
        {
            "cui": m.id,
            "score": float(m.score),
            "name": m.metadata.get("name"),
            "aliases": m.metadata.get("aliases", [])[:5]
        }
        for m in res.matches
    ]


In [None]:
for q in ["high blood sugar", "heart attack", "pneumonia", "metformin", "shortness of breath"]:
    print(f"\nQuery: {q}")
    for r in search_umls(q, k=5):
        print(f"  {r['score']:.3f}  {r['cui']}  {r['name']}  {r['aliases']}")



Query: high blood sugar
  0.899  C0495706  elevated blood glucose level  blood
  0.886  C0595877  Blood glucose increased  incre
  0.869  C0860803  glucose high  high 
  0.847  C0017747  Increased glucose level  incre
  0.846  C4692512  Elevated serum glucose (in some patients)  []

Query: heart attack
  0.855  C0235462  angina attack  ANGIN
  0.829  C0741923  cardiac event  cardi
  0.825  C0745413  ischemic attack  attac
  0.776  C0027051  Myocardial Infarction  Infar
  0.775  C0376297  Cardiac Death  cardi

Query: pneumonia
  1.000  C0032285  Pneumonia  lung 
  0.914  C0543829  pneumonia clinical  clini
  0.913  C0729704  Infective pneumonia  infec
  0.861  C3714636  Pneumonitis  Infla
  0.833  C0264376  Non-infectious pneumonia  Pneum

Query: metformin
  1.000  C0025598  metformin  Dimet
  0.873  C0770893  metformin hydrochloride  metfo
  0.696  C4683813  Milademetan  MILAD
  0.695  C0701230  Diabetol  []
  0.694  C4720071  Metformin Hydrochloride Tablets  []

Query: shortness of b

In [None]:
!pip install -q rapidfuzz


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from rapidfuzz.fuzz import token_set_ratio

def lexical_score(q: str, cand: str) -> float:
    """0..1 similarity; robust to word order & small noise."""
    return token_set_ratio(q, cand) / 100.0


In [None]:
from functools import lru_cache

def _lower_list(x):
    return [s.lower() for s in (x or [])]

@lru_cache(maxsize=10000)
def expand_with_umls(mention: str, top_k: int = 5, threshold: float = 0.85) -> str:
    """
    Use Pinecone (UMLS) to expand short/abbreviated mentions to a canonical name.
    If high-confidence match is found (score>=threshold) and looks like an alias/short form,
    return the canonical NAME; otherwise return the original mention.
    """
    q = mention.strip()
    if not q:
        return mention

    # get embedding
    q_vec = encoder.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(
        vector=q_vec.tolist(),
        top_k=top_k,
        include_metadata=True,
        namespace=NAMESPACE,
    )

    if not res.matches:
        return mention

    best = res.matches[0]
    name = (best.metadata.get("name") or "").strip()
    aliases = _lower_list(best.metadata.get("aliases", []))
    score = float(best.score)

    # Heuristics to decide it's really an abbreviation:
    #  - very short original text (<= 5 chars) OR
    #  - original appears in aliases OR
    #  - canonical is clearly longer (more informative) AND score high
    is_short = len(q) <= 5 and " " not in q
    in_aliases = q.lower() in aliases
    more_informative = len(name) > len(q) + 3

    if score >= threshold and (is_short or in_aliases or more_informative):
        return name  # expand to canonical
    return mention  # keep original


In [None]:
from rapidfuzz.fuzz import token_set_ratio

def lexical_score(q: str, cand: str) -> float:
    return token_set_ratio(q, cand) / 100.0

def normalize_mention_to_umls(mention: str, k: int = 5, alpha: float = 0.7, namespace: str = NAMESPACE):
    """
    mention -> best UMLS candidate dict (combines semantic + lexical)
    alpha: weight for semantic cosine score vs lexical similarity (0..1)
    """
    # dynamic expansion
    expanded = expand_with_umls(mention)

    # query Pinecone with expanded text
    q_vec = encoder.encode([expanded], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(vector=q_vec.tolist(), top_k=k, include_metadata=True, namespace=namespace)

    if not res.matches:
        return None

    ranked = []
    for m in res.matches:
        name = (m.metadata.get("name") or "").strip()
        sem = float(m.score)
        lex = lexical_score(expanded, name) if name else 0.0
        combo = alpha * sem + (1.0 - alpha) * lex
        ranked.append({
            "cui": m.id,
            "name": name,
            "semantic": sem,
            "lexical": lex,
            "score": combo,
            "aliases": m.metadata.get("aliases", [])[:5],
        })

    ranked.sort(key=lambda x: x["score"], reverse=True)
    best = ranked[0]
    return {"mention": mention, "expanded": expanded, **best}


In [None]:
def normalize_note(text: str, k: int = 5, alpha: float = 0.7, score_threshold: float = 0.58):
    """
    returns:
      {
        "entities": [ {label, text} ... ],
        "normalized": [ {mention, expanded, cui, name, score, semantic, lexical, aliases} ... ],
        "uncertain": [ ... ]   # below threshold or no matches
      }
    """
    _, spans = predict_entities(text)
    mentions = [s["text"] for s in spans]

    # dedupe while preserving order
    seen, uniq = set(), []
    for m in mentions:
        key = m.lower().strip()
        if key not in seen:
            seen.add(key)
            uniq.append(m)

    normalized, uncertain = [], []
    for m in uniq:
        hit = normalize_mention_to_umls(m, k=k, alpha=alpha, namespace=NAMESPACE)
        if not hit:
            uncertain.append({"mention": m, "reason": "no_matches"})
            continue
        if hit["score"] >= score_threshold:
            normalized.append(hit)
        else:
            hit["reason"] = "low_score"
            uncertain.append(hit)

    return {
        "entities": spans,
        "normalized": normalized,
        "uncertain": uncertain
    }


In [None]:
test_note = """A 62-year-old male with a history of hypertension and diabetes presented
to the emergency department complaining of SOB and chest pain for 2 days.
ECG showed ST-segment elevations consistent with MI.
Labs revealed elevated glucose and BNP. Started on ASA, clopidogrel, and IV furosemide.
Scheduled echo to assess LV function."""
res = normalize_note(test_note, k=5, alpha=0.7, score_threshold=0.58)
res


{'entities': [{'label': 'Age', 'text': '62', 'start': None, 'end': None},
  {'label': 'Age', 'text': '-', 'start': None, 'end': None},
  {'label': 'Date', 'text': 'year', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'old', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'male', 'start': None, 'end': None},
  {'label': 'Sign_symptom',
   'text': 'hypertension',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'diabetes', 'start': None, 'end': None},
  {'label': 'Clinical_event', 'text': 'presented', 'start': None, 'end': None},
  {'label': 'Sign_symptom', 'text': 'sob', 'start': None, 'end': None},
  {'label': 'Biological_structure',
   'text': 'chest',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'pain', 'start': None, 'end': None},
  {'label': 'Diagnostic_procedure', 'text': 'ecg', 'start': None, 'end': None},
  {'label': 'Biological_structure', 'text': 'st', 'start': None, 'end': None},
  {'label': 'Diagnostic_procedu

In [None]:
import gradio as gr

def gradio_normalize(text):
    out = normalize_note(text, k=5, alpha=0.7, score_threshold=0.58)
    return {
        "normalized": [
            {"mention": r["mention"], "expanded": r["expanded"], "cui": r["cui"], "term": r["name"], "score": round(r["score"], 3)}
            for r in out["normalized"]
        ],
        "uncertain": [
            {"mention": r["mention"], "reason": r.get("reason", "low_score")}
            for r in out["uncertain"]
        ]
    }

gr.Interface(
    fn=gradio_normalize,
    inputs=gr.Textbox(lines=8, placeholder="Paste a clinical note..."),
    outputs="json",
    title="Clinical NER → UMLS Normalization (Dynamic Abbrev Expansion)",
    description="Uses your fine-tuned NER + SapBERT + Pinecone(UMLS)."
).launch(debug=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://8e00ec32c8589a39ca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://8e00ec32c8589a39ca.gradio.live




In [None]:
# What labels do we actually want to normalize?
NORMALIZE_LABELS = {
    "Disease_disorder",
    "Medication",
    "Sign_symptom",
    "Therapeutic_procedure",
    "Diagnostic_procedure",
    "Lab_value",
    "Clinical_event",
    "Outcome",
}

# Labels to always skip (you can refine)
SKIP_LABELS = {
    "Age", "Sex", "Subject", "Coreference", "Date", "Time",
    "Height", "Volume", "Area", "Distance", "Shape", "Texture",
    "Color", "History", "Personal_background", "Occupation",
    "Other_entity", "Other_event", "Nonbiological_location",
}

# Basic text filters
import re
STOP_TOKENS = {",", ".", ";", ":", "-", "(", ")", "[", "]", "{", "}", "/", "\\", "'", '"'}
def looks_trivial(s: str) -> bool:
    s_ = s.strip().lower()
    if not s_:
        return True
    if s_ in STOP_TOKENS:
        return True
    if re.fullmatch(r"[\W_]+", s_):  # pure punctuation
        return True
    if re.fullmatch(r"\d+(\.\d+)?", s_):  # pure number
        return True
    if len(s_) == 1:  # single char
        return True
    return False


In [None]:
from functools import lru_cache

def _lower_list(x):
    return [s.lower() for s in (x or [])]

@lru_cache(maxsize=10000)
def expand_with_umls(mention: str, top_k: int = 10, threshold: float = 0.80) -> str:
    """
    Use Pinecone(UMLS) to expand short/abbrev mentions.
    Prefer candidates whose aliases contain the exact mention, or whose name is clearly more informative.
    """
    q = mention.strip()
    if not q:
        return mention

    q_vec = encoder.encode([q], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(vector=q_vec.tolist(), top_k=top_k, include_metadata=True, namespace=NAMESPACE)
    if not res.matches:
        return mention

    q_l = q.lower()
    best = None
    # 1) Prefer alias-exact matches
    for m in res.matches:
        aliases = _lower_list(m.metadata.get("aliases", []))
        if q_l in aliases:
            best = m
            break

    # 2) Else fall back to top score, but only if more informative and confident
    if best is None:
        top = res.matches[0]
        name = (top.metadata.get("name") or "").strip()
        if float(top.score) >= threshold and len(name) > len(q) + 3:
            best = top

    # 3) If still none, keep original
    if best is None:
        return mention

    return (best.metadata.get("name") or mention).strip()


In [None]:
from rapidfuzz.fuzz import token_set_ratio

def lexical_score(q: str, cand: str) -> float:
    return token_set_ratio(q, cand) / 100.0

def normalize_mention_to_umls(mention: str, k: int = 10, alpha: float = 0.7, alias_bonus: float = 0.1, namespace: str = NAMESPACE):
    expanded = expand_with_umls(mention)

    q_vec = encoder.encode([expanded], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(vector=q_vec.tolist(), top_k=k, include_metadata=True, namespace=namespace)
    if not res.matches:
        return None

    ranked = []
    q_l = expanded.lower()
    for m in res.matches:
        name = (m.metadata.get("name") or "").strip()
        sem = float(m.score)
        lex = lexical_score(expanded, name) if name else 0.0
        combo = alpha * sem + (1 - alpha) * lex

        aliases = _lower_list(m.metadata.get("aliases", []))
        if q_l in aliases:
            combo += alias_bonus  # boost exact alias matches

        ranked.append({
            "cui": m.id,
            "name": name,
            "semantic": sem,
            "lexical": lex,
            "score": combo,
            "aliases": m.metadata.get("aliases", [])[:5],
        })

    ranked.sort(key=lambda x: x["score"], reverse=True)
    best = ranked[0]
    return {"mention": mention, "expanded": expanded, **best}


In [None]:
def filter_mentions(spans):
    """Select only meaningful mentions to normalize."""
    keep = []
    for s in spans:
        lab = s["label"]
        txt = (s["text"] or "").strip()
        if lab in SKIP_LABELS:
            continue
        if lab not in NORMALIZE_LABELS:
            # optional: allow chest pain even if model labels separately
            # you can handle special cases here
            continue
        if looks_trivial(txt):
            continue
        keep.append((lab, txt))
    return keep


In [None]:
def normalize_note(text: str, k: int = 10, alpha: float = 0.7, score_threshold: float = 0.60):
    """
    returns:
      {
        "entities": [ {label, text} ... ],  # raw spans from NER
        "normalized": [ {mention, expanded, cui, name, score, semantic, lexical, aliases} ... ],
        "uncertain": [ ... ]   # below threshold or no matches
      }
    """
    _, spans = predict_entities(text)
    mentions = filter_mentions(spans)

    # dedupe by text (case-insensitive)
    seen, uniq = set(), []
    for lab, txt in mentions:
        key = txt.lower().strip()
        if key not in seen:
            seen.add(key)
            uniq.append((lab, txt))

    normalized, uncertain = [], []
    for lab, txt in uniq:
        hit = normalize_mention_to_umls(txt, k=k, alpha=alpha, alias_bonus=0.12, namespace=NAMESPACE)
        if not hit:
            uncertain.append({"mention": txt, "label": lab, "reason": "no_matches"})
            continue
        if hit["score"] >= score_threshold:
            hit["label"] = lab
            normalized.append(hit)
        else:
            hit["label"] = lab
            hit["reason"] = "low_score"
            uncertain.append(hit)

    return {
        "entities": spans,          # for debugging
        "normalized": normalized,   # confident mappings
        "uncertain": uncertain      # low-score/no-match
    }


In [None]:
test_note = """A 62-year-old male with a history of hypertension and diabetes presented
to the emergency department complaining of SOB and chest pain for 2 days.
ECG showed ST-segment elevations consistent with myocardial infarction.
Labs revealed elevated glucose and BNP. Started on aspirin, clopidogrel, and IV furosemide.
Scheduled echo to assess LV function."""
res = normalize_note(test_note, k=10, alpha=0.7, score_threshold=0.60)
res


{'entities': [{'label': 'Age', 'text': '62', 'start': None, 'end': None},
  {'label': 'Age', 'text': '-', 'start': None, 'end': None},
  {'label': 'Date', 'text': 'year', 'start': None, 'end': None},
  {'label': 'Sex', 'text': '-', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'old', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'male', 'start': None, 'end': None},
  {'label': 'Sign_symptom',
   'text': 'hypertension',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'diabetes', 'start': None, 'end': None},
  {'label': 'Clinical_event', 'text': 'presented', 'start': None, 'end': None},
  {'label': 'Sign_symptom', 'text': 'sob', 'start': None, 'end': None},
  {'label': 'Biological_structure',
   'text': 'chest',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'pain', 'start': None, 'end': None},
  {'label': 'Diagnostic_procedure', 'text': 'ecg', 'start': None, 'end': None},
  {'label': 'Biological_structure', 'text': 'st',

In [None]:
import gradio as gr
import json

EXAMPLE_NOTE = """A 62-year-old male with a history of hypertension and diabetes presented
to the emergency department complaining of SOB and chest pain for 2 days.
ECG showed ST-segment elevations consistent with MI.
Labs revealed elevated glucose and BNP. Started on aspirin, clopidogrel, and IV furosemide.
Scheduled echo to assess LV function."""

def run_ner(text):
    _, spans = predict_entities(text)
    # present as label -> [mentions]
    out = {}
    for s in spans:
        out.setdefault(s["label"], []).append(s["text"])
    return out

def run_normalize(text, k, alpha, score_threshold):
    res = normalize_note(text, k=int(k), alpha=float(alpha), score_threshold=float(score_threshold))
    # compact the normalized & uncertain lists
    normalized = [
        {
            "label": r.get("label", ""),
            "mention": r["mention"],
            "expanded": r["expanded"],
            "cui": r["cui"],
            "term": r["name"],
            "score": round(r["score"], 3),
            "semantic": round(r["semantic"], 3),
            "lexical": round(r["lexical"], 3),
        }
        for r in res["normalized"]
    ]
    uncertain = [
        {
            "label": r.get("label", ""),
            "mention": r["mention"],
            "reason": r.get("reason", "low_score")
        }
        for r in res["uncertain"]
    ]
    return {
        "normalized": normalized,
        "uncertain": uncertain
    }

with gr.Blocks(title="Clinical NER → UMLS Normalization") as demo:
    gr.Markdown("## Clinical NER → UMLS Normalization\nUses your fine-tuned NER + SapBERT embeddings + Pinecone (UMLS index).")

    with gr.Tab("1) NER only"):
        inp1 = gr.Textbox(lines=10, label="Clinical note")
        ner_btn = gr.Button("Extract Entities")
        ner_out = gr.JSON(label="Entities (grouped by NER label)")
        ner_btn.click(run_ner, inputs=inp1, outputs=ner_out)
        gr.Examples([EXAMPLE_NOTE], inputs=inp1, label="Try an example")

    with gr.Tab("2) Normalize to UMLS"):
        inp2 = gr.Textbox(lines=10, label="Clinical note")
        with gr.Row():
            k_in = gr.Slider(3, 20, value=10, step=1, label="Top-k (Pinecone)")
            alpha_in = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Semantic weight (alpha)")
            thr_in = gr.Slider(0.4, 0.9, value=0.60, step=0.01, label="Accept threshold")
        norm_btn = gr.Button("Normalize Entities")
        norm_out = gr.JSON(label="UMLS normalization (normalized / uncertain)")
        norm_btn.click(run_normalize, inputs=[inp2, k_in, alpha_in, thr_in], outputs=norm_out)
        gr.Examples([EXAMPLE_NOTE], inputs=inp2, label="Try an example")

demo.launch(debug=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://3fc9dee122a18aa6b1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://3fc9dee122a18aa6b1.gradio.live




In [None]:
!pip install -q google-generativeai


In [4]:
import os
import google.generativeai as genai
from google.colab import drive


drive.mount('/content/drive')


key_path = "/content/drive/My Drive/GenAI_keys/gemini_api.txt"


with open(key_path, "r") as f:
    GEMINI_API_KEY = f.read().strip()


os.environ["GOOGLE_API_KEY"] = GEMINI_API_KEY
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


GEMINI_MODEL_NAME = "gemini-1.5-flash"  # or "gemini-1.5-pro"

generation_config = {
    "temperature": 0.0,
    "response_mime_type": "application/json",
}


gemini_model = genai.GenerativeModel(
    model_name=GEMINI_MODEL_NAME,
    generation_config=generation_config,
)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
LLM_SCORE_MIN = 0.60   # below this, escalate to LLM
LLM_MARGIN    = 0.05   # if top1 - top2 < margin, escalate to LLM

def needs_llm(normalized_hit, top2_gap=None):
    if normalized_hit is None:
        return True
    if normalized_hit["score"] < LLM_SCORE_MIN:
        return True
    if top2_gap is not None and top2_gap < LLM_MARGIN:
        return True
    return False


In [None]:
def context_window(text, mention, win=120):
    i = text.lower().find(mention.lower())
    if i == -1:
        return ""
    s = max(0, i - win)
    e = min(len(text), i + len(mention) + win)
    return text[s:e].replace("\n", " ")

def pinecone_topk(mention_or_expanded: str, k: int = 5):
    vec = encoder.encode([mention_or_expanded], convert_to_numpy=True, normalize_embeddings=True)[0]
    res = index.query(vector=vec.tolist(), top_k=k, include_metadata=True, namespace=NAMESPACE)
    return [
        {
            "rank": j+1,
            "cui": m.id,
            "name": (m.metadata.get("name") or "").strip(),
            "score": float(m.score),
            "aliases": (m.metadata.get("aliases", []) or [])[:5]
        }
        for j, m in enumerate(getattr(res, "matches", []) or [])
    ]


In [None]:
import json

def build_prompt_json(note_snippet: str, mention: str, candidates: list[dict]) -> str:
    """
    Build a compact prompt instructing Gemini to return strict JSON:
    { "cui": "...", "term": "...", "rationale": "..." }
    """
    cand_lines = []
    for c in candidates:
        cand_lines.append({
            "rank": c["rank"],
            "cui": c["cui"],
            "term": c["name"],
            "score": round(c["score"], 3),
            "aliases": c["aliases"]
        })

    # Provide the RAG packet as JSON to avoid hallucinations
    packet = {
        "task": "umls_disambiguation",
        "instructions": [
            "Choose exactly ONE best UMLS concept for the mention.",
            "Use the clinical context and the provided candidates.",
            "Prefer clinically correct sense for this context.",
            "Respond ONLY with JSON: {\"cui\": \"...\", \"term\": \"...\", \"rationale\": \"...\"}."
        ],
        "context": {
            "note_snippet": note_snippet,
            "mention": mention,
            "candidates": cand_lines
        },
        "output_schema": {"cui":"string","term":"string","rationale":"string"}
    }
    # We’ll pass this JSON as plain text to Gemini (it will return JSON)
    return json.dumps(packet, ensure_ascii=False)

def gemini_disambiguate(note_snippet: str, mention: str, candidates: list[dict]) -> dict:
    """
    Calls Gemini with a structured RAG packet; expects strict JSON back.
    """
    prompt = build_prompt_json(note_snippet, mention, candidates)
    try:
        resp = gemini_model.generate_content(prompt)
        txt = resp.text.strip()
        # Gemini should return JSON (thanks to response_mime_type); parse it
        data = json.loads(txt)
        # basic sanity
        return {
            "cui": data.get("cui", "") or "",
            "term": data.get("term", "") or "",
            "rationale": data.get("rationale", "") or "",
            "raw": txt
        }
    except Exception as e:
        # Fallback: return empty with raw text for debugging
        return {"cui": "", "term": "", "rationale": f"Gemini error: {e}", "raw": ""}


In [None]:
def llm_disambiguate_with_gemini(note_text: str, label: str, mention: str, expanded: str, candidates: list[dict]) -> dict:
    snippet = context_window(note_text, mention, win=120)
    out = gemini_disambiguate(snippet, mention, candidates)
    return {
        "label": label,
        "mention": mention,
        "expanded": expanded,
        "cui": out.get("cui", ""),
        "name": out.get("term", ""),
        "rationale": out.get("rationale", ""),
        "source": "gemini",
        "raw": out.get("raw", "")
    }


In [None]:
def normalize_note_with_gemini(text: str, k: int = 7, alpha: float = 0.7, score_threshold: float = 0.60):
    """
    Uses your semantic+lexical ranker first.
    If low score or top-2 too close, escalates to Gemini RAG disambiguation.
    Returns a list of final picks with source = 'semantic' or 'gemini'.
    """
    # 1) raw NER → spans
    _, spans = predict_entities(text)
    pairs = filter_mentions(spans)

    # 2) dedupe mentions (case-insensitive)
    seen, uniq = set(), []
    for lab, txt in pairs:
        key = txt.lower().strip()
        if key not in seen:
            seen.add(key)
            uniq.append((lab, txt))

    reviewed = []
    for lab, mention in uniq:
        # (A) our standard normalization
        hit = normalize_mention_to_umls(mention, k=k, alpha=alpha, namespace=NAMESPACE)

        # compute top-2 gap for escalation rule
        expanded = expand_with_umls(mention)
        cands = pinecone_topk(expanded, k=max(3, k))
        top2_gap = None
        if len(cands) >= 2:
            top2_gap = cands[0]["score"] - cands[1]["score"]

        if needs_llm(hit, top2_gap):
            # (B) ask Gemini with RAG
            pick = llm_disambiguate_with_gemini(text, lab, mention, expanded, cands[:5])
            if pick["cui"] and pick["name"]:
                reviewed.append(pick)   # gemini decision
            else:
                # fallback to semantic if Gemini unusable
                if hit:
                    hit["label"] = lab
                    hit["source"] = "semantic"
                    reviewed.append(hit)
                else:
                    reviewed.append({
                        "label": lab, "mention": mention, "expanded": expanded,
                        "cui": "", "name": "",
                        "rationale": "no matches",
                        "source": "none"
                    })
        else:
            hit["label"] = lab
            hit["source"] = "semantic"
            reviewed.append(hit)

    return {
        "entities": spans,   # raw NER (for reference)
        "final": reviewed    # chosen per mention (semantic or gemini)
    }


In [None]:
test_note = """A 62-year-old male with a history of hypertension and diabetes presented
to the emergency department complaining of SOB and chest pain for 2 days.
ECG showed ST-segment elevations consistent with MI.
Labs revealed elevated glucose and BNP. Started on aspirin, clopidogrel, and IV furosemide.
Scheduled echo to assess LV function."""

out = normalize_note_with_gemini(test_note, k=7, alpha=0.7, score_threshold=0.60)
out


{'entities': [{'label': 'Age', 'text': '62', 'start': None, 'end': None},
  {'label': 'Age', 'text': '-', 'start': None, 'end': None},
  {'label': 'Date', 'text': 'year', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'old', 'start': None, 'end': None},
  {'label': 'Sex', 'text': 'male', 'start': None, 'end': None},
  {'label': 'Sign_symptom',
   'text': 'hypertension',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'diabetes', 'start': None, 'end': None},
  {'label': 'Clinical_event', 'text': 'presented', 'start': None, 'end': None},
  {'label': 'Sign_symptom', 'text': 'sob', 'start': None, 'end': None},
  {'label': 'Biological_structure',
   'text': 'chest',
   'start': None,
   'end': None},
  {'label': 'Sign_symptom', 'text': 'pain', 'start': None, 'end': None},
  {'label': 'Diagnostic_procedure', 'text': 'ecg', 'start': None, 'end': None},
  {'label': 'Biological_structure', 'text': 'st', 'start': None, 'end': None},
  {'label': 'Diagnostic_procedu

Comparison

In [None]:
import json
from collections import defaultdict

# Map model labels → display buckets used in your figure
DISPLAY_MAP = {
    "Age": "AGE",
    "Sex": "SEX",
    "Disease_disorder": "DISEASE_DISORDER",
    "Diagnostic_procedure": "DIAGNOSTIC_PROCEDURE",
    "Sign_symptom": "SIGN_SYMPTOM",
    "Lab_value": "LAB_VALUE",
    "Detailed_description": "DETAILED_DESCRIPTION",
    "Medication": "MEDICATION",
    # optional: add others if you use them
    "Biological_structure": "BIOLOGICAL_STRUCTURE",
    "Texture": "TEXTURE",
    "Date": "DATE",
}

# Which fields should be a SINGLE value vs. a LIST in the final panel
SINGLE_FIELDS = {"AGE", "SEX"}     # everything else becomes a list

def _bucket_for_label(label: str) -> str | None:
    return DISPLAY_MAP.get(label) or DISPLAY_MAP.get(label.title()) or None

def _uniq_push(dlist: list[str], val: str):
    v = val.strip()
    if v and v not in dlist:
        dlist.append(v)


In [None]:
import re

AGE_PATTERNS = [
    r'\b(\d{1,3})\s*-\s*year\s*-\s*old\b',          # 20-year-old
    r'\b(\d{1,3})\s*year(?:s)?\s*old\b',             # 20 years old
    r'\baged?\s*(\d{1,3})\b',                        # age 20 / aged 20
    r'\b(\d{1,3})\s*(?:yo|y/o|yrs?)\b',              # 20 yo / 20 y/o / 20 yrs
]

SEX_PATTERNS = [
    (r'\bmale\b',    'male'),
    (r'\bman\b',     'male'),
    (r'\bmale patient\b', 'male'),
    (r'\bfemale\b',  'female'),
    (r'\bwoman\b',   'female'),
    (r'\bfemale patient\b', 'female'),
    # very simple M/F near age; keep conservative to avoid false positives
    (r'\b(?<![A-Z])[mM]\b', 'male'),
    (r'\b(?<![A-Z])[fF]\b', 'female'),
]

def extract_age_sex(text: str):
    age = None
    for pat in AGE_PATTERNS:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            try:
                a = int(m.group(1))
                if 0 < a < 120:
                    age = str(a)
                    break
            except:
                pass

    sex = None
    # prefer a sex mention that occurs near the age phrase, fallback to global
    # (simple heuristic: search whole text in order)
    for pat, s in SEX_PATTERNS:
        if re.search(pat, text, flags=re.IGNORECASE):
            sex = s
            break

    return age, sex


In [None]:
from collections import defaultdict

def fix_age_sex_entities(text: str, entities: dict) -> dict:
    """
    1) remove spurious 'old' from SEX (and anywhere else)
    2) set AGE/SEX from robust regex extractor (overrides model)
    3) normalize values to simple strings: AGE ['20'], SEX ['male'/'female']
    """
    cleaned = defaultdict(list)

    for label, values in entities.items():
        for v in values:
            if isinstance(v, str) and v.strip().lower() == "old":
                continue
            cleaned[label].append(v)

    # overwrite AGE/SEX using regex from raw text
    age, sex = extract_age_sex(text)

    if age:
        cleaned["AGE"] = [age]
    else:

        nums = [v for v in cleaned.get("AGE", []) if str(v).isdigit()]
        cleaned["AGE"] = [nums[0]] if nums else []

    if sex:
        cleaned["SEX"] = [sex]
    else:

        xf = [v for v in cleaned.get("SEX", []) if v.strip().lower() in {"male","female"}]
        cleaned["SEX"] = xf[:1] if xf else []


    return {k: v for k, v in cleaned.items()}


In [None]:
def ner_panel(note_text: str) -> dict:
    """
    Returns: dict like
    {
      "AGE": ["60 - years old"],
      "SEX": ["Male"],
      "DISEASE_DISORDER": ["ckd", "lvh"],
      ...
    }
    using the *raw* NER strings (no UMLS normalization).
    """
    # Get all spans from your fine-tuned model
    entities, spans = predict_entities(note_text)         # spans: [{"label","text",...},...]

    # Create a dictionary of entities from the spans, grouped by label
    entities_dict = defaultdict(list)
    for sp in spans:
        lab = sp.get("label") or sp.get("Label") or ""
        txt = sp.get("text") or sp.get("Text") or ""
        entities_dict[lab].append(txt)

    cleaned_entities = fix_age_sex_entities(note_text, entities_dict)

    buckets = defaultdict(list)
    for lab, texts in cleaned_entities.items():
        bucket = _bucket_for_label(lab)
        if not bucket:
            continue
        for txt in texts:
            _uniq_push(buckets[bucket], txt)

    # Enforce single-value semantics for AGE/SEX
    final = {}
    for k, v in buckets.items():
        if k in SINGLE_FIELDS:
            # keep first textual value if any
            final[k] = v[:1] if v else []
        else:
            final[k] = v
    return final

In [None]:
from collections import defaultdict

# If you don't already have these:
SINGLE_FIELDS = {"AGE", "SEX"}

def _as_list(x):
    return x if isinstance(x, list) else [x]


In [None]:
def normalized_panel(note_text: str, k: int = 5, threshold: float = 0.55) -> dict:
    """
    Returns a dict like:
      {
        "AGE": "60",
        "SEX": "Male",
        "SIGN_SYMPTOM": ["Shortness of breath", "Chest pain"],
        "DIAGNOSTIC_PROCEDURE": ["Electrocardiogram", "BNP test"],
        ...
      }
    """
    # 1) Reuse the left-panel (raw string mentions) we already produce
    left = ner_panel(note_text)
    out  = defaultdict(list)

    # 2) For each mention, query UMLS (pinecone) and take the best hit (if above threshold)
    for label, mentions in left.items():
        for m in _as_list(mentions):
            q = str(m).strip()
            if not q:
                continue

            hits = []
            try:
                hits = search_umls(q, k=k)   # <-- your UMLS vector search
            except Exception:
                pass

            if hits:
                best = max(hits, key=lambda r: r.get("score", 0.0))
                if best.get("score", 0.0) >= threshold:
                    out[label].append(best.get("name", q))
                else:
                    # below threshold → keep the original text
                    out[label].append(q)
            else:
                # no hits → keep the original text
                out[label].append(q)

    # 3) Enforce single-valued fields (AGE/SEX)
    final = {}
    for lab, vals in out.items():
        final[lab] = vals[0] if lab in SINGLE_FIELDS and len(vals) > 0 else vals
    return final


In [None]:
def render_two_panels(note_text: str):
    left = ner_panel(note_text)
    right = normalized_panel(note_text)

    print("Clinical Note\n" + "-"*80)
    print(note_text.strip(), "\n")

    print("NER from fine-tuned BERT\n" + "-"*80)
    print(json.dumps(left, indent=2, ensure_ascii=False), "\n")

    print("Normalised terms from RAG Pipeline\n" + "-"*80)
    print(json.dumps(right, indent=2, ensure_ascii=False))


In [None]:
note = """A 60-year-old male with CAD and CKD was presented of SOB and orthopnea.
ECG revealed LVH. On exam, there was JVD, S3, and bibasilar rales. The patient was given furosemide IV and started on ACEI.
Labs showed ↑BNP and low EF. He is scheduled for TTE and MIBI scan to assess perfusion and function."""

render_two_panels(note)


Clinical Note
--------------------------------------------------------------------------------
A 60-year-old male with CAD and CKD was presented of SOB and orthopnea.
ECG revealed LVH. On exam, there was JVD, S3, and bibasilar rales. The patient was given furosemide IV and started on ACEI.
Labs showed ↑BNP and low EF. He is scheduled for TTE and MIBI scan to assess perfusion and function. 

NER from fine-tuned BERT
--------------------------------------------------------------------------------
{
  "AGE": [
    "60"
  ],
  "DATE": [
    "year"
  ],
  "SEX": [
    "-"
  ],
  "SIGN_SYMPTOM": [
    "cad",
    "ckd"
  ],
  "DIAGNOSTIC_PROCEDURE": [
    "ecg",
    "tte",
    "mibi"
  ],
  "BIOLOGICAL_STRUCTURE": [
    "bibasilar"
  ],
  "MEDICATION": [
    "furosemide"
  ],
  "DETAILED_DESCRIPTION": [
    "↑bnp"
  ]
} 

Normalised terms from RAG Pipeline
--------------------------------------------------------------------------------
{
  "AGE": "Sixty",
  "DATE": [
    "year"
  ],
  "SEX": 

In [None]:
from datasets import load_dataset

try:
    # If you already built a held-out set earlier, use it
    heldout = test_notes.select(range(min(100, len(test_notes))))
    print(f"Using existing test_notes; evaluating {len(heldout)} notes.")
except NameError:
    # Otherwise load and take 100 deterministic examples
    ds = load_dataset("AGBonnet/augmented-clinical-notes")
    pool = ds["train"].shuffle(seed=42)
    heldout = pool.select(range(100))
    print(f"No test_notes found; sampled {len(heldout)} notes from dataset.")


Using existing test_notes; evaluating 100 notes.


testing on 40 notes

In [None]:
import os, json, csv, time
from tqdm.auto import tqdm

os.makedirs("eval40", exist_ok=True)
JSONL_PATH = "eval40/panels_100.jsonl"
FLAT_CSV   = "eval40/normalized_100_flat.csv"

n_written = 0
t0 = time.time()

flat_rows = []
with open(JSONL_PATH, "w", encoding="utf-8") as jf:
    for r in tqdm(heldout, total=len(heldout)):
        note_id = r.get("idx", None)
        text    = r["note"]

        left  = ner_panel(text)            # raw NER panel
        right = normalized_panel(text)     # normalized (semantic+Gemini) panel

        # Also capture the underlying decisions (for stats)
        rag = normalize_note_with_gemini(text, k=7, alpha=0.7, score_threshold=0.60)
        final = rag["final"]

        # JSONL record (one per note)
        rec = {
            "note_id": note_id,
            "note": text,
            "ner_panel": left,
            "normalized_panel": right,
            "decisions": final,  # list of {label, mention, cui, name, score?, source, ...}
        }
        jf.write(json.dumps(rec, ensure_ascii=False) + "\n")
        n_written += 1

        # Flat CSV rows (one per normalized mention)
        for d in final:
            flat_rows.append({
                "note_id": note_id,
                "label": d.get("label", ""),
                "mention": d.get("mention", ""),
                "cui": d.get("cui", ""),
                "term": d.get("name", ""),
                "score": round(d.get("score", 0.0), 3) if isinstance(d.get("score", 0.0), (int, float)) else "",
                "source": d.get("source", ""),
            })

# write the flat CSV
if flat_rows:
    with open(FLAT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(flat_rows[0].keys()))
        writer.writeheader()
        writer.writerows(flat_rows)

print(f"Saved {n_written} notes to {JSONL_PATH} and {len(flat_rows)} rows to {FLAT_CSV} in {time.time()-t0:.1f}s")


  0%|          | 0/100 [00:00<?, ?it/s]



Saved 100 notes to eval40/panels_100.jsonl and 2711 rows to eval40/normalized_100_flat.csv in 3368.7s


In [None]:
import json

total_mentions = 0
norm_mentions  = 0
gemini_used    = 0
scores         = []

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        final = rec.get("decisions", [])
        total_mentions += len(final)
        for d in final:
            if d.get("cui"):   # normalized to some concept
                norm_mentions += 1
            if (d.get("source","").lower().startswith("gemini")):
                gemini_used += 1
            s = d.get("score", None)
            if isinstance(s, (int, float)):
                scores.append(float(s))

coverage = (norm_mentions / max(1, total_mentions)) * 100.0
avg_score = sum(scores) / len(scores) if scores else 0.0
gemini_rate = (gemini_used / max(1, total_mentions)) * 100.0

print(f"Notes: {n_written}")
print(f"Mentions (pipeline decisions): {total_mentions}")
print(f"Normalized to a CUI: {norm_mentions} ({coverage:.1f}%)")
print(f"Avg semantic score (when present): {avg_score:.3f}")
print(f"Gemini disambiguations: {gemini_used} ({gemini_rate:.1f}%)")


Notes: 100
Mentions (pipeline decisions): 2711
Normalized to a CUI: 2711 (100.0%)
Avg semantic score (when present): 0.940
Gemini disambiguations: 104 (3.8%)


In [None]:
def show_panels_for(n=3):
    for i in range(min(n, len(heldout))):
        r = heldout[i]
        print(f"\n================ Note {i} (idx={r.get('idx')}) ================\n")
        render_two_panels(r["note"])

show_panels_for(100)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    "Level II",
    "Normal",
    "second (number)",
    "Bone crest",
    "Crella",
    "Full thickness",
    "Round shape",
    "Tooth Demineralization",
    "Freezing",
    "Dried Specimen",
    "Three",
    "e antigen"
  ],
  "SIGN_SYMPTOM": [
    "Malocclusion",
    "Tooth structure"
  ],
  "DIAGNOSTIC_PROCEDURE": [
    "Periodontic procedure",
    "Endodontics",
    "Cortical destruction",
    "carbide",
    "Skeletal bone",
    "Allograft surgical material"
  ],
  "LAB_VALUE": [
    "48"
  ],
  "BIOLOGICAL_STRUCTURE": [
    "Mouth, Edentulous",
    "Mucilages",
    "Buccal",
    "Tongue",
    "Cerebral cortex",
    "Cancellous Bone",
    "Skeletal bone"
  ]
}


Clinical Note
--------------------------------------------------------------------------------
A 24-year-old man was brought to the emergency department (ED) of our hospital because of suddenly started abdominal pain, altered mental status, and agitation for

In [None]:
def show_panels_range(start=0, end=10):
    for i in range(start, min(end, len(heldout))):
        r = heldout[i]
        print(f"\n================ Note {i} (idx={r.get('idx')}) ================\n")
        render_two_panels(r["note"])

# Example: show notes 25–66 (inclusive of 25, exclusive of 67)
show_panels_range(25, 67)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
    "head",
    "hip",
    "dorso",
    "cranial",
    "hip joint"
  ],
  "DIAGNOSTIC_PROCEDURE": [
    "radiograph",
    "contrast",
    "computed tomography",
    "radiography"
  ],
  "DETAILED_DESCRIPTION": [
    "posterior",
    "displaced",
    "skeletal",
    "herbert",
    "spring",
    "slight"
  ]
} 

Normalised terms from RAG Pipeline
--------------------------------------------------------------------------------
{
  "AGE": "Sixty",
  "DATE": [
    "year",
    "area 15 of Brodmann"
  ],
  "SEX": "Males",
  "SIGN_SYMPTOM": [
    "Pain",
    "Swelling",
    "Unable",
    "Wild bear",
    "Weight",
    "Accidents",
    "Injury wounds",
    "adduct",
    "Movement",
    "Pain",
    "Dislocations",
    "Fracture",
    "Disrupted wound",
    "Lesion",
    "Rotated"
  ],
  "BIOLOGICAL_STRUCTURE": [
    "Right hip region structure",
    "Right lower extremity",
    "Motor Vehicles",
    "Right hip region structure",
  

In [None]:
def show_panels_range(start=0, end=10):
    for i in range(start, min(end, len(heldout))):
        r = heldout[i]
        print(f"\n================ Note {i} (idx={r.get('idx')}) ================\n")
        render_two_panels(r["note"])

# Example: show notes 25–66 (inclusive of 25, exclusive of 67)
show_panels_range(24, 31)




Clinical Note
--------------------------------------------------------------------------------
A 16-year-old boy presented with anterior open bite and infra-occlusion of the maxillary left incisor. His anterior teeth had been injured in a fall when he was 8 years old. He had no dental treatment before attending the orthodontic department. According to the patient, his open bite had developed gradually. His facial profile was straight with a slightly retruded mental region. Facial analysis showed symmetry and a good balance between the facial thirds. The patient did not like to smile as he was ashamed of his teeth (Figure ). He also had a compensatory tongue thrust habit caused by the anterior open bite.An intraoral examination (Figure ) showed that the patient had a severe anterior open bite extending from the left maxillary canine to the right lateral incisor. The molar relationship was Class I, and there was a small space between the maxillary right lateral incisor and the canine. 

In [None]:
def show_panels_range(start=0, end=10):
    for i in range(start, min(end, len(heldout))):
        r = heldout[i]
        print(f"\n================ Note {i} (idx={r.get('idx')}) ================\n")
        render_two_panels(r["note"])

# Example: show notes 25–66 (inclusive of 25, exclusive of 67)
show_panels_range(1, 25)




Clinical Note
--------------------------------------------------------------------------------
A 21-year-old male was involved in a high-speed motor vehicle collision and sustained multiple injuries, including a right closed subtrochanteric femur fracture, bilateral pulmonary contusions, as well as a splenic injury requiring an exploratory laparotomy with splenectomy upon arrival to the hospital. Due to hemodynamic instability, a distal femoral traction pin with 25 pounds of weight was placed in his right femur on the date of admission as a temporary stabilization of his fracture. This smooth traction pin was placed without difficulty using sterile technique and was placed from medial to lateral at the level of the adductor tubercle. The pin sites were then covered with a sterile dressing. Radiographs of his fracture are shown in .
The patient's overall condition worsened over the ensuing three days, with development of high fevers, increasing leukocytosis, and continued cardiopulmon

In [None]:
import json, os, re
from pathlib import Path

jsonl_path = "/content/ner_outputs/agbonnet_ner_test100.jsonl"  # adjust if needed

rows = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        try:
            rec = json.loads(line)
        except Exception as e:
            print(f"Skipping bad line {i}: {e}")
            continue

        note_id = rec.get("note_id", i)
        note_txt = rec.get("note", "")
        spans = rec.get("spans", [])

        # If you saved normalized objects already, they might be in rec["normalized"]
        norm_objs = rec.get("normalized", None)

        if norm_objs and isinstance(norm_objs, list):
            # expect each normalized item to correspond to a span
            for s, n in zip(spans, norm_objs):
                rows.append({
                    "note_id": note_id,
                    "mention": s.get("text") or s.get("span_text") or "",
                    "label": s.get("label") or s.get("Label") or "",
                    "normalized_name": n.get("name") or n.get("term") or "",
                    "cui": n.get("cui") or "",
                })
        else:
            # no normalization present -> just capture the spans; normalized_name left blank
            for s in spans:
                rows.append({
                    "note_id": note_id,
                    "mention": s.get("text") or s.get("span_text") or "",
                    "label": s.get("label") or s.get("Label") or "",
                    "normalized_name": "",  # not available in the file
                    "cui": "",
                })

len(rows)


0

In [None]:
import json
import pandas as pd
import re

jsonl_path = "/content/ner_outputs/agbonnet_ner_test100.jsonl"

rows = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        note_id = rec.get("idx", None)
        entities = rec.get("entities", [])
        for ent in entities:
            rows.append({
                "note_id": note_id,
                "mention": ent.get("text", "").strip(),
                "label": ent.get("label", "")
            })

df = pd.DataFrame(rows)
print(df.head(20))
print("Total rows:", len(df))


   note_id               mention                   label
0   175078                    15                     Age
1   175078                     -                     Age
2   175078                  year                    Date
3   175078                     -                     Age
4   175078                   old                     Sex
5   175078                   boy                     Sex
6   175078              admitted          Clinical_event
7   175078  emergency department  Nonbiological_location
8   175078                 blunt    Detailed_description
9   175078             abdominal    Biological_structure
10  175078                trauma            Sign_symptom
11  175078                   hit            Sign_symptom
12  175078                   car            Sign_symptom
13  175078             intubated   Therapeutic_procedure
14  175078      haemodynamically            Sign_symptom
15  175078        blood pressure    Diagnostic_procedure
16  175078      laboratory test

In [None]:
def is_acronym(s: str) -> bool:
    if not s or len(s) < 2:
        return False
    if len(s) > 15:
        return False
    if re.fullmatch(r"^[A-Z0-9][A-Z0-9\-/\.]*$", s):
        return True
    wl = {"bp","hr","po2","ph","x-ray"}
    return s.lower() in wl

acronyms_df = df[df["mention"].map(is_acronym)].copy().reset_index(drop=True)
print(acronyms_df.head(20))
print("Found acronyms:", len(acronyms_df))


   note_id mention                 label
0   175078      15                   Age
1   175078      14             Lab_value
2    39915      21                   Age
3    39915      25              Duration
4    82868      40                   Age
5    82868      60             Lab_value
6    40731      13                   Age
7   150221      29                   Age
8   150221      39                  Date
9   150221      15                  Date
10  142808      31                   Age
11  142808      20              Duration
12  174618      29                   Age
13  142606      46             Lab_value
14  180817      77                   Age
15  103706      66                   Age
16  103706      30              Duration
17   42714      69                   Age
18   42714      bp  Diagnostic_procedure
19   42714     140             Lab_value
Found acronyms: 228


In [None]:
out_csv = "/content/agbonnet_acronyms.csv"
acronyms_df["gold_term"] = ""   # empty column for you to fill in manually
acronyms_df.to_csv(out_csv, index=False)

from google.colab import files
files.download(out_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import json, re, pandas as pd

jsonl_path = "/content/ner_outputs/agbonnet_ner_test100.jsonl"  # <-- your file

rows = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        nid = rec.get("idx")
        for ent in rec.get("entities", []):
            rows.append({
                "note_id": nid,
                "mention": (ent.get("text") or "").strip(),
                "label": ent.get("label") or ""
            })

df = pd.DataFrame(rows)
print("All mentions:", len(df))
df.head()


All mentions: 7162


Unnamed: 0,note_id,mention,label
0,175078,15,Age
1,175078,-,Age
2,175078,year,Date
3,175078,-,Age
4,175078,old,Sex


In [None]:
WHITELIST = {
    "bp","hr","htn","dm","chf","cad","copd","ckd","hcg","tace","ert","rpr",
    "ecg","ekg","ct","mri","pet","dsa","avm","csf","us","t1w","t2w"
}

def looks_like_acronym(s: str) -> bool:
    if not s: return False
    s = s.strip()
    if len(s) < 2 or len(s) > 15: return False
    if s.isdigit(): return False
    if " " in s: return False
    if re.search(r"[A-Za-z]", s) is None:  # must contain a letter
        return False

    letters = "".join(ch for ch in s if ch.isalpha())
    upper_ratio = sum(ch.isupper() for ch in letters) / len(letters) if letters else 0.0
    # Accept mostly-uppercase short tokens OR whitelist common lowercase clinical acronyms
    return (upper_ratio >= 0.6) or (s.lower() in WHITELIST)

acronyms_df = df[df["mention"].map(looks_like_acronym)].copy()
# Optional: drop labels that are never acronyms if you wish
# acronyms_df = acronyms_df[~acronyms_df["label"].isin(["Age","Date","Duration","Lab_value"])]

# Deduplicate exact (note_id, mention, label) rows
acronyms_df.drop_duplicates(["note_id","mention","label"], inplace=True)

print("Acronym rows:", len(acronyms_df))
acronyms_df.head(20)


Acronym rows: 84


Unnamed: 0,note_id,mention,label
17,175078,ct,Diagnostic_procedure
457,142606,mri,Diagnostic_procedure
590,180817,ct,Diagnostic_procedure
623,103706,pet,Diagnostic_procedure
634,103706,ct,Diagnostic_procedure
693,42714,bp,Diagnostic_procedure
722,42714,ct,Diagnostic_procedure
833,186791,ct,Diagnostic_procedure
921,13709,ct,Diagnostic_procedure
952,173721,ct,Diagnostic_procedure


In [None]:
# === Make an AGBonnet acronyms sheet from in-memory notes ===
import os, re, pandas as pd

# 1) POINT THIS to your list of note texts
#    (replace 'notes' if your variable is named differently)
NOTE_LIST = notes  # e.g., a list[str] of the 40 AGBonnet notes

# 2) Where to save
OUT_DIR  = "/content/drive/MyDrive/AGBONNET_Acronym"
CSV_OUT  = os.path.join(OUT_DIR, "agbonnet_acronyms.csv")
XLSX_OUT = os.path.join(OUT_DIR, "agbonnet_acronyms.xlsx")
os.makedirs(OUT_DIR, exist_ok=True)

# 3) Acronym detector
ACRONYM_RE = re.compile(r"(?:[A-Z]{2,6}(?:\d{0,3})?)$")
STOP = {"I","II","III","IV","V","S","T","A","O","X"}   # tweak if needed
def is_acronym(t: str) -> bool:
    if not isinstance(t, str): return False
    t = t.strip()
    if t in STOP: return False
    if ACRONYM_RE.fullmatch(t) and sum(c.isupper() for c in t) >= 2:
        return True
    return False

def best_norm(norm_item):
    """Return (name, cui) from whatever normalize_span returns (dict / list / str)."""
    if not norm_item: return "", ""
    if isinstance(norm_item, dict):
        name = norm_item.get("name") or norm_item.get("expanded") or norm_item.get("term") or ""
        return name, norm_item.get("cui","")
    if isinstance(norm_item, list) and norm_item:
        first = norm_item[0]
        if isinstance(first, dict):
            name = first.get("name") or first.get("expanded") or first.get("term") or ""
            return name, first.get("cui","")
        if isinstance(first, str):
            return first, ""
    if isinstance(norm_item, str):
        return norm_item, ""
    return "", ""

rows = []
for i, note in enumerate(NOTE_LIST):
    # prefer your chunked predictor if long notes
    try:
        spans = predict_spans_chunked(note)
    except NameError:
        spans = predict_spans(note)

    for sp in spans:
        mention = sp.get("mention") or sp.get("text") or ""
        if not is_acronym(mention):
            continue
        # normalize the single span (you already have normalize_span in your pipeline)
        norm_item = normalize_span(sp, k=5)  # returns list/dict/string depending on your setup
        norm_name, norm_cui = best_norm(norm_item)
        rows.append({
            "note_id": i,
            "mention": mention,
            "normalized": norm_name,
            "cui": norm_cui,
            "label": sp.get("label","")
        })

df = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
df.to_csv(CSV_OUT, index=False)
df.to_excel(XLSX_OUT, index=False)
print(f"Saved CSV  -> {CSV_OUT}")
print(f"Saved XLSX -> {XLSX_OUT}")
print(df.head(10).to_string(index=False))


UMLS

In [None]:
!pip -q install datasets pandas

from datasets import load_dataset
import pandas as pd
pd.set_option("display.max_colwidth", 200)


In [None]:
ds = load_dataset("adlbh/umls-concepts")
ds


DatasetDict({
    train: Dataset({
        features: ['ENTITY', 'DEFINITION', 'ALIASES', 'NAME'],
        num_rows: 474872
    })
})

In [None]:
print("Columns:", ds["train"].column_names)
print("\nOne row:\n", ds["train"][0])
print("\nTotal rows:", len(ds["train"]))


Columns: ['ENTITY', 'DEFINITION', 'ALIASES', 'NAME']

One row:
 {'ENTITY': 'C0003725', 'DEFINITION': 'Arthropod-borne viruses. A non-taxonomic designation for viruses that can replicate in both vertebrate hosts and arthropod vectors. Included are some members of the following families: ARENAVIRIDAE; BUNYAVIRIDAE; REOVIRIDAE; TOGAVIRIDAE; and FLAVIVIRIDAE. (From Dictionary of Microbiology and Molecular Biology, 2nd ed)', 'ALIASES': 'Arbovirus (navigational concept)|arbovirus|Arboviruses|Arthropod-borne Virus|Viruses, Arthropod-Borne|ARBOVIRUS|arboviruses|Arbovirus|Arthropod Borne Viruses|Virus, Arthropod-Borne|Arbovirus, NOS|Arthropod-Borne Viruses|Arthropod-Borne Virus', 'NAME': 'Arboviruses'}

Total rows: 474872


In [None]:
df = ds["train"].to_pandas()
df.head(40)


Unnamed: 0,ENTITY,DEFINITION,ALIASES,NAME
0,C0003725,Arthropod-borne viruses. A non-taxonomic designation for viruses that can replicate in both vertebrate hosts and arthropod vectors. Included are some members of the following families: ARENAVIRIDA...,"Arbovirus (navigational concept)|arbovirus|Arboviruses|Arthropod-borne Virus|Viruses, Arthropod-Borne|ARBOVIRUS|arboviruses|Arbovirus|Arthropod Borne Viruses|Virus, Arthropod-Borne|Arbovirus, NOS|...",Arboviruses
1,C0039258,,Tahyna virus (organism)|Tahyna virus,Tahyna virus
2,C0318627,,Eyach virus|Eyach virus (organism),Eyach virus
3,C0012634,"A definite pathologic process with a characteristic set of signs and symptoms. It may affect the whole body or any of its parts, and its etiology, pathology, and prognosis may be known or unknown.","Disease|Clinical disease or syndrome|Clinical disease or syndrome present, NOS|disorder|Disorders|Disease or syndrome present|disease|Diseases and Disorders|Clinical disease AND/OR syndrome presen...",Disease
4,C0042776,"Minute infectious agents whose genomes are composed of DNA or RNA, but not both. They are characterized by a lack of independent metabolism and the inability to replicate outside living host cells.","Virus|Virus, NOS|Viruses, General|Virus (organism)|Vira|Viridae|VIRUSES|viruses|Viruses|viridae|virus|VIRUS",Virus
5,C0999630,,Lepus capensis|Lepus capensis (organism)|Cape hare|Brown hare,Lepus capensis (organism)
6,C0242210,General term for proteins that have binding as a major function.,"Ligand Binding Protein|Binding Protein|binding proteins|binding protein|Proteins, Binding|Protein, Binding|Binding protein (substance)|Binding protein",Binding Proteins
7,C0053075,,,beidellite
8,C0053075,,,beidellite
9,C0242210,General term for proteins that have binding as a major function.,"Ligand Binding Protein|Binding Protein|binding proteins|binding protein|Proteins, Binding|Protein, Binding|Binding protein (substance)|Binding protein",Binding Proteins


In [None]:
# Simple text search in NAME / ALIASES / DEFINITION
def find_by_text(q, limit=25, case=False):
    f = df
    if not case:
        q = q.lower()
        mask = (
            f["NAME"].str.lower().str.contains(q, na=False) |
            f["ALIASES"].astype(str).str.lower().str.contains(q, na=False) |
            f["DEFINITION"].str.lower().str.contains(q, na=False)
        )
    else:
        mask = (
            f["NAME"].str.contains(q, na=False) |
            f["ALIASES"].astype(str).str.contains(q, na=False) |
            f["DEFINITION"].str.contains(q, na=False)
        )
    out = f.loc[mask, ["ENTITY", "NAME", "ALIASES", "DEFINITION"]].head(limit)
    return out

def find_by_cui(cui):
    return df.loc[df["ENTITY"] == cui, ["ENTITY", "NAME", "ALIASES", "DEFINITION"]]


In [None]:
find_by_text("ct", limit=20)


Unnamed: 0,ENTITY,NAME,ALIASES,DEFINITION
0,C0003725,Arboviruses,"Arbovirus (navigational concept)|arbovirus|Arboviruses|Arthropod-borne Virus|Viruses, Arthropod-Borne|ARBOVIRUS|arboviruses|Arbovirus|Arthropod Borne Viruses|Virus, Arthropod-Borne|Arbovirus, NOS|...",Arthropod-borne viruses. A non-taxonomic designation for viruses that can replicate in both vertebrate hosts and arthropod vectors. Included are some members of the following families: ARENAVIRIDA...
3,C0012634,Disease,"Disease|Clinical disease or syndrome|Clinical disease or syndrome present, NOS|disorder|Disorders|Disease or syndrome present|disease|Diseases and Disorders|Clinical disease AND/OR syndrome presen...","A definite pathologic process with a characteristic set of signs and symptoms. It may affect the whole body or any of its parts, and its etiology, pathology, and prognosis may be known or unknown."
4,C0042776,Virus,"Virus|Virus, NOS|Viruses, General|Virus (organism)|Vira|Viridae|VIRUSES|viruses|Viruses|viridae|virus|VIRUS","Minute infectious agents whose genomes are composed of DNA or RNA, but not both. They are characterized by a lack of independent metabolism and the inability to replicate outside living host cells."
6,C0242210,Binding Proteins,"Ligand Binding Protein|Binding Protein|binding proteins|binding protein|Proteins, Binding|Protein, Binding|Binding protein (substance)|Binding protein",General term for proteins that have binding as a major function.
9,C0242210,Binding Proteins,"Ligand Binding Protein|Binding Protein|binding proteins|binding protein|Proteins, Binding|Protein, Binding|Binding protein (substance)|Binding protein",General term for proteins that have binding as a major function.
10,C0004611,Bacteria,"Unknown eubacteria|Bacterium sp.|unidentified bacterium|bacteria|eubacteria|Eubacteria|Bacteria|unidentified bacteria|unclassified bacterium|Superkingdom Bacteria|Bacteria bacterium|Bacterium, NOS...","One of the three domains of life (the others being Eukarya and ARCHAEA), also called Eubacteria. They are unicellular prokaryotic microorganisms which generally possess rigid cell walls, multiply ..."
11,C0031516,Pheromone,pheromones|Recognition odour|Recognition odor|Scent|scent|Pheromone|pheromone|scented|scents|Pheromones|Recognition odor (finding),"Chemical substances, excreted by an organism into the environment, that elicit behavioral or physiological responses from other organisms of the same species. Perception of these chemical signals ..."
12,C1167395,Host (organism),hosts|host organism|host|Host,"Any organism in which another organism, especially a parasite or symbiont, spends part or all of its life cycle and from which it obtains nourishment and/or protection. [ISBN:0198506732]"
15,C0059374,enrofloxacin,"Enrofloxacin|Enrofloxacin-containing product|Product containing enrofloxacin (medicinal product)|enrofloxacin|ENROFLOXACIN|3-Quinolinecarboxylic acid, 1-cyclopropyl-7-(4-ethyl-1-piperazinyl)-6-flu...",A fluoroquinolone antibacterial and antimycoplasma agent that is used in veterinary practice.
16,C0180153,Covers (device),coverings|Cover Device|cover [device]|cover|covering|covers|Cover,"An object designed to conceal, enclose, cap, or protect something."


In [None]:
import pandas as pd

# Load CSV
df = pd.read_csv("agonnet_gold.csv")

# Inspect column names
print(df.columns.tolist())


['note_id', 'Acronyms', 'Dirag_Inference ', 'gold_name']


In [None]:
import pandas as pd

# Load CSV
df = pd.read_csv("agonnet_gold.csv")

# Strip spaces from column names
df.columns = df.columns.str.strip()
print(df.columns.tolist())   # confirm cleaned names

# Now you can safely access
pred_col = "Dirag_Inference"
gold_col = "gold_name"

# Drop NA rows
df = df.dropna(subset=[pred_col, gold_col])

# Compute metrics
correct = (df[pred_col] == df[gold_col]).sum()
total = len(df)

accuracy = correct / total

tp = correct
fp = ((df[pred_col] != df[gold_col]) & (df[pred_col].notna())).sum()
fn = ((df[pred_col] != df[gold_col]) & (df[gold_col].notna())).sum()

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1: {f1:.3f}")


['note_id', 'Acronyms', 'Dirag_Inference', 'gold_name']
Accuracy: 0.636
Precision: 0.636
Recall: 0.636
F1: 0.636


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load CSV
df = pd.read_csv("agonnet_gold.csv")

# Strip whitespace from column names
df.columns = df.columns.str.strip()

print("Columns:", df.columns.tolist())  # check cleaned names

# True (gold) vs predicted
y_true = df["gold_name"].astype(str).tolist()
y_pred = df["Dirag_Inference"].astype(str).tolist()

# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Multi-class metrics
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")

print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1:        {f1:.3f}")


Columns: ['note_id', 'Acronyms', 'Dirag_Inference', 'gold_name']
Accuracy:  0.636
Precision: 0.461
Recall:    0.475
F1:        0.464


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load CSV
df = pd.read_csv("maccrobat_gold.csv")

# Strip whitespace from column names
df.columns = df.columns.str.strip()

print("Columns:", df.columns.tolist())  # check cleaned names

# True (gold) vs predicted
y_true = df["gold_term"].astype(str).tolist()
y_pred = df["normalized"].astype(str).tolist()

# Accuracy
accuracy = accuracy_score(y_true, y_pred)

# Multi-class metrics
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")

print(f"Accuracy:  {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall:    {recall:.3f}")
print(f"F1:        {f1:.3f}")


Columns: ['note_id', 'mention', 'normalized', 'gold_term']
Accuracy:  0.707
Precision: 0.486
Recall:    0.475
F1:        0.479


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
