In [1]:
import json
import spacy


# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
print(" NER model loadedï¼šen_core_web_sm")


# --------------------------
# 2. read json
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"

with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]  # list of [text, ann]


# --------------------------
# 3. NER
# --------------------------
def run_spacy_inference():
    results = []

    for item in annotations:
        text = item[0]

        doc = nlp(text)
        ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

        results.append({
            "text": text,
            "spacy_entities": ents
        })

    return results


# --------------------------
# 4. excute and print
# --------------------------
output = run_spacy_inference()

print("\n===== showcaseï¼ˆtop 5ï¼‰===== \n")

for i, item in enumerate(output[:5]):
    print(f"Sentence {i+1}:")
    print("Text:", item["text"])
    print("spaCy NER:", item["spacy_entities"])
    print("-" * 80)


import json
with open("spacy_standard_predictions.json", "w") as f:
    json.dump(output, f, indent=2)

print("\n inference results saved toï¼šspacy_standard_predictions.json")


 NER model loadedï¼šen_core_web_sm

===== showcaseï¼ˆtop 5ï¼‰===== 

Sentence 1:
Text: The lungs are otherwise clear with no evidence of focal opacities concerning for infectious process.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 2:
Text: Right PICC line again extends to the cavoatrial junction.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 3:
Text: Streaky opacity at the left lung base thought likely atelectatic in etiology.
spaCy NER: [('Streaky', 0, 7, 'PERSON')]
--------------------------------------------------------------------------------
Sentence 4:
Text: No effusion or pneumothorax.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 5:
Text: There is mild obscuration of the right cardiac border, however, no definite densities are appreciated on the lateral view.
spaCy NER: []
-------------------------------

In [2]:
import json
import random
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
from sklearn.model_selection import train_test_split
from pathlib import Path

# --------------------------
# 1. load json
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"

with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]  # list of [text, {"entities":[...] }]
print("total samples:", len(annotations))

# --------------------------
# 2. dataset splitï¼š70% training / 30% evaluation
# --------------------------
train_data, eval_data = train_test_split(
    annotations, test_size=0.30, random_state=42
)

print(f"training set {len(train_data)} ï¼Œevaluation set {len(eval_data)}")

# --------------------------
# 3. create spaCy NER label system
# --------------------------
nlp = spacy.blank("en")   # null model
ner = nlp.add_pipe("ner")

# æ”¶é›†æ‰€æœ‰æ ‡ç­¾
all_labels = set()
for text, ann in annotations:
    for start, end, label in ann["entities"]:
        all_labels.add(label)

for label in all_labels:
    ner.add_label(label)

print("load entity labels:", all_labels)


# --------------------------
# 4. convert data to spaCy DocBin
# --------------------------
def convert_to_spacy(data, output_path):
    db = DocBin()
    for text, ann in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in ann["entities"]:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
    print("saved to:", output_path)

convert_to_spacy(train_data, "train.spacy")
convert_to_spacy(eval_data, "eval.spacy")


# --------------------------
# 5. Train Ner model
# --------------------------
optimizer = nlp.initialize()

epochs = 20
for epoch in range(epochs):
    random.shuffle(train_data)

    losses = {}
    for text, ann in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {"entities": ann["entities"]})
        nlp.update([example], sgd=optimizer, losses=losses)

    print(f"Epoch {epoch+1}/{epochs}  Losses={losses}")

nlp.to_disk("ner_model")
print("\n saved to ner_model/")


# --------------------------
# 6. evaluation
# --------------------------
from collections import Counter

def evaluate(nlp, eval_data):
    tp = Counter()
    fp = Counter()
    fn = Counter()

    for text, ann in eval_data:
        doc = nlp(text)

        gold_ents = {(start, end, label) for start, end, label in ann["entities"]}
        pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

        for ent in pred_ents:
            if ent in gold_ents:
                tp[ent[2]] += 1
            else:
                fp[ent[2]] += 1

        for ent in gold_ents:
            if ent not in pred_ents:
                fn[ent[2]] += 1

    print("\n===== Evaluation Results =====")
    for label in all_labels:
        p = tp[label] / (tp[label] + fp[label] + 1e-8)
        r = tp[label] / (tp[label] + fn[label] + 1e-8)
        f1 = 2 * p * r / (p + r + 1e-8)
        print(f"{label:12s}  P={p:.4f}  R={r:.4f}  F1={f1:.4f}")


evaluate(nlp, eval_data)


total samples: 100
training set 70 ï¼Œevaluation set 30
load entity labels: {'DESCRIPTION', 'FINDING', 'OTHER', 'ANATOMY', 'LOCATION'}
saved to: train.spacy
saved to: eval.spacy




Epoch 1/20  Losses={'ner': 570.0405077276259}
Epoch 2/20  Losses={'ner': 435.57427351048045}
Epoch 3/20  Losses={'ner': 355.1196583452005}
Epoch 4/20  Losses={'ner': 291.6339738306977}
Epoch 5/20  Losses={'ner': 238.81171503647158}
Epoch 6/20  Losses={'ner': 193.93953842442215}
Epoch 7/20  Losses={'ner': 157.47429326391105}
Epoch 8/20  Losses={'ner': 122.23638750723488}
Epoch 9/20  Losses={'ner': 105.82948345383785}
Epoch 10/20  Losses={'ner': 109.20330382802553}
Epoch 11/20  Losses={'ner': 88.21925424025275}
Epoch 12/20  Losses={'ner': 96.38008879421672}
Epoch 13/20  Losses={'ner': 85.92706416810691}
Epoch 14/20  Losses={'ner': 68.22198183644316}
Epoch 15/20  Losses={'ner': 70.72751916004425}
Epoch 16/20  Losses={'ner': 56.650038067260866}
Epoch 17/20  Losses={'ner': 63.05612962746928}
Epoch 18/20  Losses={'ner': 49.03098449966949}
Epoch 19/20  Losses={'ner': 54.0543704653914}
Epoch 20/20  Losses={'ner': 36.38526339944105}

 saved to ner_model/

===== Evaluation Results =====
DESCRIPT

In [4]:
import json
import random
import spacy
from pathlib import Path
from termcolor import colored

# --------------------------
# 1. load model
# --------------------------
nlp = spacy.load("ner_model")
print("model loadedï¼šner_model/")

# --------------------------
# 2. read in
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"
with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]

# 70/30 
random.seed(42)
random.shuffle(annotations)
split = int(len(annotations) * 0.7)
eval_data = annotations[split:]

print("Evaluation sample number:", len(eval_data))


# --------------------------
# 3.tool box
# --------------------------
def format_span(text, start, end, label, color):
    return (
        text[:start]
        + colored(text[start:end], color)
        + text[end:]
        + f" <{label}>"
    )

def showcase_one(text, gold_ents, pred_ents):
    print("\n========================================")
    print("TEXT:")
    print(text)
    print("----------------------------------------")

    print("GOLD:")
    for (s, e, label) in gold_ents:
        print(f"  {text[s:e]} [{label}] (pos {s}-{e})")

    print("\nPRED:")
    for (s, e, label) in pred_ents:
        print(f"  {text[s:e]} [{label}] (pos {s}-{e})")

    # TP, FP, FN
    gold_set = set(gold_ents)
    pred_set = set(pred_ents)

    tp = gold_set & pred_set
    fp = pred_set - gold_set
    fn = gold_set - pred_set

    print("\nMATCH SUMMARY:")
    print("ðŸŸ© TP =", len(tp), "ðŸŸ¦ FP =", len(fp), "ðŸŸ¥ FN =", len(fn))


    def highlight_all():
        chunks = []
        for s, e, label in sorted(tp, key=lambda x: x[0]):
            print(colored(f"TP: {text[s:e]} [{label}]", "green"))

        for s, e, label in sorted(fp, key=lambda x: x[0]):
            print(colored(f"FP: {text[s:e]} [{label}]", "blue"))

        for s, e, label in sorted(fn, key=lambda x: x[0]):
            print(colored(f"FN: {text[s:e]} [{label}]", "red"))

    highlight_all()


# --------------------------
# 4. Showcase 
# --------------------------
N = 5  
samples = random.sample(eval_data, N)

for text, ann in samples:
    gold = [(s, e, label) for s, e, label in ann["entities"]]
    doc = nlp(text)
    pred = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    showcase_one(text, gold, pred)


model loadedï¼šner_model/
Evaluation sample number: 30

TEXT:
In comparison with the study of ___, there again are low lung volumes that accentuate the prominence of the transverse diameter of the heart.
----------------------------------------
GOLD:
  In [OTHER] (pos 0-2)
  there [OTHER] (pos 37-42)
  again [OTHER] (pos 43-48)
  lung [ANATOMY] (pos 57-61)
  heart [ANATOMY] (pos 135-140)
  comparison [OTHER] (pos 3-13)
  with [OTHER] (pos 14-18)
  the [OTHER] (pos 19-22)
  accentuate [DESCRIPTION] (pos 75-85)
  prominence [DESCRIPTION] (pos 90-100)
  low  [FINDING] (pos 53-57)
  volumes [FINDING] (pos 62-69)
  study [OTHER] (pos 23-28)
   of [OTHER] (pos 28-31)
  ___ [OTHER] (pos 32-35)
  are [OTHER] (pos 49-52)
  that [OTHER] (pos 70-74)
  the [OTHER] (pos 86-89)
  of [OTHER] (pos 101-103)
  of [OTHER] (pos 128-130)
  the [OTHER] (pos 131-134)
  the  [OTHER] (pos 104-108)
  transverse diameter [LOCATION] (pos 108-127)

PRED:
  In comparison [OTHER] (pos 0-13)
  with [OTHER] (pos 14-18