In [None]:
import json
import spacy


# python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
print(" NER model loadedÔºöen_core_web_sm")


# --------------------------
# 2. read json
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"

with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]  # list of [text, ann]


# --------------------------
# 3. NER
# --------------------------
def run_spacy_inference():
    results = []

    for item in annotations:
        text = item[0]

        doc = nlp(text)
        ents = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

        results.append({
            "text": text,
            "spacy_entities": ents
        })

    return results


# --------------------------
# 4. excute and print
# --------------------------
output = run_spacy_inference()

print("\n===== showcaseÔºàtop 5Ôºâ===== \n")

for i, item in enumerate(output[:5]):
    print(f"Sentence {i+1}:")
    print("Text:", item["text"])
    print("spaCy NER:", item["spacy_entities"])
    print("-" * 80)


# --------------------------
# 5. ‰øùÂ≠òÂÆåÊï¥ËæìÂá∫Âà∞Êñá‰ª∂
# --------------------------
import json
with open("spacy_standard_predictions.json", "w") as f:
    json.dump(output, f, indent=2)

print("\nÂÖ®ÈÉ®Êé®ÁêÜÁªìÊûúÂ∑≤‰øùÂ≠òÂà∞Ôºöspacy_standard_predictions.json")


Â∑≤Âä†ËΩΩ spaCy ÈªòËÆ§ NER Ê®°ÂûãÔºöen_core_web_sm

===== Á§∫‰æãËæìÂá∫ÔºàÂâç 5 Êù°Ôºâ===== 

Sentence 1:
Text: The lungs are otherwise clear with no evidence of focal opacities concerning for infectious process.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 2:
Text: Right PICC line again extends to the cavoatrial junction.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 3:
Text: Streaky opacity at the left lung base thought likely atelectatic in etiology.
spaCy NER: [('Streaky', 0, 7, 'PERSON')]
--------------------------------------------------------------------------------
Sentence 4:
Text: No effusion or pneumothorax.
spaCy NER: []
--------------------------------------------------------------------------------
Sentence 5:
Text: There is mild obscuration of the right cardiac border, however, no definite densities are appreciated on the lateral view.
spaCy NER: []
-------

In [3]:
import json
import random
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
from sklearn.model_selection import train_test_split
from pathlib import Path

# --------------------------
# 1. Âä†ËΩΩ JSON Êñá‰ª∂
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"

with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]  # list of [text, {"entities":[...] }]
print("ÊÄªÊ†∑Êú¨Êï∞Èáè:", len(annotations))

# --------------------------
# 2. Êï∞ÊçÆÈõÜÂàíÂàÜÔºö70% ËÆ≠ÁªÉ / 30% ËØÑ‰º∞
# --------------------------
train_data, eval_data = train_test_split(
    annotations, test_size=0.30, random_state=42
)

print(f"ËÆ≠ÁªÉÈõÜ {len(train_data)} Êù°ÔºåËØÑ‰º∞ÈõÜ {len(eval_data)} Êù°")

# --------------------------
# 3. ÊûÑÂª∫ spaCy NER Ê†áÁ≠æ‰ΩìÁ≥ª
# --------------------------
nlp = spacy.blank("en")   # ‰ΩøÁî®Á©∫Ê®°Âûã
ner = nlp.add_pipe("ner")

# Êî∂ÈõÜÊâÄÊúâÊ†áÁ≠æ
all_labels = set()
for text, ann in annotations:
    for start, end, label in ann["entities"]:
        all_labels.add(label)

for label in all_labels:
    ner.add_label(label)

print("Âä†ÂÖ•ÂÆû‰ΩìÊ†áÁ≠æ:", all_labels)


# --------------------------
# 4. ËΩ¨Êç¢Êï∞ÊçÆ‰∏∫ spaCy DocBin
# --------------------------
def convert_to_spacy(data, output_path):
    db = DocBin()
    for text, ann in data:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in ann["entities"]:
            span = doc.char_span(start, end, label=label)
            if span:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
    print("‰øùÂ≠òÊàêÂäü:", output_path)

convert_to_spacy(train_data, "train.spacy")
convert_to_spacy(eval_data, "eval.spacy")


# --------------------------
# 5. ËÆ≠ÁªÉ NER Ê®°Âûã
# --------------------------
optimizer = nlp.initialize()

epochs = 20
for epoch in range(epochs):
    random.shuffle(train_data)

    losses = {}
    for text, ann in train_data:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {"entities": ann["entities"]})
        nlp.update([example], sgd=optimizer, losses=losses)

    print(f"Epoch {epoch+1}/{epochs}  Losses={losses}")

# ‰øùÂ≠òÊ®°Âûã
nlp.to_disk("ner_model")
print("\nÊ®°ÂûãÂ∑≤‰øùÂ≠òÂà∞ ner_model/")


# --------------------------
# 6. Âú® evaluation ÈõÜ‰∏äËøõË°åËØÑ‰º∞
# --------------------------
from collections import Counter

def evaluate(nlp, eval_data):
    tp = Counter()
    fp = Counter()
    fn = Counter()

    for text, ann in eval_data:
        doc = nlp(text)

        gold_ents = {(start, end, label) for start, end, label in ann["entities"]}
        pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

        for ent in pred_ents:
            if ent in gold_ents:
                tp[ent[2]] += 1
            else:
                fp[ent[2]] += 1

        for ent in gold_ents:
            if ent not in pred_ents:
                fn[ent[2]] += 1

    print("\n===== Evaluation Results =====")
    for label in all_labels:
        p = tp[label] / (tp[label] + fp[label] + 1e-8)
        r = tp[label] / (tp[label] + fn[label] + 1e-8)
        f1 = 2 * p * r / (p + r + 1e-8)
        print(f"{label:12s}  P={p:.4f}  R={r:.4f}  F1={f1:.4f}")


evaluate(nlp, eval_data)


ÊÄªÊ†∑Êú¨Êï∞Èáè: 100
ËÆ≠ÁªÉÈõÜ 70 Êù°ÔºåËØÑ‰º∞ÈõÜ 30 Êù°
Âä†ÂÖ•ÂÆû‰ΩìÊ†áÁ≠æ: {'LOCATION', 'ANATOMY', 'FINDING', 'DESCRIPTION', 'OTHER'}
‰øùÂ≠òÊàêÂäü: train.spacy
‰øùÂ≠òÊàêÂäü: eval.spacy




Epoch 1/20  Losses={'ner': 574.5058591766725}
Epoch 2/20  Losses={'ner': 433.9428211705683}
Epoch 3/20  Losses={'ner': 354.1707157610513}
Epoch 4/20  Losses={'ner': 294.17886770099926}
Epoch 5/20  Losses={'ner': 243.1009385932815}
Epoch 6/20  Losses={'ner': 209.48549465060808}
Epoch 7/20  Losses={'ner': 163.8420644680413}
Epoch 8/20  Losses={'ner': 134.68289650506037}
Epoch 9/20  Losses={'ner': 116.5384987037183}
Epoch 10/20  Losses={'ner': 85.66910204238084}
Epoch 11/20  Losses={'ner': 81.58852478229201}
Epoch 12/20  Losses={'ner': 86.85293162214982}
Epoch 13/20  Losses={'ner': 81.26762350269799}
Epoch 14/20  Losses={'ner': 64.63349874290319}
Epoch 15/20  Losses={'ner': 60.12567428249123}
Epoch 16/20  Losses={'ner': 67.11685614945976}
Epoch 17/20  Losses={'ner': 54.352081922513385}
Epoch 18/20  Losses={'ner': 50.40005956448662}
Epoch 19/20  Losses={'ner': 54.16552961731887}
Epoch 20/20  Losses={'ner': 42.93299904168329}

Ê®°ÂûãÂ∑≤‰øùÂ≠òÂà∞ ner_model/

===== Evaluation Results =====
LO

In [None]:
import json
import random
import spacy
from pathlib import Path
from termcolor import colored

# --------------------------
# 1. Âä†ËΩΩËÆ≠ÁªÉÂ•ΩÁöÑÊ®°Âûã
# --------------------------
nlp = spacy.load("ner_model")
print("Â∑≤Âä†ËΩΩÊ®°ÂûãÔºöner_model/")

# --------------------------
# 2. ËØªÂèñ evaluation Êï∞ÊçÆÈõÜ
# --------------------------
json_path = "./sentence_chunk_combined_Annotated.json"
with open(json_path, "r") as f:
    data = json.load(f)

annotations = data["annotations"]

# 70/30 ÂàíÂàÜ‰øùÊåÅ‰∏ÄËá¥
random.seed(42)
random.shuffle(annotations)
split = int(len(annotations) * 0.7)
eval_data = annotations[split:]

print("Evaluation Ê†∑Êú¨Êï∞:", len(eval_data))


# --------------------------
# 3. Â∑•ÂÖ∑ÂáΩÊï∞ÔºöÊâìÂç∞ÈáëÊ†á‰∏éÈ¢ÑÊµã
# --------------------------
def format_span(text, start, end, label, color):
    return (
        text[:start]
        + colored(text[start:end], color)
        + text[end:]
        + f" <{label}>"
    )

def showcase_one(text, gold_ents, pred_ents):
    print("\n========================================")
    print("TEXT:")
    print(text)
    print("----------------------------------------")

    print("GOLD:")
    for (s, e, label) in gold_ents:
        print(f"  {text[s:e]} [{label}] (pos {s}-{e})")

    print("\nPRED:")
    for (s, e, label) in pred_ents:
        print(f"  {text[s:e]} [{label}] (pos {s}-{e})")

    # TP, FP, FN
    gold_set = set(gold_ents)
    pred_set = set(pred_ents)

    tp = gold_set & pred_set
    fp = pred_set - gold_set
    fn = gold_set - pred_set

    print("\nMATCH SUMMARY:")
    print("üü© TP =", len(tp), "üü¶ FP =", len(fp), "üü• FN =", len(fn))


    def highlight_all():
        chunks = []
        for s, e, label in sorted(tp, key=lambda x: x[0]):
            print(colored(f"TP: {text[s:e]} [{label}]", "green"))

        for s, e, label in sorted(fp, key=lambda x: x[0]):
            print(colored(f"FP: {text[s:e]} [{label}]", "blue"))

        for s, e, label in sorted(fn, key=lambda x: x[0]):
            print(colored(f"FN: {text[s:e]} [{label}]", "red"))

    highlight_all()


# --------------------------
# 4. Showcase 
# --------------------------
N = 5  
samples = random.sample(eval_data, N)

for text, ann in samples:
    gold = [(s, e, label) for s, e, label in ann["entities"]]
    doc = nlp(text)
    pred = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    showcase_one(text, gold, pred)


Â∑≤Âä†ËΩΩÊ®°ÂûãÔºöner_model/
Evaluation Ê†∑Êú¨Êï∞: 30

TEXT:
In comparison with the study of ___, there again are low lung volumes that accentuate the prominence of the transverse diameter of the heart.
----------------------------------------
GOLD:
  In [OTHER] (pos 0-2)
  there [OTHER] (pos 37-42)
  again [OTHER] (pos 43-48)
  lung [ANATOMY] (pos 57-61)
  heart [ANATOMY] (pos 135-140)
  comparison [OTHER] (pos 3-13)
  with [OTHER] (pos 14-18)
  the [OTHER] (pos 19-22)
  accentuate [DESCRIPTION] (pos 75-85)
  prominence [DESCRIPTION] (pos 90-100)
  low  [FINDING] (pos 53-57)
  volumes [FINDING] (pos 62-69)
  study [OTHER] (pos 23-28)
   of [OTHER] (pos 28-31)
  ___ [OTHER] (pos 32-35)
  are [OTHER] (pos 49-52)
  that [OTHER] (pos 70-74)
  the [OTHER] (pos 86-89)
  of [OTHER] (pos 101-103)
  of [OTHER] (pos 128-130)
  the [OTHER] (pos 131-134)
  the  [OTHER] (pos 104-108)
  transverse diameter [LOCATION] (pos 108-127)

PRED:
  In [OTHER] (pos 0-2)
  comparison [OTHER] (pos 3-13)
  with