In [1]:
import json
import re
import glob
import os
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
from spacy.tokens import Doc
from spacy.training.example import Example
from spacy.scorer import Scorer
# Custom Written Functions to prepare dataset for NER Training
from data_convert import create_dict_concept_type, find_index

  from .autonotebook import tqdm as notebook_tqdm


DONE JSON
TESTING ALGORITHM


## Convert Training Data from BETH to JSON Format

In [3]:
text_files = glob.glob("datasets/concept_assertion_relation_training_data/beth/txt/*")
concept_files_path = "datasets/concept_assertion_relation_training_data/beth/concept/"
training_data = {'classes' : ['TEST', "TREATMENT", "PROBLEM"], 'annotations' : []}


for filename in text_files:
    temp_dict = {}
    with open(filename, "r") as file:
        text = file.read()
    temp_dict['entities'] = []
    temp_dict['text'] = text
    temp_dict["file_name"] = filename.split("/")[-1]

    concept_filename = filename.split("/")[-1].replace("txt","con")
    concept_path = concept_files_path + concept_filename
    dict_concept_type = create_dict_concept_type(concept_path)
    
    for concept, _type in dict_concept_type.items():
        length_word = len(concept)+1
        indexes = find_index(text,concept)
   
        for index in indexes:
            temp_dict["entities"].append((index[0],index[1]+1,_type.upper()))
    training_data["annotations"].append(temp_dict)
    
if not os.path.exists('datasets/json_files/beth_data.json'):
    with open("datasets/json_files/beth_data.json", "w") as file:
        json.dump(training_data, file, indent=4)
        print("DONE JSON")

## Convert Training Data from Partners to JSON Format

In [4]:
text_files = glob.glob("datasets/concept_assertion_relation_training_data/partners/txt/*")
concept_files_path = "datasets/concept_assertion_relation_training_data/partners/concept/"
training_data = {'classes' : ['TEST', "TREATMENT", "PROBLEM"], 'annotations' : []}


for filename in text_files:
    temp_dict = {}
    with open(filename, "r") as file:
        text = file.read()
    temp_dict['entities'] = []
    temp_dict['text'] = text
    temp_dict["file_name"] = filename.split("/")[-1]

    concept_filename = filename.split("/")[-1].replace("txt","con")
    concept_path = concept_files_path + concept_filename
    dict_concept_type = create_dict_concept_type(concept_path)
    
    for concept, _type in dict_concept_type.items():
        length_word = len(concept)+1
        indexes = find_index(text,concept)
   
        for index in indexes:
            temp_dict["entities"].append((index[0],index[1]+1,_type.upper()))
    training_data["annotations"].append(temp_dict)
    
if not os.path.exists('datasets/json_files/partners_data.json'):
    with open("datasets/json_files/partners_data.json", "w") as file:
        json.dump(training_data, file, indent=4)
        print("DONE JSON")

## Convert Test Data to JSON Format

In [5]:
text_files = glob.glob('datasets/reference_standard_for_test_data/txt/*')
concept_files_path = "datasets/reference_standard_for_test_data/concepts/"
test_data = {'classes' : ['TEST', "TREATMENT", "PROBLEM"], 'annotations' : []}


for filename in text_files:
    temp_dict = {}
    with open(filename, "r") as file:
        text = file.read()
    temp_dict['entities'] = []
    temp_dict['text'] = text
    temp_dict["file_name"] = filename.split("/")[-1]

    concept_filename = filename.split("/")[-1].replace("txt","con")
    concept_path = concept_files_path + concept_filename
    dict_concept_type = create_dict_concept_type(concept_path)
    
    for concept, _type in dict_concept_type.items():
        length_word = len(concept)+1
        indexes = find_index(text,concept)
   
        for index in indexes:
            temp_dict["entities"].append((index[0],index[1]+1,_type.upper()))
    test_data["annotations"].append(temp_dict)
    
if not os.path.exists('datasets/json_files/test_data.json'):
    with open("datasets/json_files/test_data.json", "w") as file:
        json.dump(test_data, file, indent=4)
        print("DONE JSON")

In [6]:
# Import the train dataset from the json file
with open('datasets/json_files/beth_data.json', 'r') as f:
    data = json.load(f)

In [7]:
# Create Training Data in the form of a dictionary from json file
training_data = {'classes': ["TEST", "TREATMENT", "PROBLEM"], 'annotations': []}
for ann in data['annotations']:
    temp_dict = {}
    temp_dict['text'] = ann['text']
    temp_dict['entities'] = []
    for entity in ann['entities']:
        temp_dict['entities'].append((entity[0],entity[1],entity[2]))
    training_data['annotations'].append(temp_dict)
    

In [8]:
# Code to Convert Training Data to Custom Spacy Format Required for Training
nlp = spacy.blank("en")
doc_bin = DocBin()
for training_example in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

#Already created don't run again
if not os.path.exists('training_data.spacy'):
    doc_bin.to_disk("training_data.spacy") # save the docbin object

 15%|█▌        | 11/73 [00:00<00:01, 48.71it/s]

Skipping entity
Skipping entity


 81%|████████  | 59/73 [00:00<00:00, 60.97it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 73/73 [00:01<00:00, 59.43it/s]

Skipping entity
Skipping entity
Skipping entity





## Please Don't Run Below Cell as it is already trained and it takes time.

In [9]:

#Create Final Configuration File for Training from Base Configuration File
if not os.path.exists('config.cfg'):
    ! python -m spacy init fill-config base_config.cfg config.cfg

# Run Custom NER Training using Spacy
if not os.path.exists('model-best'):
    ! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

In [10]:
# Import the train dataset from the json file
with open('datasets/json_files/test_data.json', 'r') as f:
    data = json.load(f)

In [11]:
# Create Test Data in the form of a dictionary from json file
test_data = {'classes': ["TEST", "TREATMENT", "PROBLEM"], 'annotations': []}
for ann in data['annotations']:
    temp_dict = {}
    temp_dict['text'] = ann['text']
    temp_dict['entities'] = []
    for entity in ann['entities']:
        temp_dict['entities'].append((entity[0],entity[1],entity[2]))
    test_data['annotations'].append(temp_dict)

In [12]:
# Code to Convert Testing Data to Custom Spacy Format Required for Evaluation
for testing_example in tqdm(test_data['annotations']):
    text = testing_example['text']
    labels = testing_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

#Already created don't run again
if not os.path.exists('testing_data.spacy'):
    doc_bin.to_disk("testing_data.spacy") # save the docbin object

 26%|██▌       | 67/256 [00:01<00:02, 77.49it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 33%|███▎      | 85/256 [00:01<00:02, 79.20it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 44%|████▍     | 113/256 [00:01<00:01, 84.80it/s]

Skipping entity


 59%|█████▊    | 150/256 [00:02<00:01, 80.28it/s]

Skipping entity


 70%|███████   | 180/256 [00:02<00:00, 82.47it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 82%|████████▏ | 209/256 [00:02<00:00, 85.60it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 97%|█████████▋| 248/256 [00:03<00:00, 90.77it/s]

Skipping entity
Skipping entity
Skipping entity


100%|██████████| 256/256 [00:03<00:00, 75.40it/s]


In [13]:
# Evaluation on the Test Dataset
! python -m spacy evaluate model-best/ ./testing_data.spacy


[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   83.33 
NER R   76.93 
NER F   80.00 
SPEED   34280 

[1m

                P       R       F
PROBLEM     80.83   77.50   79.13
TEST        86.85   77.50   81.91
TREATMENT   83.33   75.57   79.26



In [14]:
DISPLAY_COLORS = {
    "PROBLEM": "#1f77b4",
    "TREATMENT": "#ff7f0e",
    "TEST": "#2ca02c",

}
nlp_ner = spacy.load('model-best')

doc = nlp_ner('''Full term well appearing infant with some facial jaundice , overall pink , warm and well perfused , alert and responsive .
Anterior fontanelle is soft , open and flat .
Ears are normally set .
Red reflexes noted bilaterally .
He had a left pupil that was in unequal in size with a keyhole appearance consistent with a coloboma .
The nares were patent .
Palate was intact .
Mucous membranes were moist and pink .
His neck was supple without masses or bruits .
Lungs were clear to auscultation and equal .
Comfortable respiratory pattern .
Cardiovascular :
Regular rate and rhythm , no murmur .
2 plus femoral pulses were noted .
Abdomens oft with positive bowel sounds .
Genitourinary :
Infant was circumcised in the newborn nursery which was healing well .
Testes were descended bilaterally .
Extremities were pink and well perfused .''')

#print(type(doc.ents[7]))

options = {"colors": DISPLAY_COLORS} 

# Visulation of Entitites from Clinical Text
spacy.displacy.render(doc, style="ent", options= options, jupyter=True)



In [15]:
import scispacy

from scispacy.linking import EntityLinker

ModuleNotFoundError: No module named 'scispacy'

## Definition of Entities and Similar Terms (Entity Linking)

In [16]:
nlp = spacy.load("model-best")

nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp('''Full term well appearing infant with some facial jaundice , overall pink , warm and well perfused , alert and responsive .
Anterior fontanelle is soft , open and flat .
Ears are normally set .
Red reflexes noted bilaterally .
He had a left pupil that was in unequal in size with a keyhole appearance consistent with a coloboma .
The nares were patent .
Palate was intact .
Mucous membranes were moist and pink .
His neck was supple without masses or bruits .
Lungs were clear to auscultation and equal .
Comfortable respiratory pattern .
Cardiovascular :
Regular rate and rhythm , no murmur .
2 plus femoral pulses were noted .
Abdomens oft with positive bowel sounds .
Genitourinary :
Infant was circumcised in the newborn nursery which was healing well .
Testes were descended bilaterally .
Extremities were pink and well perfused .''')

# Let's look at a random entity!
entity = doc.ents[0]

linker = nlp.get_pipe("scispacy_linker")
for umls_ent in entity._.kb_ents:
	print(linker.kb.cui_to_entity[umls_ent[0]])

ValueError: [E002] Can't find factory for 'scispacy_linker' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a Transformer, make sure to install 'spacy-transformers'. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, ner, beam_ner, entity_ruler, tagger, morphologizer, senter, sentencizer, textcat, spancat, future_entity_ruler, span_ruler, textcat_multilabel, en.lemmatizer