In [1]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Start from a blank English pipeline (spacy.blank, not spacy.load); in this
# notebook it is only used via nlp.make_doc to tokenize the annotated texts.
nlp = spacy.blank("en") # blank "en" pipeline used to build Doc objects
# DocBin that accumulates the annotated Docs; a later cell writes it to disk.
db = DocBin()

In [2]:
import json
# Load the annotated examples. The context manager closes the file handle as
# soon as parsing is done (a bare open() leaves it open for the life of the
# kernel), and the explicit encoding keeps the read independent of the
# platform's default codec.
with open('annotation.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [3]:
# Convert every (text, annotation) pair into a spaCy Doc carrying its entity
# spans, collect the Docs in the DocBin, and write the DocBin to disk.
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # alignment_mode="contract" snaps character offsets inward to token
        # boundaries; char_span returns None when no span can be formed.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Say which annotation was dropped so it can be fixed in the source data.
            print(f"Skipping entity {label!r} at [{start}:{end}] ({text[start:end]!r}): "
                  "offsets do not align with token boundaries")
        else:
            ents.append(span)
    # doc.ents rejects overlapping spans with an error, which would abort the
    # whole export; filter_spans resolves overlaps (preferring longer spans).
    non_overlapping = spacy.util.filter_spans(ents)
    if len(non_overlapping) < len(ents):
        print(f"Dropped {len(ents) - len(non_overlapping)} overlapping entity span(s)")
    doc.ents = non_overlapping
    db.add(doc)

db.to_disk("./training_data.spacy")

100%|██████████| 22/22 [00:00<00:00, 122.26it/s]

Skipping entity





In [5]:
# Generate a training config for an English pipeline with a single `ner`
# component, optimized for efficiency; the result is saved as config.cfg.
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

⚠ To generate a more effective transformer-based config (GPU-only), install the
spacy-transformers package and re-run this command. The config generated now
does not use transformers.
ℹ Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [6]:
# Train the pipeline described by config.cfg and save the output to the
# current directory (--output ./).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores in the log are measured on the training data
# itself — consider exporting a held-out dev set and passing it here.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

^C


[2024-05-21 11:58:25,548] [INFO] Set up nlp object from config
[2024-05-21 11:58:25,564] [INFO] Pipeline: ['tok2vec', 'ner']
[2024-05-21 11:58:25,564] [INFO] Created vocabulary
[2024-05-21 11:58:25,572] [INFO] Finished initializing nlp object
[2024-05-21 11:58:26,435] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


ℹ Saving to output directory: .
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    118.54    0.00    0.00    0.00    0.00
 16     200      23697.75  12134.09   31.00   34.71   28.00    0.31
 33     400       2048.93   2009.42   90.55   88.54   92.67    0.91
 50     600        557.28    724.14   94.87   91.36   98.67    0.95
 66     800        610.92    616.75   93.92   95.21   92.67    0.94
 83    1000        557.35    543.24   95.59   97.24   94.00    0.96
100    1200        445.39    478.79   94.85   97.87   92.00    0.95
116    1400        376.99    447.46   95.15   92.45   98.00    0.95
133    1600        544.62    459.80   96.13   93.12   99.33    0.96
150    1800        459.27    457.18   95.21   97.89   92.67    0.95
166    2000        402.44    414.48   96.08   94

In [8]:
# Load the pipeline stored in the "model-best" directory (path is relative to
# the working directory) — presumably written by the training cell; verify.
nlp_ner = spacy.load("model-best")

In [9]:
# Run the loaded NER pipeline on one sample note; the entities it predicts are
# read from doc.ents in the following cells.
doc=nlp_ner("SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,Lungs:  Clear.,ASSESSMENT:,  Allergic rhinitis.,PLAN:,1.  She will try Zyrtec instead of Allegra again.  Another option will be to use loratadine.  She does not think she has prescription coverage so that might be cheaper.,2.  Samples of Nasonex two sprays in each nostril given for three weeks.  A prescription was written as well.")

In [10]:
# One record per predicted entity: its label under "Entity" and the matched
# surface text under "Text", in document order.
entities = [
    {"Entity": ent.label_, "Text": ent.text}
    for ent in doc.ents
]
print(entities)

[{'Entity': 'AGE', 'Text': '23-year-old'}, {'Entity': 'GENDER', 'Text': 'female'}, {'Entity': 'SYMPTOM', 'Text': 'allergies.'}, {'Entity': 'SYMPTOM', 'Text': 'allergies'}, {'Entity': 'MEDICINE', 'Text': 'Claritin'}, {'Entity': 'MEDICINE', 'Text': 'Zyrtec.'}, {'Entity': 'MEDICINE', 'Text': 'Allegra'}, {'Entity': 'DISEASE', 'Text': 'asthma'}, {'Entity': 'MEDICINE', 'Text': 'Ortho Tri-Cyclen'}, {'Entity': 'WEIGHT', 'Text': '130 pounds'}, {'Entity': 'SYMPTOM', 'Text': 'erythematous'}, {'Entity': 'DISEASE', 'Text': 'Allergic rhinitis.'}, {'Entity': 'MEDICINE', 'Text': 'loratadine.'}, {'Entity': 'MEDICINE', 'Text': 'Nasonex'}, {'Entity': 'DOSES', 'Text': 'two sprays'}]


In [11]:
import pandas as pd

# Group the extracted entity texts by entity type: each type becomes one
# column, holding its texts in order of appearance.
data = {}
for entity in entities:
    data.setdefault(entity['Entity'], []).append(entity['Text'])

# The DataFrame constructor needs equally long columns. default=0 keeps this
# from raising ValueError (max() of an empty sequence) when the model found no
# entities at all; in that case an empty DataFrame / CSV is produced instead.
max_length = max((len(values) for values in data.values()), default=0)

# Pad the shorter columns with None up to the longest one
# (multiplying a list by 0 yields [], so full-length columns are untouched).
for values in data.values():
    values.extend([None] * (max_length - len(values)))

# Convert the padded columns to a DataFrame
df = pd.DataFrame(data)

# Print the DataFrame
print(df)

# Save the DataFrame to a CSV file (no index column)
df.to_csv('dynamic_structured_data.csv', index=False)

           AGE  GENDER       SYMPTOM          MEDICINE             DISEASE  \
0  23-year-old  female    allergies.          Claritin              asthma   
1         None    None     allergies           Zyrtec.  Allergic rhinitis.   
2         None    None  erythematous           Allegra                None   
3         None    None          None  Ortho Tri-Cyclen                None   
4         None    None          None       loratadine.                None   
5         None    None          None           Nasonex                None   

       WEIGHT       DOSES  
0  130 pounds  two sprays  
1        None        None  
2        None        None  
3        None        None  
4        None        None  
5        None        None  


In [12]:
# Colour overrides for the entity visualizer are keyed by entity label, so the
# keys are written exactly as the model emits them (AGE, GENDER, MEDICINE).
# The previous "Age=" key was a typo that matched no label, so AGE entities
# never received their custom colour.
colors={"AGE":"#F67DE3","GENDER":"#e6194B","MEDICINE":"#3cb44b"}
options={"colors":colors}
spacy.displacy.render(doc, style="ent",options=options, jupyter=True) # display in Jupyter