In [14]:
import spacy
from spacy import displacy

# When Tensorflow is installed, it informs you that it can use the AVX operations and 
# will use them by default in certain situations (e.g. forward or back-prop matrix multiply),
# which can enhance the speed of the process. 
# This is not an error but rather a notification that Tensorflow is utilizing your CPU 
# to its fullest potential to improve performance.

text = "Blinken and Lavrov meet on G-20 sidelines in the first meeting in New Delhi between the top diplomats since the Ukraine war began."

for model_name in ['en_core_web_sm', 'en_core_web_md']:

    print(f'### {model_name} ###')
    
    # Load the model
    nlp = spacy.load(model_name)
    doc = nlp(text)

    # The different parts of the pipeline
    print(nlp.pipe_names)
    
    # Display the NERs
    displacy.render(doc, style="ent", jupyter=True)

### en_core_web_sm ###
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


### en_core_web_md ###
['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [15]:
# The standard cases are quite good but to not fit to all usecases.
# For example one might be interested in detecting entities related to diseases.

string = "Antiretroviral therapy ( ART ) is recommended for all HIV-infected individuals"
doc = nlp(string)
displacy.render(doc, style="ent", jupyter=True)

In [24]:
import json
# Download data and rename to diseases.json: https://www.kaggle.com/finalepoch/medical-ner
with open('../../data/Example/diseases.json', 'r') as f:
    data = json.load(f)

training_data = {'classes' : ['MEDICINE', "MEDICALCONDITION", "PATHOGEN"],
                 'annotations' : []}
# turning the data into a format usable for spacy.
for example in data['examples']:
    temp_dict = {}
    temp_dict['text'] = example['content']
    temp_dict['entities'] = []
    for annotation in example['annotations']:
        start = annotation['start']
        end = annotation['end']
        label = annotation['tag_name'].upper()
        temp_dict['entities'].append((start, end, label))
        training_data['annotations'].append(temp_dict)
        
example_annotations = training_data['annotations'][0]

print(json.dumps(example_annotations, indent=2))

{
  "text": "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
  "entities": [
    [
      360,
      371,
      "MEDICINE"
    ],
    [
      383,
      408,

In [53]:
from spacy.util import filter_spans
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

for training_example  in tqdm(training_data['annotations']):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("./data/training_data.spacy") # save the doc bin object

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 295/295 [00:00<00:00, 1055.16it/s]


In [54]:
# The base config can be set via https://spacy.io/usage/training#quickstart
# Only select ner, CPU, efficiency.
!python -m spacy init fill-config ./conf/base_config.cfg ./conf/config.cfg

2023-03-03 11:02:44.233351: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
conf/config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [56]:
!python -m spacy train ./conf/config.cfg --output ./models --paths.train ./data/training_data.spacy --paths.dev ./data/training_data.spacy --gpu-id 0

2023-03-03 11:06:52.072789: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
[38;5;4mℹ Saving to output directory: models[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-03-03 11:06:56,790] [INFO] Set up nlp object from config
[2023-03-03 11:06:56,802] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-03-03 11:06:56,806] [INFO] Created vocabulary
[2023-03-03 11:06:56,807] [INFO] Finished initializing nlp object
[2023-03-03 11:06:57,530] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  

In [57]:
nlp_ner = spacy.load("./models/model-best")

doc = nlp_ner("Antiretroviral therapy (ART) is recommended for all HIV-infected\
individuals to reduce the risk of disease progression.\nART also is recommended \
for HIV-infected individuals for the prevention of transmission of HIV.\nPatients \
starting ART should be willing and able to commit to treatment and understand the\
benefits and risks of therapy and the importance of adherence. Patients may choose\
to postpone therapy, and providers, on a case-by-case basis, may elect to defer\
therapy on the basis of clinical and/or psychosocial factors.")

colors = {"PATHOGEN": "#F67DE3", "MEDICINE": "#7DF6D9", "MEDICALCONDITION":"#FFFFFF"}
options = {"colors": colors} 

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)