In [None]:
# Download the pretrained English spaCy pipelines (small and large).
# NOTE(review): only en_core_web_lg is loaded by the cells visible in this
# notebook; en_core_web_sm is fetched but not referenced here — confirm it is needed.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

In [17]:
import spacy

# Load the large pretrained English pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_lg")

# Sample question used to demonstrate the stock named-entity recognizer.
text = "What video sharing service did Steve Chen, Chad Hurley, and Jawed Karim create in 2005 at America?"
doc = nlp(text)

# List the pipeline components that were applied to `doc`.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [18]:
from spacy import displacy

# Highlight the entities found in the sample question inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")

In [20]:
# Run the pretrained pipeline on a second sentence and highlight its entities.
doc = nlp("Donald Trump was President of USA")
displacy.render(doc, jupyter=True, style="ent")

# Trailing expression: the notebook echoes the tuple of recognized entity spans.
doc.ents


(Donald Trump, USA)

In [22]:
import json

# Load the annotated corpus into `data`.
# The encoding is passed explicitly: JSON files are UTF-8 by convention, whereas
# open() without `encoding` falls back to the platform's locale encoding and can
# mis-decode (or fail on) non-ASCII text on systems whose default is not UTF-8.
with open('dataset/Corona2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Reshape every annotated example into the structure consumed further down:
#   {'text': <raw content>, 'entities': [(start, end, LABEL), ...]}
# Building the list with comprehensions keeps loop temporaries out of the
# notebook's global namespace.
training_data = [
    {
        'text': example['content'],
        'entities': [
            # Tag names are upper-cased so they are used as entity labels in one consistent form.
            (annotation['start'], annotation['end'], annotation['tag_name'].upper())
            for annotation in example['annotations']
        ],
    }
    for example in data['examples']
]

# Show a size summary and a single sample instead of dumping the whole corpus
# into the cell output.
print(f"{len(training_data)} training examples")
print(training_data[0] if training_data else "no examples found")

In [None]:
 # Turn the character-offset annotations in `training_data` into spaCy Doc
 # objects and serialize them to train.spacy.
 from spacy.tokens import DocBin
 from spacy.util import filter_spans
 from tqdm import tqdm

 # A blank English pipeline: only make_doc() (tokenization) is used below.
 nlp = spacy.blank("en")
 doc_bin = DocBin()
 skipped_count = 0

 for training_example in tqdm(training_data):
     example_text = training_example['text']
     example_doc = nlp.make_doc(example_text)
     spans = []
     for start, end, label in training_example['entities']:
         # char_span() returns None when the offsets cannot be mapped onto tokens
         # under the chosen alignment mode; such annotations are reported and dropped.
         span = example_doc.char_span(start, end, label=label, alignment_mode="contract")
         if span is None:
             skipped_count += 1
             # tqdm.write() prints without corrupting the progress bar, and the
             # message identifies exactly which annotation was dropped.
             tqdm.write(f"Skipping entity {label} [{start}:{end}] {example_text[start:end]!r}")
         else:
             spans.append(span)
     # filter_spans() removes duplicate/overlapping spans before they are
     # assigned to doc.ents (see the spaCy documentation for its tie-breaking).
     example_doc.ents = filter_spans(spans)
     doc_bin.add(example_doc)

 tqdm.write(f"Skipped {skipped_count} entity annotation(s) that could not be aligned to tokens")
 doc_bin.to_disk("train.spacy")


In [37]:
# Expand ./base_config.cfg into a fully populated training config saved as ./config.cfg.
# NOTE(review): base_config.cfg is not created by any cell visible here — it is
# assumed to already exist next to the notebook.
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [40]:
# Train the pipeline described by config.cfg; --output ./ saves results into the current directory.
# NOTE(review): --paths.dev points at the same train.spacy file that is used for
# training, so the ENTS_F / ENTS_P / ENTS_R scores printed below are measured on
# the training data itself rather than on a held-out set.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy 



[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    150.79    0.00    0.00    0.00    0.00
  7     200       1451.58   3327.30   74.34   73.90   74.80    0.74
 14     400        199.31    626.17   95.37   94.42   96.34    0.95
 22     600        290.69    297.50   97.76   97.96   97.56    0.98
 30     800        181.95    199.53   97.15   97.15   97.15    0.97
 40    1000        181.19    171.98   97.58   96.80   98.37    0.98
 52    1200        400.09    232.61   98.37   98.37   98.37    0.98
 65    1400        157.04    176.06   98.37   98.37   98.37    0.98
 82    1600        166.95    187.97   98.58   98.38   98.78    0.99
103    1800        227.75    221.20   98.37   98.77

In [39]:
# Load the pipeline saved under "model-best" and try it on an unseen passage.
nlp_ner = spacy.load("model-best")

# Highlight colour for each custom entity label.
colors = dict(PATHOGEN="#F67DE3", MEDICINE="#7DF6D9", MEDICALCONDITION="#a6e22d")
options = dict(colors=colors)

doc = nlp_ner("While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.")

# Render the recognized entities inline using the colour map above.
spacy.displacy.render(doc, jupyter=True, style="ent", options=options)
