In [2]:
# ! pip install -U spacy

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Blank English pipeline; below it is only used through nlp.make_doc() to
# tokenize the raw annotation texts.
nlp = spacy.blank("en")
# Container that collects the annotated Doc objects and is later written to
# ./training_data.spacy.
db = DocBin()


In [5]:
import json

# Read the annotation export. The context manager closes the file handle as
# soon as parsing is done (the bare open() previously leaked it), and the
# explicit encoding avoids depending on the platform default.
with open('drug_composition_annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [6]:
# Start from a fresh DocBin so that re-running this cell does not append the
# same documents a second time to the container created in an earlier cell.
db = DocBin()

for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" snaps character offsets inward to token boundaries;
        # char_span returns None when no valid span remains.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be fixed at the source.
            print(f"Skipping entity {label!r} at [{start}:{end}]: {text[start:end]!r}")
        else:
            ents.append(span)
    # Assigning overlapping or duplicate spans to doc.ents raises, so keep
    # only a non-overlapping subset (longest span wins).
    kept = spacy.util.filter_spans(ents)
    if len(kept) != len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in: {text!r}")
    doc.ents = kept
    db.add(doc)

# Serialize all annotated docs in spaCy's binary training format.
db.to_disk("./training_data.spacy")

100%|██████████| 43/43 [00:00<00:00, 522.19it/s]


In [7]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [8]:
# Train the NER pipeline; model-best/ and model-last/ are written under the
# --output directory (here the current working directory).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_P / ENTS_R / ENTS_F columns are measured on the training data itself
# and say nothing about generalization. Hold out a separate dev set.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     36.33    3.81    3.33    4.44    0.04
 16     200         18.63    864.05  100.00  100.00  100.00    1.00
 36     400          0.83      0.66  100.00  100.00  100.00    1.00
 61     600          0.01      0.01  100.00  100.00  100.00    1.00
 92     800          0.00      0.00  100.00  100.00  100.00    1.00
131    1000          0.00      0.00  100.00  100.00  100.00    1.00
178    1200          0.00      0.00  100.00  100.00  100.00    1.00
239    1400          0.00      0.00  100.00  100.00  100.00    1.00
306    1600          0.00      0.00  100.00  100.00  100.00    1.00
400    1800          0.00      0.00  100.00  100.00

In [9]:
# Load the best checkpoint from the directory the training cell wrote to
# (--output ./). A relative path keeps this working when the notebook is not
# run from /content.
nlp_ner = spacy.load("./model-best")

In [10]:
# Run the trained pipeline on one sample product description.
sample_text = "30 capsules Cipla Rx Formotero Fumarate and Budesonide Powder For Inhalation IP foracort rotacaps 200"
doc = nlp_ner(sample_text)

In [11]:

# Highlight the predicted entities inline in the notebook output.
spacy.displacy.render(
    doc,
    style="ent",
    jupyter=True,
)

In [12]:
# Archive the trained pipeline directory for download.
# NOTE(review): the absolute /content paths assume the notebook's working
# directory is /content (training wrote to --output ./) — confirm before
# running elsewhere.
!zip -r /content/model-best.zip /content/model-best/

  adding: content/model-best/ (stored 0%)
  adding: content/model-best/config.cfg (deflated 61%)
  adding: content/model-best/tok2vec/ (stored 0%)
  adding: content/model-best/tok2vec/model (deflated 8%)
  adding: content/model-best/tok2vec/cfg (stored 0%)
  adding: content/model-best/tokenizer (deflated 81%)
  adding: content/model-best/vocab/ (stored 0%)
  adding: content/model-best/vocab/strings.json (deflated 71%)
  adding: content/model-best/vocab/lookups.bin (stored 0%)
  adding: content/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/model-best/vocab/key2row (stored 0%)
  adding: content/model-best/vocab/vectors (deflated 45%)
  adding: content/model-best/meta.json (deflated 58%)
  adding: content/model-best/ner/ (stored 0%)
  adding: content/model-best/ner/model (deflated 7%)
  adding: content/model-best/ner/cfg (deflated 33%)
  adding: content/model-best/ner/moves (deflated 38%)
