In [140]:
import spacy
from spacy.tokens import DocBin
nlp = spacy.load("en_core_web_sm")
db = DocBin()

In [141]:
!python -m spacy init fill-config ./base_config.cfg ./config.cfg

[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [142]:
import json

training_data = json.load(open("annotations.json"))

for text,annot in training_data['annotations']:
    doc = nlp.make_doc(text)
    ents = []
    for start,end,label in annot['entities']:
        span = doc.char_span(start,end,label=label,alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
        
    doc.ents = ents
    db.add(doc)
    
db.to_disk("./train.spacy")

In [143]:

test_data = json.load(open("test_anno.json"))

for text,annot in test_data['annotations']:
    doc = nlp.make_doc(text)
    ents = []
    for start,end,label in annot['entities']:
        span = doc.char_span(start,end,label=label,alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
        
    doc.ents = ents
    db.add(doc)
    
db.to_disk("./test.spacy")

In [144]:
!python -m spacy train ./config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy 


[i] Saving to output directory: output
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['tok2vec', 'ner']
[i] Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     25.86   14.49    8.62   45.45    0.14
200     200          0.22    273.91  100.00  100.00  100.00    1.00
400     400          0.00      0.00  100.00  100.00  100.00    1.00
600     600          0.00      0.00  100.00  100.00  100.00    1.00
800     800          0.00      0.00  100.00  100.00  100.00    1.00
1000    1000          0.00      0.00  100.00  100.00  100.00    1.00
1200    1200          0.00      0.00  100.00  100.00  100.00    1.00
1400    1400          0.00      0.00  100.00  100.00  100.00    1.00
1600    1600          0.00      0.00  100.00  100.00  100.00    1.00
1800    1800          0.00      0.00  100.00  100.00  100.00    1.00
[+] Saved pipeline to output

[2023-01-25 16:13:47,989] [INFO] Set up nlp object from config
[2023-01-25 16:13:47,998] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-01-25 16:13:48,001] [INFO] Created vocabulary
[2023-01-25 16:13:48,451] [INFO] Added vectors: en_core_web_sm
[2023-01-25 16:13:48,452] [INFO] Finished initializing nlp object
[2023-01-25 16:13:48,669] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [149]:
best = spacy.load(r"./output/model-best") #load the best model

In [150]:

doc = best("Apple has a great new product called Homepod")
spacy.displacy.render(doc, style="ent", jupyter=True)


In [151]:
doc = best("Steam has a new console SteamDeck which can run Counter Strike too")
spacy.displacy.render(doc, style="ent", jupyter=True)

In [152]:
doc = best("Nintendo releases Switch! but can it compete with SteamDeck?")
spacy.displacy.render(doc, style="ent", jupyter=True)