In [48]:
import json
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
from spacy import displacy

In [35]:
# Blank English pipeline; in this notebook it is only used for tokenization
# (`nlp.make_doc`) when the training annotations are converted to Doc objects.
nlp = spacy.blank('en')

In [36]:
# Load the annotated examples from disk.
# The encoding is given explicitly: JSON files are UTF-8, while open() would
# otherwise fall back to the platform default and can mis-decode non-ASCII text
# (which would also shift the character offsets of the annotations).
with open("training_data.json", encoding="utf-8") as f:
    dataset = json.load(f)

In [37]:
# Flatten every annotated example into one record per text:
#   {"text": <raw text>, "entity": [(start, end, label), ...]}
# Each item of dataset["annotations"] is indexed as [text, {"entities": [...]}],
# and each entity is indexed as [start, end, label].
NER_Training_data = [
    {
        "text": sample[0],
        "entity": [
            (annotation[0], annotation[1], annotation[2])
            for annotation in sample[1]["entities"]
        ],
    }
    for sample in dataset["annotations"]
]

In [38]:
# Inspect the first converted record to sanity-check the text, offsets and labels.
NER_Training_data[0]

{'text': 'Cryptocurrency prices today surged with Bitcoin trading at $48,089.82, a 2.6% increase in the last 24 hours. Ethereum surged marginally to $3,838.45, a 1.3% increase in the last 24 hours.',
 'entity': [(40, 47, 'CRYPTO'),
  (59, 69, 'VALUE'),
  (73, 77, 'PERCENTAGE'),
  (109, 117, 'CRYPTO'),
  (139, 148, 'VALUE'),
  (152, 156, 'PERCENTAGE')]}

In [39]:
# Container that collects the annotated Doc objects; it is written to
# "train.spacy" further down and used as the training corpus.
doc_bin = DocBin()

In [40]:
# Turn every annotated record into a spaCy Doc with gold entities and collect
# the Docs in a DocBin for training.
#
# The DocBin is (re)created in this cell so that running the cell more than
# once does not append the same documents to the corpus a second time.
doc_bin = DocBin()
skipped_spans = 0

for record in tqdm(NER_Training_data):
    text = record["text"]
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in record["entity"]:
        span = doc.char_span(start_idx=start, end_idx=end, label=label)
        if span is None:
            # No span could be built from these character offsets, so the
            # annotation is dropped. Report exactly which one, so the
            # annotation can be corrected instead of silently lost.
            skipped_spans += 1
            print(f"skipping span: {label} [{start}:{end}] {text[start:end]!r}")
        else:
            ents.append(span)

    # filter_spans is applied before assigning doc.ents so that duplicate or
    # overlapping spans are removed first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

if skipped_spans:
    print(f"{skipped_spans} annotation(s) skipped in total")

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<?, ?it/s]


In [41]:
# Serialize the collected Docs to "train.spacy"; this file is passed to
# `spacy train` via --paths.train below.
doc_bin.to_disk("train.spacy")

In [42]:
# Training setup follows the spaCy training quickstart:
# https://spacy.io/usage/training#quickstart

In [43]:
# Expand base_config.cfg into a complete training config (config.cfg) with all
# remaining values auto-filled. base_config.cfg must already exist in the
# working directory.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m[+] Auto-filled config with all values[0m
[38;5;2m[+] Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [45]:
# Train the pipeline described by config.cfg on train.spacy and write the
# results to the current directory (--output ./).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores in the log are measured on the training data
# itself and say nothing about generalization. Evaluate on a held-out
# dev.spacy before trusting them.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy 

[38;5;4m[i] Saving to output directory: .[0m
[38;5;4m[i] Using CPU[0m
[1m
[38;5;2m[+] Initialized pipeline[0m
[1m
[38;5;4m[i] Pipeline: ['tok2vec', 'ner'][0m
[38;5;4m[i] Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     45.05    3.70    2.78    5.56    0.04
 78     200        443.16   1255.92  100.00  100.00  100.00    1.00
176     400          0.00      0.00  100.00  100.00  100.00    1.00
276     600          0.00      0.00  100.00  100.00  100.00    1.00
459     800          0.00      0.00  100.00  100.00  100.00    1.00
659    1000          0.00      0.00  100.00  100.00  100.00    1.00
859    1200          0.00      0.00  100.00  100.00  100.00    1.00
1059    1400          0.00      0.00  100.00  100.00  100.00    1.00
1259    1600          0.00      0.00  100.00  100.00  100.00    1.00
1459    1800          0.00      0.00  1

[2023-07-28 11:49:52,350] [INFO] Set up nlp object from config
[2023-07-28 11:49:52,359] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-07-28 11:49:52,362] [INFO] Created vocabulary
[2023-07-28 11:49:52,362] [INFO] Finished initializing nlp object
[2023-07-28 11:49:52,455] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [46]:
# Load the pipeline stored in ./model-best (presumably the best checkpoint
# written by the `spacy train --output ./` run above; verify the directory exists).
custom_ner = spacy.load("model-best")

In [47]:
# Run the trained pipeline on a sample sentence.
# NOTE(review): this text is identical to the first training record, so the
# result shows what the model memorized, not how it handles unseen text.
sample_text = (
    "Cryptocurrency prices today surged with Bitcoin trading at $48,089.82, "
    "a 2.6% increase in the last 24 hours. "
    "Ethereum surged marginally to $3,838.45, "
    "a 1.3% increase in the last 24 hours."
)
doc = custom_ner(sample_text)

In [49]:
# Visualize the entities predicted for `doc` inline in the notebook
# (entity-highlighting style, rendered directly as cell output).
displacy.render(doc, style="ent", jupyter=True)