# Creating the NER Model with spaCy v3

In [1]:
import spacy
from spacy import displacy
from spacy.tokens import DocBin
from tqdm import tqdm

### Convert the annotated data into spaCy's binary `DocBin` (`.spacy`) format

In [2]:
def process_sentence(sentence_content, sentence_counter):
    """Turn one tagged sentence into a ``[text, {'entities': [...]}]`` record.

    Parameters
    ----------
    sentence_content : iterable of (token, tag) pairs
        The tokens of one sentence with their raw dataset tags.
    sentence_counter : int
        Running sentence number. It is not used by the conversion itself.

    Returns
    -------
    list
        ``[text, {'entities': [[start, end, label], ...]}]``. ``text`` is the
        tokens joined by single spaces with trailing whitespace stripped, and
        ``start``/``end`` are character offsets into that text. Every token
        whose final label is not ``'O'`` becomes its own entity (consecutive
        tokens with the same label are not merged).
    """
    # Raw tags that are collapsed before the prefix/label rewriting below.
    ignored_tags = {'B-COMMENT', 'I-COMMENT', 'OTHER'}
    quantity_like_tags = {'B-RANGE_END', 'B-INDEX'}
    # Applied in this order as plain substring replacements.
    # NOTE(review): only the 'B-' variants of RANGE_END / INDEX are mapped to
    # QTY; an 'I-' variant would come out as 'RANGE_END' / 'INDEX' -- confirm
    # whether such tags occur in the dataset.
    rewrites = (('B-', ''), ('I-', ''), ('NAME', 'INGREDIENT'), ('QTY', 'QUANTITY'))

    words = []
    spans = []
    cursor = 0  # character offset of the current token within the joined text
    for word, raw_tag in sentence_content:
        if raw_tag in ignored_tags:
            label = 'O'
        elif raw_tag in quantity_like_tags:
            label = 'QTY'
        else:
            label = raw_tag
        for old, new in rewrites:
            label = label.replace(old, new)

        stop = cursor + len(word)
        if label != 'O':
            spans.append([cursor, stop, label])
        words.append(word)
        cursor = stop + 1  # plus 1 because of the separating space

    return [' '.join(words).rstrip(), {'entities': spans}]

def read_nyt_dataset(path='resource/dataset_nyt', encoding=None):
    """Read the tab-separated tagged dataset and group it into sentences.

    Each non-blank line is split on tabs; column 0 is taken as the token and
    column 5 as its tag. A blank line marks the end of a sentence. Every
    sentence is converted with ``process_sentence`` and kept only if it has at
    least one entity.

    Parameters
    ----------
    path : str
        Location of the dataset file. Defaults to the path the notebook has
        always used.
    encoding : str or None
        Passed to ``open``. ``None`` keeps the platform default, as before.

    Returns
    -------
    list
        One ``[text, {'entities': [...]}]`` record per kept sentence, in file
        order.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than six tab-separated columns after
        surrounding whitespace is stripped.
    """
    dataset = []
    sentence_content = []
    sentence_counter = 1

    with open(path, encoding=encoding) as fin:
        for raw_line in fin:  # stream the file instead of loading every line
            file_line = raw_line.strip()

            if file_line:
                items = file_line.split('\t')
                sentence_content.append((items[0], items[5]))
                continue

            # Blank line: end of the phrase.
            sentence_data = process_sentence(sentence_content, sentence_counter)
            if sentence_data[1]['entities']:
                dataset.append(sentence_data)
            sentence_content = []
            sentence_counter += 1

    # Fix: a file that does not end with a blank line used to lose its final
    # sentence, because sentences were only flushed on a blank line.
    if sentence_content:
        sentence_data = process_sentence(sentence_content, sentence_counter)
        if sentence_data[1]['entities']:
            dataset.append(sentence_data)

    return dataset

def build_bin_object(dataset, set_name):
    """Serialise annotated sentences to ``resource/<set_name>_data.spacy``.

    Parameters
    ----------
    dataset : iterable of (text, annot)
        ``annot['entities']`` holds ``(start, end, label)`` character-offset
        triples into ``text``.
    set_name : str
        Inserted into the output file name, e.g. ``'train'`` or ``'dev'``.

    Entities for which ``doc.char_span`` returns ``None`` are skipped with a
    message. A document whose entities cannot be assigned or stored is printed
    and left out of the output instead of aborting the whole run.
    """
    nlp = spacy.blank('en')  # blank English pipeline; only make_doc is used
    db = DocBin()
    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label, alignment_mode='contract')
            if span is None:
                print('Skipping entity')
            else:
                ents.append(span)
        try:
            doc.ents = ents  # label the text with the ents
            db.add(doc)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt and
            # SystemExit and made this long loop impossible to interrupt.
            print(text, annot)
    db.to_disk('resource/%s_data.spacy' % set_name)  # save the DocBin object

In [3]:
# Load every annotated sentence, then cut the list into a 70% training part
# and a 30% development part.
# NOTE(review): the split is positional (no shuffling), so both parts keep the
# order of the source file -- confirm this is intended.
dataset = read_nyt_dataset()
split_index = int(0.70 * len(dataset))
train_data, dev_data = dataset[:split_index], dataset[split_index:]

# One .spacy file per split, written under resource/.
build_bin_object(train_data, 'train')
build_bin_object(dev_data, 'dev')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 117140/117140 [00:55<00:00, 2104.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50204/50204 [00:23<00:00, 2108.39it/s]


### Generate the config file to train via command line

In [4]:
# Download the base_config.cfg file from here: https://spacy.io/usage/training

In [5]:
# Expand base_config.cfg into a complete config.cfg with all values filled in.
! python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Training the model using the command line

In [6]:
# Train from config.cfg on the two .spacy files written earlier and save the
# pipeline under ./resource/model_ner/. The two --training.* flags set
# evaluation to every 10 steps and stop the run after 100 steps.
!python -m spacy train config.cfg --output ./resource/model_ner/ --paths.train ./resource/train_data.spacy --paths.dev ./resource/dev_data.spacy --training.eval_frequency 10 --training.max_steps 100

[38;5;4mℹ Saving to output directory: resource/model_ner[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-06-30 14:16:23,956] [INFO] Set up nlp object from config
[2022-06-30 14:16:23,967] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-30 14:16:23,971] [INFO] Created vocabulary
[2022-06-30 14:16:23,972] [INFO] Finished initializing nlp object
[2022-06-30 14:17:52,616] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     71.00   15.88   12.18   22.81    0.16
  0      10          0.62    806.83   31.16   84.77   19.09    0.31
  0      20          1.91    530.29   39.61   84.38   25.88    0.40
  0      30          3.58    402.93   72.05   66.59   78.49    0.72
  0      40          7.28    277.56   75.65   

### Load and test the model

In [7]:
# Load the `model-last` pipeline from the output directory given to the
# training command. This rebinds `nlp` for the cells that follow.
nlp = spacy.load('resource/model_ner/model-last/')

In [8]:
# Highlight colour for each entity label shown by the displaCy visualiser.
colors = dict(QUANTITY='yellow', UNIT='green', INGREDIENT='orange')
# Show exactly the labels that have a colour, in the same order.
options = dict(ents=list(colors), colors=colors)

In [9]:
# Run the loaded pipeline on a sample ingredient line and highlight the
# predicted entities inline in the notebook.
doc = nlp('2 cups cherry tomatoes, sliced into quarters')
displacy.render(
    doc,
    options=options,
    jupyter=True,
    style='ent',
)