# Creating the NER Model with spaCy v2

In [10]:
from spacy import displacy

### Convert the annotated data into the spaCy format

In [35]:
def process_sentence(sentence_content, sentence_counter):
    """Convert one tagged sentence into a spaCy v2 style training example.

    The tokens are joined with single spaces, and every token whose
    normalised tag is not 'O' becomes its own entity span, expressed as
    character offsets into the joined text.

    Tag normalisation:
      * 'B-COMMENT', 'I-COMMENT' and 'OTHER' are treated as 'O' (no entity).
      * 'B-RANGE_END' and 'B-INDEX' are folded into 'QTY'.
      * A leading 'B-' or 'I-' marker is removed from any other tag.

    Args:
        sentence_content: iterable of (token, tag) pairs in sentence order.
        sentence_counter: running sentence number supplied by the caller.
            Kept for interface compatibility; it is not used.

    Returns:
        A two-item list ``[text, {'entities': [[start, end, label], ...]}]``.
    """
    skipped_tags = {'B-COMMENT', 'I-COMMENT', 'OTHER'}
    # NOTE(review): only the B- variants are folded into QTY; 'I-RANGE_END'
    # or 'I-INDEX' would come out as 'RANGE_END' / 'INDEX'. Confirm intended.
    quantity_tags = {'B-RANGE_END', 'B-INDEX'}

    tokens = []
    entities = []
    offset = 0
    for token, tag in sentence_content:
        if tag in skipped_tags:
            label = 'O'
        elif tag in quantity_tags:
            label = 'QTY'
        elif tag.startswith(('B-', 'I-')):
            # Strip the marker only as a prefix, so a label that happens to
            # contain 'B-' or 'I-' further in is not mangled.
            label = tag[2:]
        else:
            label = tag

        if label != 'O':
            entities.append([offset, offset + len(token), label])
        tokens.append(token)
        offset += len(token) + 1  # Plus 1 because of the separating space

    # rstrip keeps the original behaviour when the last token is empty.
    sentence_text = ' '.join(tokens).rstrip()
    return [sentence_text, {'entities': entities}]

def read_nyt_dataset(path='resource/dataset_nyt'):
    """Read the tagged dataset and return spaCy-style training examples.

    The file is read line by line. Every non-blank line is split on tabs;
    field 0 is taken as the token and field 5 as its tag. A blank line ends
    the current sentence. Each sentence is converted with
    ``process_sentence`` and kept only if it has at least one entity.

    Args:
        path: location of the dataset file. Defaults to the path that was
            previously hard-coded.

    Returns:
        A list of ``[text, {'entities': [...]}]`` items.

    Raises:
        IndexError: if a non-blank line has fewer than six tab-separated
            fields.
    """
    dataset = []
    sentence_content = []
    sentence_counter = 1

    with open(path) as fin:
        file_lines = fin.readlines()

    # A sentence is only flushed when a blank line is seen. Append a blank
    # sentinel so the last sentence is not lost when the file does not end
    # with an empty line. An extra blank line is harmless: an empty sentence
    # has no entities and is therefore not added.
    file_lines.append('')

    for file_line in file_lines:
        file_line = file_line.strip()

        if file_line:
            items = file_line.split('\t')
            token, tag = items[0], items[5]
            sentence_content.append((token, tag))
        else:  # End of the sentence
            sentence_data = process_sentence(sentence_content, sentence_counter)
            if sentence_data[1]['entities']:
                dataset.append(sentence_data)
            sentence_content = []
            sentence_counter += 1

    return dataset

In [36]:
# Load the annotated corpus; each item is [text, {'entities': [...]}].
TRAIN_DATA = read_nyt_dataset()

In [None]:
import spacy
import random


def train_spacy(data, iterations):
    """Train a blank English spaCy v2 pipeline with an NER component.

    Args:
        data: sequence of ``(text, annotations)`` pairs, where
            ``annotations`` is a dict whose 'entities' value is a list of
            ``[start, end, label]`` items.
        iterations: number of passes over the data.

    Returns:
        The trained ``nlp`` object.
    """
    # Work on a shallow copy: the examples are shuffled in place on every
    # iteration and the caller's list should not be reordered as a side
    # effect.
    examples = list(data)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even when the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in examples:
        for ent in annotations.get('entities', ()):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(examples)
            losses = {}
            for text, annotations in examples:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


# Train on the full dataset for 20 passes (runs when this cell executes).
prdnlp = train_spacy(TRAIN_DATA, 20)

# Save the trained model to the path 'new_spacy'
modelfile = 'new_spacy'
prdnlp.to_disk(modelfile)

# Smoke test: run the trained model on a sample phrase and print each
# predicted entity with its character offsets and label.
test_text = '1 cup peeled and cooked fresh chestnuts'
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [38]:
# displaCy rendering options: a colour per entity label, and the list of
# labels to display.
colors = {'QTY': 'yellow', 'UNIT': 'green', 'NAME': 'orange'}
options = {'ents': ['QTY', 'UNIT', 'NAME'], 'colors':colors}

In [39]:
# Run the trained model on a short ingredient phrase and render the
# predicted entities inline in the notebook.
doc = prdnlp('25 grams whole coffee beans')

displacy.render(doc, style='ent', jupyter=True, options=options)

In [41]:
# Same rendering, this time on recipe instruction text instead of an
# ingredient phrase.
doc = prdnlp('Preheat the oven to 350 F. Butter or oil an 8-inch baking dish.')

displacy.render(doc, style='ent', jupyter=True, options=options)