# End-to-end process for adding both an entity ruler and word vectors to an NER model
1. Document cleaning and splitting the corpus into test and train sets
2. Build word vectors
3. Build training data with entity ruler and split into train and validation data
4. Add word vectors to model, run

## Notebook 3
- Load training data
- Process to create training entities
    - Build entity ruler patterns
    - Build training data
- Convert to spaCy 3 format
    - Save training and validation data

In [1]:
#import, save data
import json

#build entity ruler
import spacy

#convert to training format
from spacy.tokens import DocBin

import srsly
import typer
import warnings
from pathlib import Path

### Load data

In [2]:
# import training data
def load_data(file):
    """Read the UTF-8 encoded JSON document at *file* and return its parsed content."""
    with open(file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [3]:
# Load the training corpus; the preview printed in the next cell shows it is a
# list of sentence strings.
files = load_data('data/sw_train_ner.json')

In [4]:
# Preview ten entries (indices 5 through 14) of the loaded corpus.
print(files[5:15])

['Corran asked', 'necessarily Luke realm possibility either', 'use dueling blasters ranged weapons', 'Usually Deputy Director led morning meetings', 'Wedge gave light slap arm', 'turned Leia', 'Yeah', 'finally caught', 'Kyp put shuttle Corran retrieved extra tool kit pulled shuttle left', '']


### Create entity ruler and add pipe

In [5]:
# Load the pretrained medium English pipeline.
nlp = spacy.load('en_core_web_md')
# No explicit add_pipe('ner') call is needed: the loaded pipeline already
# contains an 'ner' component (see the pipe_names output in the next cell).
#nlp.add_pipe('ner')
# Insert the entity ruler into the pipeline directly before 'ner'.
ruler = nlp.add_pipe('entity_ruler', before='ner')

In [6]:
# Confirm the component order: 'entity_ruler' should appear just before 'ner'.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'entity_ruler', 'ner']


In [7]:
# Entity ruler patterns for the Star Wars corpus.
# Each entry is a dict with a 'label' (the entity type to assign) and a
# 'pattern' (a plain string to look for). Besides PERSON, LOC, ORG and PRODUCT,
# the list introduces the custom labels TITLE, FORCE, SPECIES, FORM, DROID and
# SHIP. Spelling/casing variants are listed as separate entries (e.g. both
# 'Holobook' and 'HoloBook', 'Cami' and 'Camie', 'Sirren' and 'Sir ren') --
# presumably because string patterns are matched exactly; confirm against the
# spaCy EntityRuler documentation.
patterns = [
            {'label':'PERSON', 'pattern':'Jaina'},
            {'label':'PERSON', 'pattern':'Wedge'},
            {'label':'PERSON', 'pattern':'Wedge Antilles'},
            {'label':'PERSON', 'pattern':'Corran'},
            {'label':'PERSON', 'pattern':'Corran Horn'},
            {'label':'PERSON', 'pattern':'Iella'},
            {'label':'PERSON', 'pattern':'Iella Wessiri'},
            {'label':'PERSON', 'pattern':'Jacen'},
            {'label':'PERSON', 'pattern':"Brianna"},
            {'label':'PERSON', 'pattern':"Sirren Choth"},
            {'label':'PERSON', 'pattern':"Sirren"},
            {'label':'PERSON', 'pattern':"Sir ren"},
            {'label':'PERSON', 'pattern':'Anakin'},
            {'label':'PERSON', 'pattern':'Cami'},
            {'label':'PERSON', 'pattern':'Camie'},
            {'label':'PERSON', 'pattern':'Valah'},
            {'label':'PERSON', 'pattern':'Leia'},
            {'label':'PERSON', 'pattern':'Salta'},
            {'label':'PERSON', 'pattern':'Han'},
            {'label':'PERSON', 'pattern':'Kyp'},
            {'label':'PERSON', 'pattern':'Mirax'},
            {'label':'PERSON', 'pattern':'Cal'},
            {'label':'PERSON', 'pattern':'Hobbie'},
            {'label':'PERSON', 'pattern':'Mourtos'},
            {'label':'PERSON', 'pattern':'Winter'},
            {'label':'PERSON', 'pattern':'Booster'},
            {'label':'PERSON', 'pattern':'Visalia'},
            {'label':'PERSON', 'pattern':"Borsk Feylya"},
            {'label':'TITLE', 'pattern':"Director"},
            {'label':'TITLE', 'pattern':"Assistant Director"},
            {'label':'TITLE', 'pattern':"Deputy Assistant Director"},
            {'label':'TITLE', 'pattern':"Deputy"},
            {'label':'TITLE', 'pattern':"Lieutenant"},
            {'label':'TITLE', 'pattern':"General"},
            {'label':'TITLE', 'pattern':"Councilor"},
            {'label':'TITLE', 'pattern':"Moff"},
            {'label':'TITLE', 'pattern':"Commander"},
            {'label':'TITLE', 'pattern':"Captain"},
            {'label':'TITLE', 'pattern':"Chief"},
            {'label':'TITLE', 'pattern':"Assistant Chief"},
            {'label':'TITLE', 'pattern':"AD"},
            {'label':'TITLE', 'pattern':"Master"},
            {'label':'TITLE', 'pattern':"Master Jedi"},
            {'label':'LOC', 'pattern':'Coruscant'},
            {'label':'LOC', 'pattern':'Alderaan'},
            {'label':'LOC', 'pattern':'Yavin'},
            {'label':'LOC', 'pattern':'Jedah City'},
            {'label':'LOC', 'pattern':'Tatooine'},
            {'label':'LOC', 'pattern':'Formuth'},
            {'label':'ORG', 'pattern':'NRDI'},
            {'label':'ORG', 'pattern':'New Republic Directorate of Intelligence'},
            {'label':'ORG', 'pattern':'Empire'},
            {'label':'ORG', 'pattern':'Vir Shat'},
            {'label':'ORG', 'pattern':'Vir Azmun'},
            {'label':'ORG', 'pattern':'Sith'},
            {'label':'ORG', 'pattern':'Jedi'},
            {'label':'ORG', 'pattern':'Jedi Order'},
            {'label':'PERSON', 'pattern':"Vir Sun"},
            {'label':'LOC', 'pattern':'Charapath'},
            {'label':'LOC', 'pattern':'Xalos'},
            {'label':'LOC', 'pattern':'Dagobah'},
            {'label':'LOC', 'pattern':'Hoth'},
            {'label':'LOC', 'pattern':'Bakura'},
            {'label':'LOC', 'pattern':'Scarif'},
            {'label':'PERSON', 'pattern':'Tycho'},
            {'label':'FORCE', 'pattern':'Force'},
            {'label':'SPECIES', 'pattern':'Duros'},
            {'label':'SPECIES', 'pattern':'Yuzong Vong'},
            {'label':'SPECIES', 'pattern':'Utapaun'},
            {'label':'SPECIES', 'pattern':'Bothan'},
            {'label':'SPECIES', 'pattern':"Twilek"},
            {'label':'SPECIES', 'pattern':"Elomin"},
            {'label':'PRODUCT', 'pattern':'Holobook'},
            {'label':'PRODUCT', 'pattern':'HoloBook'},
            {'label':'PRODUCT', 'pattern':'HoloNet'},
            {'label':'FORM', 'pattern':'Soresu'},
            {'label':'FORM', 'pattern':'Makashi'},
            {'label':'DROID', 'pattern':'R2'},
            {'label':'DROID', 'pattern':'Whistler'},
            {'label':'DROID', 'pattern':'3PO'},
            {'label':'DROID', 'pattern':'Threepio'},
            {'label':'ORG', 'pattern':"Vir Teng"},
            {'label':'PERSON', 'pattern':'Dom'},
            {'label':'SHIP', 'pattern':'Falcon'},
            {'label':'SHIP', 'pattern':'X Wing'},
            {'label':'SHIP', 'pattern':'Y Wings'},
            {'label':'SHIP', 'pattern':'TIE fighter'},
            {'label':'SHIP', 'pattern':'Errant Venture'},
            {'label':'SHIP', 'pattern':'Pulsar Skate'},
            {'label':'PERSON', 'pattern':'Melki'},
            {'label':'FORM', 'pattern':'Form IV'},
            {'label':'FORM', 'pattern':'Form II'},
            {'label':'FORM', 'pattern':'Form III'}]

In [8]:
# Register the pattern list with the entity ruler component of the pipeline.
ruler.add_patterns(patterns)

### Put into spaCy format

In [9]:
# Run every sentence through the pipeline and keep each detected entity as
# [start_char, end_char, label] next to the original text. The entities come
# from the whole pipeline, not only the ruler patterns (the sample output
# below includes labels such as 'TIME' that no pattern defines).
TRAIN_DATA = []
for sentence in files:
    doc = nlp(sentence)
    entities = [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
    TRAIN_DATA.append([sentence, {'entities': entities}])

In [10]:
# Spot-check ten generated examples (indices 5 through 14): each is
# [text, {'entities': [[start_char, end_char, label], ...]}].
print(TRAIN_DATA[5:15])

[['Corran asked', {'entities': [[0, 6, 'PERSON']]}], ['necessarily Luke realm possibility either', {'entities': [[12, 16, 'PERSON']]}], ['use dueling blasters ranged weapons', {'entities': []}], ['Usually Deputy Director led morning meetings', {'entities': [[8, 14, 'TITLE'], [15, 23, 'TITLE'], [28, 35, 'TIME']]}], ['Wedge gave light slap arm', {'entities': [[0, 5, 'PERSON']]}], ['turned Leia', {'entities': [[7, 11, 'PERSON']]}], ['Yeah', {'entities': []}], ['finally caught', {'entities': []}], ['Kyp put shuttle Corran retrieved extra tool kit pulled shuttle left', {'entities': [[0, 3, 'PERSON'], [16, 22, 'PERSON']]}], ['', {'entities': []}]]


In [11]:
# Total number of generated examples.
print(len(TRAIN_DATA))

16560


### Save training data as json

In [12]:
def save_data(file, data):
    """Serialise *data* as JSON (4-space indent) into the UTF-8 text file *file*.

    An existing file at that path is overwritten.
    """
    with open(file, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4)

In [13]:
# Split the examples in half: the first half becomes the training set, the
# second half the validation set. The midpoint is derived from the data rather
# than hard-coded, so the split stays balanced if the corpus size changes
# (for the 16560 examples counted above it is still 8280).
split_index = len(TRAIN_DATA) // 2
save_data('./data/sw_training_data.json', TRAIN_DATA[:split_index])
save_data('./data/sw_valid_data.json', TRAIN_DATA[split_index:])

### Convert to spaCy 3 format

In [14]:
def load_data(file):
    """Read the UTF-8 encoded JSON document at *file* and return its parsed content.

    This is an identical redefinition of the loader declared earlier in the notebook.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [15]:
# Reload both splits from the JSON files written in the previous section.
training_json = './data/sw_training_data.json'
validation_json = './data/sw_valid_data.json'
train_data = load_data(training_json)
valid_data = load_data(validation_json)

In [16]:
def convert(lang:str, TRAIN_DATA, output_path: Path):
    """Serialise annotated examples into a spaCy ``DocBin`` file.

    Args:
        lang: Language code used to create the blank pipeline whose tokenizer
            turns each text into a ``Doc``.
        TRAIN_DATA: Iterable of ``(text, annot)`` pairs where
            ``annot['entities']`` is a sequence of ``(start, end, label)``
            character offsets.
        output_path: Destination handed to ``DocBin.to_disk``.

    Entities whose character offsets do not line up with token boundaries
    (``Doc.char_span`` returns ``None``) are left out and reported through
    ``warnings.warn``.
    """
    blank_pipeline = spacy.blank(lang)
    doc_bin = DocBin()
    for text, annot in TRAIN_DATA:
        doc = blank_pipeline.make_doc(text)
        aligned_spans = []
        for start, end, label in annot['entities']:
            span = doc.char_span(start, end, label=label)
            if span is not None:
                aligned_spans.append(span)
                continue
            # Misaligned offsets: warn and drop this entity.
            msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
            warnings.warn(msg)
        doc.ents = aligned_spans
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)

In [17]:
# Write each split out in the binary .spacy (DocBin) format.
for split_examples, spacy_file in (
    (train_data, 'data/train.spacy'),
    (valid_data, 'data/valid.spacy'),
):
    convert('en', split_examples, spacy_file)