<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/NER/Custom%20NER%20Training%20using%20spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!python -m spacy download en_core_web_sm

### Importing Dependencies

In [2]:
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

### Training Data

In [4]:
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

### Defining the model

In [5]:
model = None
output_dir=Path("ner")
n_iter=100

### Load the model


In [6]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


###Set up the pipeline


In [7]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

In [9]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Only train NER

In [13]:
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 34.92it/s]


{'ner': 12.459059476852417}


100%|██████████| 3/3 [00:00<00:00, 45.28it/s]


{'ner': 12.864447057247162}


100%|██████████| 3/3 [00:00<00:00, 40.29it/s]


{'ner': 12.389938294887543}


100%|██████████| 3/3 [00:00<00:00, 42.81it/s]


{'ner': 11.03943046927452}


100%|██████████| 3/3 [00:00<00:00, 43.60it/s]


{'ner': 9.111038595438004}


100%|██████████| 3/3 [00:00<00:00, 46.21it/s]


{'ner': 8.747764304280281}


100%|██████████| 3/3 [00:00<00:00, 44.31it/s]


{'ner': 7.09586388617754}


100%|██████████| 3/3 [00:00<00:00, 43.35it/s]


{'ner': 6.966649919748306}


100%|██████████| 3/3 [00:00<00:00, 45.18it/s]


{'ner': 7.659058164805174}


100%|██████████| 3/3 [00:00<00:00, 41.41it/s]


{'ner': 6.075723123256466}


100%|██████████| 3/3 [00:00<00:00, 43.14it/s]


{'ner': 5.698748848750256}


100%|██████████| 3/3 [00:00<00:00, 45.47it/s]


{'ner': 5.330300049914513}


100%|██████████| 3/3 [00:00<00:00, 43.58it/s]


{'ner': 4.552661396563053}


100%|██████████| 3/3 [00:00<00:00, 44.79it/s]


{'ner': 5.255003970363759}


100%|██████████| 3/3 [00:00<00:00, 46.40it/s]


{'ner': 3.992736534681171}


100%|██████████| 3/3 [00:00<00:00, 44.56it/s]


{'ner': 5.744023977706092}


100%|██████████| 3/3 [00:00<00:00, 39.80it/s]


{'ner': 5.207886075178976}


100%|██████████| 3/3 [00:00<00:00, 44.55it/s]


{'ner': 5.152956927930063}


100%|██████████| 3/3 [00:00<00:00, 43.47it/s]


{'ner': 5.199962692233385}


100%|██████████| 3/3 [00:00<00:00, 42.71it/s]


{'ner': 4.374559223477263}


100%|██████████| 3/3 [00:00<00:00, 48.32it/s]


{'ner': 3.2482410317297763}


100%|██████████| 3/3 [00:00<00:00, 46.77it/s]


{'ner': 2.1909197910940748}


100%|██████████| 3/3 [00:00<00:00, 43.34it/s]


{'ner': 3.0264390334169065}


100%|██████████| 3/3 [00:00<00:00, 49.37it/s]


{'ner': 1.7819746466566357}


100%|██████████| 3/3 [00:00<00:00, 48.63it/s]


{'ner': 1.1148077007217125}


100%|██████████| 3/3 [00:00<00:00, 44.61it/s]


{'ner': 2.7756813469850385}


100%|██████████| 3/3 [00:00<00:00, 44.93it/s]


{'ner': 1.158229610371738}


100%|██████████| 3/3 [00:00<00:00, 44.99it/s]


{'ner': 0.9009806501824116}


100%|██████████| 3/3 [00:00<00:00, 49.01it/s]


{'ner': 1.8386583729641799}


100%|██████████| 3/3 [00:00<00:00, 40.52it/s]


{'ner': 1.2245709349979825}


100%|██████████| 3/3 [00:00<00:00, 47.53it/s]


{'ner': 2.528677051971465}


100%|██████████| 3/3 [00:00<00:00, 43.76it/s]


{'ner': 2.1986149271555653}


100%|██████████| 3/3 [00:00<00:00, 44.54it/s]


{'ner': 2.1101778259604647}


100%|██████████| 3/3 [00:00<00:00, 43.82it/s]


{'ner': 1.8885970228352336}


100%|██████████| 3/3 [00:00<00:00, 47.16it/s]


{'ner': 0.7007456223516765}


100%|██████████| 3/3 [00:00<00:00, 45.63it/s]


{'ner': 1.549273075243576}


100%|██████████| 3/3 [00:00<00:00, 44.62it/s]


{'ner': 1.4461231645628563}


100%|██████████| 3/3 [00:00<00:00, 47.54it/s]


{'ner': 1.1268729110094302}


100%|██████████| 3/3 [00:00<00:00, 43.80it/s]


{'ner': 1.3396617030521296}


100%|██████████| 3/3 [00:00<00:00, 45.90it/s]


{'ner': 0.6872039103821371}


100%|██████████| 3/3 [00:00<00:00, 41.37it/s]


{'ner': 1.1076891415540644}


100%|██████████| 3/3 [00:00<00:00, 47.87it/s]


{'ner': 1.7823152677773435}


100%|██████████| 3/3 [00:00<00:00, 44.00it/s]


{'ner': 0.7585717533509805}


100%|██████████| 3/3 [00:00<00:00, 46.03it/s]


{'ner': 0.5379036120198873}


100%|██████████| 3/3 [00:00<00:00, 44.79it/s]


{'ner': 0.6334580524455796}


100%|██████████| 3/3 [00:00<00:00, 46.51it/s]


{'ner': 0.7573269131349646}


100%|██████████| 3/3 [00:00<00:00, 47.55it/s]


{'ner': 0.2645762577046633}


100%|██████████| 3/3 [00:00<00:00, 47.18it/s]


{'ner': 0.05421632113075252}


100%|██████████| 3/3 [00:00<00:00, 42.44it/s]


{'ner': 0.36788513023620867}


100%|██████████| 3/3 [00:00<00:00, 43.85it/s]


{'ner': 2.2787724717511475}


100%|██████████| 3/3 [00:00<00:00, 46.33it/s]


{'ner': 0.1070071623521762}


100%|██████████| 3/3 [00:00<00:00, 46.71it/s]


{'ner': 0.0008882873583758621}


100%|██████████| 3/3 [00:00<00:00, 47.26it/s]


{'ner': 0.20235459136926948}


100%|██████████| 3/3 [00:00<00:00, 44.17it/s]


{'ner': 0.01809808412992518}


100%|██████████| 3/3 [00:00<00:00, 48.33it/s]


{'ner': 3.996483820106034e-06}


100%|██████████| 3/3 [00:00<00:00, 43.60it/s]


{'ner': 0.055561630702287375}


100%|██████████| 3/3 [00:00<00:00, 42.68it/s]


{'ner': 0.0004022726877014526}


100%|██████████| 3/3 [00:00<00:00, 44.81it/s]


{'ner': 0.009504676482010654}


100%|██████████| 3/3 [00:00<00:00, 42.78it/s]


{'ner': 0.00018055049179536205}


100%|██████████| 3/3 [00:00<00:00, 45.90it/s]


{'ner': 0.0001263230774520661}


100%|██████████| 3/3 [00:00<00:00, 45.74it/s]


{'ner': 0.000246636192149991}


100%|██████████| 3/3 [00:00<00:00, 44.91it/s]


{'ner': 4.447248215348692e-05}


100%|██████████| 3/3 [00:00<00:00, 50.06it/s]


{'ner': 4.138879120547683e-05}


100%|██████████| 3/3 [00:00<00:00, 40.78it/s]


{'ner': 5.519166033968765e-07}


100%|██████████| 3/3 [00:00<00:00, 42.85it/s]


{'ner': 5.3348466028675265e-06}


100%|██████████| 3/3 [00:00<00:00, 46.09it/s]


{'ner': 1.1274112901290431e-05}


100%|██████████| 3/3 [00:00<00:00, 38.18it/s]


{'ner': 3.0492908101577322e-05}


100%|██████████| 3/3 [00:00<00:00, 40.70it/s]


{'ner': 0.07918139193721437}


100%|██████████| 3/3 [00:00<00:00, 39.32it/s]


{'ner': 9.444963879477198e-05}


100%|██████████| 3/3 [00:00<00:00, 44.95it/s]


{'ner': 7.544871751821616e-05}


100%|██████████| 3/3 [00:00<00:00, 42.64it/s]


{'ner': 1.9714275953651827e-06}


100%|██████████| 3/3 [00:00<00:00, 44.08it/s]


{'ner': 0.00016121468537869937}


100%|██████████| 3/3 [00:00<00:00, 44.90it/s]


{'ner': 2.766418953022747e-07}


100%|██████████| 3/3 [00:00<00:00, 47.42it/s]


{'ner': 9.713602962467116e-09}


100%|██████████| 3/3 [00:00<00:00, 49.39it/s]


{'ner': 0.001755485678149882}


100%|██████████| 3/3 [00:00<00:00, 46.14it/s]


{'ner': 2.3235307286860977e-07}


100%|██████████| 3/3 [00:00<00:00, 46.79it/s]


{'ner': 4.484765745304603e-05}


100%|██████████| 3/3 [00:00<00:00, 48.90it/s]


{'ner': 1.0686794089112089e-05}


100%|██████████| 3/3 [00:00<00:00, 48.31it/s]


{'ner': 5.235076944916366e-08}


100%|██████████| 3/3 [00:00<00:00, 45.22it/s]


{'ner': 3.384810934597411e-05}


100%|██████████| 3/3 [00:00<00:00, 46.96it/s]


{'ner': 5.499873244029382e-09}


100%|██████████| 3/3 [00:00<00:00, 42.35it/s]


{'ner': 0.00034106235065564053}


100%|██████████| 3/3 [00:00<00:00, 45.14it/s]


{'ner': 6.619892691239999e-07}


100%|██████████| 3/3 [00:00<00:00, 44.13it/s]


{'ner': 8.791311572972432e-06}


100%|██████████| 3/3 [00:00<00:00, 44.41it/s]


{'ner': 0.007598231210188752}


100%|██████████| 3/3 [00:00<00:00, 42.27it/s]


{'ner': 0.0007249180359227885}


100%|██████████| 3/3 [00:00<00:00, 47.04it/s]


{'ner': 4.135028297670267e-05}


100%|██████████| 3/3 [00:00<00:00, 45.26it/s]


{'ner': 0.00026251461326294904}


100%|██████████| 3/3 [00:00<00:00, 46.15it/s]


{'ner': 7.491690861916336e-07}


100%|██████████| 3/3 [00:00<00:00, 46.73it/s]


{'ner': 0.00012795886441086256}


100%|██████████| 3/3 [00:00<00:00, 45.00it/s]


{'ner': 9.129881818616502e-08}


100%|██████████| 3/3 [00:00<00:00, 45.13it/s]


{'ner': 1.2266744162087177e-08}


100%|██████████| 3/3 [00:00<00:00, 46.25it/s]


{'ner': 5.65422685706828e-07}


100%|██████████| 3/3 [00:00<00:00, 47.44it/s]


{'ner': 7.041047071739912e-08}


100%|██████████| 3/3 [00:00<00:00, 44.45it/s]


{'ner': 2.8471942269392474e-06}


100%|██████████| 3/3 [00:00<00:00, 45.46it/s]


{'ner': 8.059800355603257e-08}


100%|██████████| 3/3 [00:00<00:00, 42.43it/s]


{'ner': 8.354351850288891e-07}


100%|██████████| 3/3 [00:00<00:00, 37.81it/s]


{'ner': 1.009573677952168e-06}


100%|██████████| 3/3 [00:00<00:00, 38.01it/s]


{'ner': 1.6811590894378839e-06}


100%|██████████| 3/3 [00:00<00:00, 39.83it/s]

{'ner': 0.023270564207550148}





In [14]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Kamal Khumar', 'PERSON')]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Entities [('Nishanth', 'PERSON')]


### Saving the spaCy model

In [15]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ner


In [16]:
m = spacy.load('/content/ner')