<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/NER/Custom%20NER%20Training%20using%20spaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!python -m spacy download en_core_web_sm

### Importing Dependencies

In [2]:
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

### Training Data

In [4]:
TRAIN_DATA = [
    ('Who is Nishanth?', {
        'entities': [(7, 15, 'PERSON')]
    }),
     ('Who is Kamal Khumar?', {
        'entities': [(7, 19, 'PERSON')]
    }),
    ('I like London and Berlin.', {
        'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]
    })
]

### Defining the model

In [19]:
model = None #"en_core_web_sm"
output_dir=Path("ner")
n_iter=100

### Load the model


In [20]:
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Loaded model 'en_core_web_sm'


###Set up the pipeline


In [21]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

In [22]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

### Only train NER

In [23]:
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 3/3 [00:00<00:00, 32.88it/s]


{'ner': 5.814728645360993}


100%|██████████| 3/3 [00:00<00:00, 40.25it/s]


{'ner': 9.32632679263861}


100%|██████████| 3/3 [00:00<00:00, 37.60it/s]


{'ner': 7.288140892771814}


100%|██████████| 3/3 [00:00<00:00, 40.63it/s]


{'ner': 4.247559000502342}


100%|██████████| 3/3 [00:00<00:00, 40.52it/s]


{'ner': 5.972497153280477}


100%|██████████| 3/3 [00:00<00:00, 40.22it/s]


{'ner': 3.665076876705882}


100%|██████████| 3/3 [00:00<00:00, 35.59it/s]


{'ner': 4.64346996912151}


100%|██████████| 3/3 [00:00<00:00, 36.87it/s]


{'ner': 5.236079253787471}


100%|██████████| 3/3 [00:00<00:00, 38.13it/s]


{'ner': 0.3769508883428487}


100%|██████████| 3/3 [00:00<00:00, 36.81it/s]


{'ner': 6.215265891911221}


100%|██████████| 3/3 [00:00<00:00, 36.59it/s]


{'ner': 1.29673856970271}


100%|██████████| 3/3 [00:00<00:00, 39.94it/s]


{'ner': 0.8236923778624852}


100%|██████████| 3/3 [00:00<00:00, 39.90it/s]


{'ner': 0.5345205915741525}


100%|██████████| 3/3 [00:00<00:00, 36.45it/s]


{'ner': 1.777081610457121}


100%|██████████| 3/3 [00:00<00:00, 36.95it/s]


{'ner': 0.0341936191069591}


100%|██████████| 3/3 [00:00<00:00, 39.12it/s]


{'ner': 0.00698193952343615}


100%|██████████| 3/3 [00:00<00:00, 41.63it/s]


{'ner': 0.031098738263685567}


100%|██████████| 3/3 [00:00<00:00, 38.83it/s]


{'ner': 0.4533462037012106}


100%|██████████| 3/3 [00:00<00:00, 38.28it/s]


{'ner': 0.013294612441940235}


100%|██████████| 3/3 [00:00<00:00, 40.29it/s]


{'ner': 0.25971546912513443}


100%|██████████| 3/3 [00:00<00:00, 34.97it/s]


{'ner': 4.342251075143032}


100%|██████████| 3/3 [00:00<00:00, 34.66it/s]


{'ner': 0.10521023677967123}


100%|██████████| 3/3 [00:00<00:00, 39.01it/s]


{'ner': 0.007545049864503099}


100%|██████████| 3/3 [00:00<00:00, 38.14it/s]


{'ner': 0.09623341779641882}


100%|██████████| 3/3 [00:00<00:00, 39.14it/s]


{'ner': 0.10874286228347493}


100%|██████████| 3/3 [00:00<00:00, 38.24it/s]


{'ner': 0.04148425108277715}


100%|██████████| 3/3 [00:00<00:00, 37.44it/s]


{'ner': 1.6884883094677405}


100%|██████████| 3/3 [00:00<00:00, 39.25it/s]


{'ner': 1.392008750383656}


100%|██████████| 3/3 [00:00<00:00, 37.67it/s]


{'ner': 2.313562901862949e-05}


100%|██████████| 3/3 [00:00<00:00, 39.69it/s]


{'ner': 1.7417979652480293}


100%|██████████| 3/3 [00:00<00:00, 38.76it/s]


{'ner': 0.04582652137063518}


100%|██████████| 3/3 [00:00<00:00, 40.61it/s]


{'ner': 0.0010483049988545743}


100%|██████████| 3/3 [00:00<00:00, 37.44it/s]


{'ner': 0.20483488917971573}


100%|██████████| 3/3 [00:00<00:00, 35.71it/s]


{'ner': 2.65412654322985e-07}


100%|██████████| 3/3 [00:00<00:00, 36.86it/s]


{'ner': 9.993161175914639e-08}


100%|██████████| 3/3 [00:00<00:00, 39.22it/s]


{'ner': 0.08557848513895276}


100%|██████████| 3/3 [00:00<00:00, 38.04it/s]


{'ner': 7.640529240853827e-05}


100%|██████████| 3/3 [00:00<00:00, 39.46it/s]


{'ner': 0.04359993686053011}


100%|██████████| 3/3 [00:00<00:00, 39.60it/s]


{'ner': 1.2967407725285379e-06}


100%|██████████| 3/3 [00:00<00:00, 38.21it/s]


{'ner': 1.4231533790725012e-05}


100%|██████████| 3/3 [00:00<00:00, 38.67it/s]


{'ner': 1.334633361271575e-06}


100%|██████████| 3/3 [00:00<00:00, 36.87it/s]


{'ner': 0.0004915015119901365}


100%|██████████| 3/3 [00:00<00:00, 37.97it/s]


{'ner': 1.329253437543119e-07}


100%|██████████| 3/3 [00:00<00:00, 36.72it/s]


{'ner': 0.16386392832638505}


100%|██████████| 3/3 [00:00<00:00, 35.83it/s]


{'ner': 5.634091406233199e-06}


100%|██████████| 3/3 [00:00<00:00, 37.72it/s]


{'ner': 2.521570125251156e-08}


100%|██████████| 3/3 [00:00<00:00, 40.67it/s]


{'ner': 3.1377695714164083e-06}


100%|██████████| 3/3 [00:00<00:00, 36.91it/s]


{'ner': 0.006000663713294967}


100%|██████████| 3/3 [00:00<00:00, 37.92it/s]


{'ner': 1.6469580201618098e-05}


100%|██████████| 3/3 [00:00<00:00, 39.62it/s]


{'ner': 0.00812663572494082}


100%|██████████| 3/3 [00:00<00:00, 37.93it/s]


{'ner': 0.00038629976428045007}


100%|██████████| 3/3 [00:00<00:00, 37.12it/s]


{'ner': 1.6252684857234535e-08}


100%|██████████| 3/3 [00:00<00:00, 39.57it/s]


{'ner': 1.8531406236407852}


100%|██████████| 3/3 [00:00<00:00, 40.00it/s]


{'ner': 0.000440065276755914}


100%|██████████| 3/3 [00:00<00:00, 39.66it/s]


{'ner': 0.010659627009107657}


100%|██████████| 3/3 [00:00<00:00, 39.05it/s]


{'ner': 3.662660074393243e-05}


100%|██████████| 3/3 [00:00<00:00, 37.95it/s]


{'ner': 1.2124727313396941}


100%|██████████| 3/3 [00:00<00:00, 37.85it/s]


{'ner': 2.180459176205832e-06}


100%|██████████| 3/3 [00:00<00:00, 38.72it/s]


{'ner': 7.75263008659438e-07}


100%|██████████| 3/3 [00:00<00:00, 39.18it/s]


{'ner': 5.281247781062682e-07}


100%|██████████| 3/3 [00:00<00:00, 38.30it/s]


{'ner': 2.871462183589763e-09}


100%|██████████| 3/3 [00:00<00:00, 37.82it/s]


{'ner': 5.6384382023751115e-05}


100%|██████████| 3/3 [00:00<00:00, 37.62it/s]


{'ner': 9.180187258778123e-08}


100%|██████████| 3/3 [00:00<00:00, 38.86it/s]


{'ner': 1.9882659089829263e-06}


100%|██████████| 3/3 [00:00<00:00, 39.32it/s]


{'ner': 3.853828848452116e-09}


100%|██████████| 3/3 [00:00<00:00, 37.33it/s]


{'ner': 4.0825266540421105e-10}


100%|██████████| 3/3 [00:00<00:00, 41.66it/s]


{'ner': 4.266839982416233e-05}


100%|██████████| 3/3 [00:00<00:00, 36.07it/s]


{'ner': 2.943784438699055e-05}


100%|██████████| 3/3 [00:00<00:00, 36.48it/s]


{'ner': 1.3698409796190459e-08}


100%|██████████| 3/3 [00:00<00:00, 37.16it/s]


{'ner': 1.0083346941548412e-08}


100%|██████████| 3/3 [00:00<00:00, 38.78it/s]


{'ner': 5.2128743338351755e-08}


100%|██████████| 3/3 [00:00<00:00, 37.34it/s]


{'ner': 0.00023282940897538225}


100%|██████████| 3/3 [00:00<00:00, 35.43it/s]


{'ner': 0.005865463338858951}


100%|██████████| 3/3 [00:00<00:00, 38.17it/s]


{'ner': 4.4167216475096265e-06}


100%|██████████| 3/3 [00:00<00:00, 40.01it/s]


{'ner': 5.666833076045904e-07}


100%|██████████| 3/3 [00:00<00:00, 41.00it/s]


{'ner': 1.8496751591541665e-07}


100%|██████████| 3/3 [00:00<00:00, 39.04it/s]


{'ner': 1.2613043785630394e-07}


100%|██████████| 3/3 [00:00<00:00, 40.64it/s]


{'ner': 5.03991592264232e-08}


100%|██████████| 3/3 [00:00<00:00, 37.13it/s]


{'ner': 0.022567509378466438}


100%|██████████| 3/3 [00:00<00:00, 39.93it/s]


{'ner': 2.989123530389074e-08}


100%|██████████| 3/3 [00:00<00:00, 39.73it/s]


{'ner': 5.654469767682309e-07}


100%|██████████| 3/3 [00:00<00:00, 41.34it/s]


{'ner': 2.402832167365731e-09}


100%|██████████| 3/3 [00:00<00:00, 37.47it/s]


{'ner': 1.763104570392514e-06}


100%|██████████| 3/3 [00:00<00:00, 39.61it/s]


{'ner': 0.013535009720984368}


100%|██████████| 3/3 [00:00<00:00, 37.11it/s]


{'ner': 2.132855443563309e-06}


100%|██████████| 3/3 [00:00<00:00, 39.85it/s]


{'ner': 0.001928028177585498}


100%|██████████| 3/3 [00:00<00:00, 39.78it/s]


{'ner': 0.00041401857657590165}


100%|██████████| 3/3 [00:00<00:00, 39.81it/s]


{'ner': 8.900200552022921e-08}


100%|██████████| 3/3 [00:00<00:00, 40.85it/s]


{'ner': 7.466161807857967e-07}


100%|██████████| 3/3 [00:00<00:00, 38.56it/s]


{'ner': 4.572257409273527e-06}


100%|██████████| 3/3 [00:00<00:00, 35.88it/s]


{'ner': 0.02792520597285939}


100%|██████████| 3/3 [00:00<00:00, 38.28it/s]


{'ner': 1.4353659681998646e-07}


100%|██████████| 3/3 [00:00<00:00, 43.41it/s]


{'ner': 9.668343597387114e-08}


100%|██████████| 3/3 [00:00<00:00, 39.85it/s]


{'ner': 4.9806117450972944e-08}


100%|██████████| 3/3 [00:00<00:00, 38.03it/s]


{'ner': 0.1304367294122607}


100%|██████████| 3/3 [00:00<00:00, 37.91it/s]


{'ner': 4.422529733745852e-08}


100%|██████████| 3/3 [00:00<00:00, 40.09it/s]


{'ner': 1.3810032013754774e-09}


100%|██████████| 3/3 [00:00<00:00, 42.48it/s]


{'ner': 2.5857951718303463e-05}


100%|██████████| 3/3 [00:00<00:00, 45.07it/s]


{'ner': 1.285908143170253e-09}


100%|██████████| 3/3 [00:00<00:00, 39.51it/s]

{'ner': 5.4976370946033e-08}





In [24]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Nishanth', 'PERSON')]
Entities [('London', 'LOC'), ('Berlin', 'LOC')]
Entities [('Kamal Khumar', 'PERSON')]


### Saving the spaCy model

In [25]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to ner


In [26]:
m = spacy.load('/content/ner')