**<center><font size = "5">Create custom NER model<center>**
***


In [None]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy)
  Downloading thinc-8.2.1-cp310-cp310-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsl

In [None]:
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.0/en_core_web_lg-3.7.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
from __future__ import unicode_literals, print_function

import pandas as pd
import os, time

from pathlib import Path
from tqdm import tqdm 
from spacy.training import Example
from spacy import displacy
import spacy
import base64

### NER pipeline

In [None]:
nlpSpacy = spacy.load("en_core_web_lg")
 
print(nlpSpacy.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
file = open('sampletext.txt')
text = file.read()
file.close()
doc = nlpSpacy(text)
displacy.render(doc, style="ent", jupyter=True)

In [None]:
for entity in doc.ents:
  print(f"Name of the entity {entity.text} type is {entity.label_}")

Name of the entity Electric type is ORG
Name of the entity Tesla type is ORG
Name of the entity Super Bowl type is EVENT
Name of the entity Sunday type is DATE
Name of the entity Autopilot type is ORG
Name of the entity Tesla type is ORG
Name of the entity Model S type is PRODUCT
Name of the entity Tesla type is ORG
Name of the entity Palo Alto type is GPE
Name of the entity California type is GPE
Name of the entity October 14, 2015 type is DATE
Name of the entity Tesla type is ORG
Name of the entity DOJ type is ORG
Name of the entity Autopilot type is ORG
Name of the entity Washington type is GPE
Name of the entity DC type is GPE
Name of the entity Austin type is GPE
Name of the entity Tallahassee type is GPE
Name of the entity Albany type is GPE
Name of the entity Atlanta type is GPE
Name of the entity Sacramento type is GPE
Name of the entity Tesla type is ORG
Name of the entity multimillion dollar type is MONEY
Name of the entity The Dawn Project type is ORG
Name of the entity Dan 

## Annotate the text for NER 

Navigate to 

https://tecoholic.github.io/ner-annotator/

and make all the custom NER and annotate the entities on the text one by one

## Make a Spacy custom NER MODEL

In [None]:
import json
 
with open('annotations.json', 'r') as f:
    data = json.load(f)
    
print(data['annotations'][0])

['Electric carmaker Tesla will face a hit on Super Bowl Sunday, when an ad will play showing the alleged dangers of its Full Self-Driving technology.', {'entities': [[0, 17, 'SUPPLIER'], [18, 23, 'ORG'], [43, 53, 'EVENT'], [54, 60, 'DAY'], [123, 135, 'FEATURE']]}]


In [None]:
data['classes']

['ORG',
 'PRODUCT',
 'MONEY',
 'TECHNOLOGY',
 'EVENT',
 'DAY',
 'LOCATION',
 'DATE',
 'DEPARTMENT',
 'FEATURE',
 'PERSON',
 'ROLE',
 'OBJECT',
 'PROJECT',
 'NUMBER',
 'CUSTOMERS',
 'BLOGS',
 'SUPPLIER']

## Create empty model

In [None]:
training_data = data.copy()
training_data['classes'] = data['classes']
training_data['annotations'] = []
for text, annotation in data['annotations']:
    try:
        if text!="" and len(annotation['entities']) > 0:
            temp_dict = {}
            temp_dict['text'] = text
            temp_dict['entities'] = []
            for ent in annotation['entities']:
                start = ent[0]
                end = ent[1]
                label = ent[2].upper()
                temp_dict['entities'].append((start, end, label))
        training_data['annotations'].append(temp_dict)
    except:
        pass
print(training_data)

{'classes': ['ORG', 'PRODUCT', 'MONEY', 'TECHNOLOGY', 'EVENT', 'DAY', 'LOCATION', 'DATE', 'DEPARTMENT', 'FEATURE', 'PERSON', 'ROLE', 'OBJECT', 'PROJECT', 'NUMBER', 'CUSTOMERS', 'BLOGS', 'SUPPLIER'], 'annotations': [{'text': 'Electric carmaker Tesla will face a hit on Super Bowl Sunday, when an ad will play showing the alleged dangers of its Full Self-Driving technology.', 'entities': [(0, 17, 'SUPPLIER'), (18, 23, 'ORG'), (43, 53, 'EVENT'), (54, 60, 'DAY'), (123, 135, 'FEATURE')]}, {'text': 'The Autopilot features demonstrated in a Tesla Model S during a Tesla event in Palo Alto, California October 14, 2015. ', 'entities': [(4, 13, 'FEATURE'), (41, 46, 'ORG'), (47, 54, 'PRODUCT'), (64, 69, 'ORG'), (79, 88, 'LOCATION'), (90, 100, 'LOCATION'), (101, 117, 'DATE')]}, {'text': "Tesla confirms DOJ has requested documents on Autopilot, 'Full Self-Driving'", 'entities': [(0, 5, 'ORG'), (15, 18, 'DEPARTMENT'), (46, 55, 'FEATURE'), (57, 75, 'FEATURE')]}, {'text': 'The commercial, which will be a

### Configuration variables

In [None]:
modelSpacy = None
n_iter=100

### Load the model

In [None]:
if modelSpacy is not None:
    nlp = spacy.load(modelSpacy)  
    print("Loaded model '%s'" % modelSpacy)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

Created blank 'en' model


### Set up pipeline

In [None]:
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

### Train the NER model

In [None]:
#getting all the entities
for annotations in training_data["annotations"]:
    for ent in annotations['entities']:
        ner.add_label(ent[2])
        
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        losses = {}
        for text, annotations in tqdm(data["annotations"]):
            try:
                if text!="":
                    example = Example.from_dict(nlp.make_doc(text), annotations)
                    #Update the model
                    nlp.update(
                        [example],  
                        drop=0.5,  
                        sgd=optimizer,
                        losses=losses)
            except:
                pass
        print(losses)


100%|██████████| 25/25 [00:01<00:00, 19.85it/s]


{'ner': 748.4617953865836}


100%|██████████| 25/25 [00:01<00:00, 17.51it/s]


{'ner': 295.5354982531711}


100%|██████████| 25/25 [00:01<00:00, 19.32it/s]


{'ner': 276.219936227375}


100%|██████████| 25/25 [00:00<00:00, 25.96it/s]


{'ner': 254.6758886459678}


100%|██████████| 25/25 [00:00<00:00, 27.04it/s]


{'ner': 290.75711990978016}


100%|██████████| 25/25 [00:00<00:00, 25.91it/s]


{'ner': 227.48829230972888}


100%|██████████| 25/25 [00:00<00:00, 27.14it/s]


{'ner': 230.04325941953442}


100%|██████████| 25/25 [00:00<00:00, 25.61it/s]


{'ner': 236.4903372984402}


100%|██████████| 25/25 [00:00<00:00, 27.21it/s]


{'ner': 215.66293260127426}


100%|██████████| 25/25 [00:00<00:00, 26.14it/s]


{'ner': 220.82507084895417}


100%|██████████| 25/25 [00:00<00:00, 26.53it/s]


{'ner': 209.6194882925077}


100%|██████████| 25/25 [00:00<00:00, 26.25it/s]


{'ner': 199.7653460741328}


100%|██████████| 25/25 [00:00<00:00, 26.34it/s]


{'ner': 213.5018476710399}


100%|██████████| 25/25 [00:01<00:00, 18.52it/s]


{'ner': 203.07543132857109}


100%|██████████| 25/25 [00:01<00:00, 17.89it/s]


{'ner': 221.4873628444826}


100%|██████████| 25/25 [00:01<00:00, 22.97it/s]


{'ner': 210.0257306688189}


100%|██████████| 25/25 [00:00<00:00, 26.04it/s]


{'ner': 196.34271177628284}


100%|██████████| 25/25 [00:00<00:00, 25.83it/s]


{'ner': 189.43689078876375}


100%|██████████| 25/25 [00:00<00:00, 27.03it/s]


{'ner': 196.69654884490552}


100%|██████████| 25/25 [00:00<00:00, 26.17it/s]


{'ner': 210.56067387696152}


100%|██████████| 25/25 [00:00<00:00, 27.43it/s]


{'ner': 181.4064514204245}


100%|██████████| 25/25 [00:00<00:00, 26.55it/s]


{'ner': 167.7858475694419}


100%|██████████| 25/25 [00:00<00:00, 26.18it/s]


{'ner': 171.40302146127684}


100%|██████████| 25/25 [00:00<00:00, 26.35it/s]


{'ner': 205.880364754025}


100%|██████████| 25/25 [00:00<00:00, 26.52it/s]


{'ner': 162.44772163482813}


100%|██████████| 25/25 [00:01<00:00, 22.74it/s]


{'ner': 184.349373530244}


100%|██████████| 25/25 [00:01<00:00, 18.16it/s]


{'ner': 165.72889799107642}


100%|██████████| 25/25 [00:01<00:00, 17.87it/s]


{'ner': 148.17040467299324}


100%|██████████| 25/25 [00:00<00:00, 26.93it/s]


{'ner': 183.70820573867584}


100%|██████████| 25/25 [00:00<00:00, 26.56it/s]


{'ner': 175.8970582360657}


100%|██████████| 25/25 [00:00<00:00, 26.71it/s]


{'ner': 194.53793988806507}


100%|██████████| 25/25 [00:00<00:00, 25.60it/s]


{'ner': 204.6579812340621}


100%|██████████| 25/25 [00:00<00:00, 26.89it/s]


{'ner': 131.9276519289242}


100%|██████████| 25/25 [00:00<00:00, 26.15it/s]


{'ner': 170.5381959736376}


100%|██████████| 25/25 [00:00<00:00, 26.64it/s]


{'ner': 163.329497571991}


100%|██████████| 25/25 [00:00<00:00, 26.09it/s]


{'ner': 140.81443263557742}


100%|██████████| 25/25 [00:00<00:00, 26.97it/s]


{'ner': 130.48414428250084}


100%|██████████| 25/25 [00:00<00:00, 26.00it/s]


{'ner': 124.0642152491898}


100%|██████████| 25/25 [00:01<00:00, 19.98it/s]


{'ner': 120.24457663628608}


100%|██████████| 25/25 [00:01<00:00, 16.94it/s]


{'ner': 119.20747507676313}


100%|██████████| 25/25 [00:01<00:00, 20.94it/s]


{'ner': 132.75761403059818}


100%|██████████| 25/25 [00:01<00:00, 24.84it/s]


{'ner': 143.4504506339367}


100%|██████████| 25/25 [00:00<00:00, 25.71it/s]


{'ner': 144.53147978878675}


100%|██████████| 25/25 [00:00<00:00, 26.37it/s]


{'ner': 107.07568383302973}


100%|██████████| 25/25 [00:00<00:00, 26.55it/s]


{'ner': 115.00102487710323}


100%|██████████| 25/25 [00:00<00:00, 26.54it/s]


{'ner': 131.69571948372229}


100%|██████████| 25/25 [00:00<00:00, 26.39it/s]


{'ner': 140.098211457698}


100%|██████████| 25/25 [00:00<00:00, 25.90it/s]


{'ner': 106.27626000547103}


100%|██████████| 25/25 [00:00<00:00, 26.38it/s]


{'ner': 109.69660262030654}


100%|██████████| 25/25 [00:00<00:00, 26.44it/s]


{'ner': 100.96991943410225}


100%|██████████| 25/25 [00:00<00:00, 25.97it/s]


{'ner': 116.84049077383524}


100%|██████████| 25/25 [00:01<00:00, 17.90it/s]


{'ner': 137.84617577510744}


100%|██████████| 25/25 [00:01<00:00, 17.47it/s]


{'ner': 111.35589389784579}


100%|██████████| 25/25 [00:00<00:00, 25.62it/s]


{'ner': 96.9012030959204}


100%|██████████| 25/25 [00:00<00:00, 25.75it/s]


{'ner': 101.61122999578954}


100%|██████████| 25/25 [00:00<00:00, 26.43it/s]


{'ner': 140.87886136086453}


100%|██████████| 25/25 [00:00<00:00, 25.27it/s]


{'ner': 128.7502758077796}


100%|██████████| 25/25 [00:00<00:00, 26.07it/s]


{'ner': 104.70618897997515}


100%|██████████| 25/25 [00:00<00:00, 27.11it/s]


{'ner': 105.10660676420456}


100%|██████████| 25/25 [00:00<00:00, 25.76it/s]


{'ner': 83.46770693323508}


100%|██████████| 25/25 [00:00<00:00, 26.42it/s]


{'ner': 77.83148853867121}


100%|██████████| 25/25 [00:01<00:00, 23.96it/s]


{'ner': 86.9477299718239}


100%|██████████| 25/25 [00:00<00:00, 26.15it/s]


{'ner': 106.38475004122571}


100%|██████████| 25/25 [00:01<00:00, 19.01it/s]


{'ner': 91.34603008718986}


100%|██████████| 25/25 [00:01<00:00, 16.96it/s]


{'ner': 94.53451296860365}


100%|██████████| 25/25 [00:01<00:00, 21.72it/s]


{'ner': 98.97622944613329}


100%|██████████| 25/25 [00:00<00:00, 25.59it/s]


{'ner': 75.50563742258412}


100%|██████████| 25/25 [00:00<00:00, 26.84it/s]


{'ner': 80.92290478465307}


100%|██████████| 25/25 [00:00<00:00, 26.10it/s]


{'ner': 85.80531138515774}


100%|██████████| 25/25 [00:00<00:00, 26.45it/s]


{'ner': 77.90757617392256}


100%|██████████| 25/25 [00:00<00:00, 25.76it/s]


{'ner': 97.65459656756312}


100%|██████████| 25/25 [00:00<00:00, 26.50it/s]


{'ner': 84.1426615833286}


100%|██████████| 25/25 [00:00<00:00, 25.60it/s]


{'ner': 84.82280675814505}


100%|██████████| 25/25 [00:00<00:00, 26.16it/s]


{'ner': 85.10677967967452}


100%|██████████| 25/25 [00:00<00:00, 26.08it/s]


{'ner': 70.51353634915854}


100%|██████████| 25/25 [00:01<00:00, 22.58it/s]


{'ner': 59.2100855173632}


100%|██████████| 25/25 [00:01<00:00, 17.12it/s]


{'ner': 91.02937921234323}


100%|██████████| 25/25 [00:01<00:00, 18.34it/s]


{'ner': 64.99044893658254}


100%|██████████| 25/25 [00:00<00:00, 26.41it/s]


{'ner': 63.47271730007837}


100%|██████████| 25/25 [00:00<00:00, 26.56it/s]


{'ner': 71.00780687030141}


100%|██████████| 25/25 [00:00<00:00, 26.22it/s]


{'ner': 58.30531304523572}


100%|██████████| 25/25 [00:00<00:00, 25.93it/s]


{'ner': 57.6053961959631}


100%|██████████| 25/25 [00:00<00:00, 25.17it/s]


{'ner': 74.45103866132258}


100%|██████████| 25/25 [00:00<00:00, 26.21it/s]


{'ner': 54.22775365158326}


100%|██████████| 25/25 [00:00<00:00, 25.91it/s]


{'ner': 56.8708003373803}


100%|██████████| 25/25 [00:00<00:00, 25.96it/s]


{'ner': 64.34052769627421}


100%|██████████| 25/25 [00:00<00:00, 25.51it/s]


{'ner': 73.51520646394404}


100%|██████████| 25/25 [00:00<00:00, 26.44it/s]


{'ner': 78.83168242929669}


100%|██████████| 25/25 [00:01<00:00, 18.22it/s]


{'ner': 54.867912250302325}


100%|██████████| 25/25 [00:01<00:00, 17.27it/s]


{'ner': 63.26570031671061}


100%|██████████| 25/25 [00:01<00:00, 22.70it/s]


{'ner': 68.75163914068843}


100%|██████████| 25/25 [00:00<00:00, 25.67it/s]


{'ner': 67.50204811609842}


100%|██████████| 25/25 [00:00<00:00, 26.26it/s]


{'ner': 64.30260759080613}


100%|██████████| 25/25 [00:00<00:00, 25.66it/s]


{'ner': 56.38713854686439}


100%|██████████| 25/25 [00:00<00:00, 27.05it/s]


{'ner': 70.53448033334185}


100%|██████████| 25/25 [00:00<00:00, 25.99it/s]


{'ner': 69.67390552577537}


100%|██████████| 25/25 [00:00<00:00, 26.62it/s]


{'ner': 58.94006857365791}


100%|██████████| 25/25 [00:00<00:00, 25.69it/s]


{'ner': 59.07130119329136}


100%|██████████| 25/25 [00:00<00:00, 26.05it/s]


{'ner': 49.040806157288465}


100%|██████████| 25/25 [00:00<00:00, 26.20it/s]

{'ner': 53.742256340147144}





### Test the trained model

In [None]:
for annotations in training_data["annotations"]:
    doc = nlp(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Electric carmaker', 'SUPPLIER'), ('Tesla', 'ORG'), ('Super Bowl', 'EVENT'), ('Sunday', 'DAY'), ('Self-Driving', 'FEATURE')]
Tokens [('Electric', 'SUPPLIER', 3), ('carmaker', 'SUPPLIER', 1), ('Tesla', 'ORG', 3), ('will', '', 2), ('face', '', 2), ('a', '', 2), ('hit', '', 2), ('on', '', 2), ('Super', 'EVENT', 3), ('Bowl', 'EVENT', 1), ('Sunday', 'DAY', 3), (',', '', 2), ('when', '', 2), ('an', '', 2), ('ad', '', 2), ('will', '', 2), ('play', '', 2), ('showing', '', 2), ('the', '', 2), ('alleged', '', 2), ('dangers', '', 2), ('of', '', 2), ('its', '', 2), ('Full', '', 2), ('Self', 'FEATURE', 3), ('-', 'FEATURE', 1), ('Driving', 'FEATURE', 1), ('technology', '', 2), ('.', '', 2)]
Entities [('Autopilot', 'FEATURE'), ('Tesla', 'ORG'), ('Model S', 'PRODUCT'), ('Tesla', 'ORG'), ('Palo Alto', 'LOCATION'), ('California', 'LOCATION'), ('October 14, 2015', 'DATE')]
Tokens [('The', '', 2), ('Autopilot', 'FEATURE', 3), ('features', '', 2), ('demonstrated', '', 2), ('in', '', 2), ('a', ''

### Save the cusom NER model

In [None]:
output_dir= os.path.abspath(os.getcwd())
nlp.to_disk(output_dir)
print("Saved model to", output_dir) 

Saved model to /content


### Test the saved custom model

In [None]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
for annotations in training_data["annotations"][3:5]:
    doc = nlp2(annotations['text'])
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
    print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Loading from /content
Entities [('Washington', 'LOCATION'), ('DC', 'LOCATION'), ('Austin', 'LOCATION'), ('Tallahassee', 'LOCATION'), ('Albany', 'LOCATION'), ('Atlanta', 'LOCATION'), ('Sacramento', 'LOCATION'), ('Tesla', 'ORG'), ('best light.', 'FEATURE'), ('multimillion dollar', 'MONEY'), ('advertising', 'DEPARTMENT'), ('Dawn Project.', 'DEPARTMENT'), ('Dan O’Dowd', 'PERSON'), ('California', 'LOCATION'), ('CEO', 'ROLE'), ('US Senate', 'DEPARTMENT')]
Tokens [('The', '', 2), ('commercial', '', 2), (',', '', 2), ('which', '', 2), ('will', '', 2), ('be', '', 2), ('aired', '', 2), ('in', '', 2), ('Washington', 'LOCATION', 3), (',', '', 2), ('DC', 'LOCATION', 3), (',', '', 2), ('Austin', 'LOCATION', 3), (',', '', 2), ('Tallahassee', 'LOCATION', 3), (',', '', 2), ('Albany', 'LOCATION', 3), (',', '', 2), ('Atlanta', 'LOCATION', 3), ('and', '', 2), ('Sacramento', 'LOCATION', 3), ('does', '', 2), ('not', '', 2), ('paint', '', 2), ('Tesla', 'ORG', 3), ('in', '', 2), ('the', '', 2), ('best', 'FEAT

In [None]:
file = open('sampletext.txt')
text = file.read()
file.close()
nlp2 = spacy.load(output_dir)
doc = nlp2(text)
displacy.render(doc, style="ent", jupyter=True)

In [None]:
with open('Restaurant_Reviews.tsv', 'r') as file:
    lines = file.readlines()

lines.pop(0)
data = []
for line in lines:
    text, label = line.strip().split('\t')
    data.append((text, int(label)))

In [None]:
df = pd.DataFrame(data, columns=['text', 'label'])

# Check the first few rows of the DataFrame
print(df.head())

                                                text  label
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


In [None]:
train_x = df.text

In [None]:
train_x.to_csv('output.txt', sep='\t', index=False, header=False)

In [None]:
file = open('output.txt')
text = file.read()
file.close()
doc = nlpSpacy(text)
displacy.render(doc, style="ent", jupyter=True)