## Named Entity Recognition

- A sub-task of information extraction
- Detects and classifies the named entities in unstructured text

In [None]:
#load spacy
import spacy

In [None]:
# Load the small English pipeline; the cells below use it through the name `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load English tokenizer, tagger, parser and NER
# NOTE(review): identical to the load in the previous cell; running it again only rebinds `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
import spacy.cli

# en_core_web_lg supplies the word vectors that the training run adds to the pipeline.
# Only download it when it is not installed yet, so re-running the notebook does not
# fetch the large package again.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
# Run the loaded pipeline over one sample request.
text = "Can you send a message to 876786868 with the message 'Hello, how are you?'"
doc = nlp(text)  # annotated Doc used by the following cells

In [None]:
# Syntax overview of `doc`: noun chunks and the lemmas of all tokens tagged as verbs.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrases:", noun_phrases)
print("Verbs:", verb_lemmas)

Noun phrases: ['you', 'a message', 'the message', 'you']
Verbs: ['send']


In [None]:
# List every entity the pipeline found in `doc`, one "text label" pair per line.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
for ent_text, ent_label in entity_rows:
    print(ent_text, ent_label)

### Visualization

In [None]:
from spacy import displacy

In [None]:
# Render the entities of `doc` with displaCy's "ent" style, inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)



In [None]:
# Training examples in the (text, {"entities": [(start, end, label), ...]}) format.
# start/end are character offsets into the text (end exclusive):
#   * RECIPIENT_PHONE covers the phone number including its leading "+"
#   * MESSAGE covers the quoted text without the surrounding quote characters
train = [
    ("Send a message to +15551234567 with the message 'Hello'", {"entities": [(18, 30, "RECIPIENT_PHONE"), (49, 54, "MESSAGE")]}),
    ("Send a message to +15551234567 with the message: 'Hello'", {"entities": [(18, 30, "RECIPIENT_PHONE"), (50, 55, "MESSAGE")]}),
    # Fixed: (20, 36) included the opening quote and cut off the trailing "?".
    ("Text +447891234567: 'Can you call me?'", {"entities": [(5, 18, "RECIPIENT_PHONE"), (21, 37, "MESSAGE")]}),
    # Fixed: (23, 36) started in the middle of the message.
    ("Text +447891234567 'Can you call me?'", {"entities": [(5, 18, "RECIPIENT_PHONE"), (20, 36, "MESSAGE")]}),
    ("Send a text to +919876543210: 'I'll be there soon'", {"entities": [(15, 28, "RECIPIENT_PHONE"), (31, 49, "MESSAGE")]}),
    ("Send a text to +919876543210 'I'll be there soon'", {"entities": [(15, 28, "RECIPIENT_PHONE"), (30, 48, "MESSAGE")]}),
    ("Send the message 'Do you have any plans tonight?' to +336987654321.", {"entities": [(18, 48, "MESSAGE"), (53, 66, "RECIPIENT_PHONE")]}),
    ("Send the message: 'Do you have any plans tonight?' to +336987654321.", {"entities": [(19, 49, "MESSAGE"), (54, 67, "RECIPIENT_PHONE")]}),
    ("Text +819876543210: 'Remember to bring your ID.'", {"entities": [(5, 18, "RECIPIENT_PHONE"), (21, 47, "MESSAGE")]}),
    ("Text +819876543210 'Remember to bring your ID.'", {"entities": [(5, 18, "RECIPIENT_PHONE"), (20, 46, "MESSAGE")]}),
    ("Send a message to +4987654321098 with the content: 'Don't forget to buy milk.'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (52, 77, "MESSAGE")]}),
    ("Send a message to +4987654321098 with the content 'Don't forget to buy milk.'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (51, 76, "MESSAGE")]}),
    ("Send a message to +5551987654321: 'I need your help.'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (35, 52, "MESSAGE")]}),
    # Fixed: (18, 34) extended past the number into the space and opening quote.
    ("Send a message to +5551987654321 'I need your help.'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (34, 51, "MESSAGE")]}),
    # NOTE(review): this text has no closing quote; left as-is, the offsets match the text.
    ("Text +447612345678 and ask: 'Have you received the package?", {"entities": [(5, 18, "RECIPIENT_PHONE"), (29, 59, "MESSAGE")]}),
    ("Send a message to +6176543210987 saying 'Congratulations!'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (41, 57, "MESSAGE")]}),
    ("Send the message 'Can we meet tomorrow?' to +639987654321.", {"entities": [(18, 39, "MESSAGE"), (44, 57, "RECIPIENT_PHONE")]}),
    ("Please text +27123456789", {"entities": [(12, 24, "RECIPIENT_PHONE")]}),
    ("Text +441234567890: 'Let's grab dinner tonight.'", {"entities": [(5, 18, "RECIPIENT_PHONE"), (21, 47, "MESSAGE")]}),
    ("Send a message to +4912345678901 with the text 'Sorry for the delay.'", {"entities": [(18, 32, "RECIPIENT_PHONE"), (48, 68, "MESSAGE")]}),
]
# Print the updated training data
print(train)


[("Send a message to +15551234567 with the message 'Hello'", {'entities': [(18, 30, 'RECIPIENT_PHONE'), (49, 54, 'MESSAGE')]}), ("Send a message to +15551234567 with the message: 'Hello'", {'entities': [(18, 30, 'RECIPIENT_PHONE'), (50, 55, 'MESSAGE')]}), ("Text +447891234567: 'Can you call me?'", {'entities': [(5, 18, 'RECIPIENT_PHONE'), (20, 36, 'MESSAGE')]}), ("Text +447891234567 'Can you call me?'", {'entities': [(5, 18, 'RECIPIENT_PHONE'), (23, 36, 'MESSAGE')]}), ("Send a text to +919876543210: 'I'll be there soon'", {'entities': [(15, 28, 'RECIPIENT_PHONE'), (31, 49, 'MESSAGE')]}), ("Send a text to +919876543210 'I'll be there soon'", {'entities': [(15, 28, 'RECIPIENT_PHONE'), (30, 48, 'MESSAGE')]}), ("Send the message 'Do you have any plans tonight?' to +336987654321.", {'entities': [(18, 48, 'MESSAGE'), (53, 66, 'RECIPIENT_PHONE')]}), ("Send the message: 'Do you have any plans tonight?' to +336987654321.", {'entities': [(19, 49, 'MESSAGE'), (54, 67, 'RECIPIENT_PHONE')]}), ("Tex

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from spacy.tokens import DocBin

db = DocBin()  # container that collects the annotated Doc objects for serialisation

for text, annot in tqdm(train):  # items are (text, {"entities": [(start, end, label), ...]})
    doc = nlp.make_doc(text)  # create a Doc from the raw text
    ents = []
    for start, end, label in annot["entities"]:  # character offsets into `text`
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so its offsets can be corrected.
            print(f"Skipping entity {label} ({start}, {end}) in {text!r}: offsets do not map to any token")
            continue
        if (span.start_char, span.end_char) != (start, end):
            # "contract" moved the boundaries; surface it instead of silently
            # training on a different span than the one annotated.
            print(f"Adjusted entity {label} ({start}, {end}) -> ({span.start_char}, {span.end_char}) in {text!r}")
        ents.append(span)
    doc.ents = ents  # label the text with the ents
    db.add(doc)

db.to_disk("./train.spacy")  # save the DocBin; the training command reads this file

100%|██████████| 20/20 [00:00<00:00, 770.86it/s]


In [None]:
# Expand base_config.cfg into a complete training config and save it as config.cfg.
# NOTE(review): no cell shown here creates base_config.cfg; presumably it has to be
# placed in the working directory beforehand. Confirm.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train from config.cfg and write the resulting pipelines to ./output.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the scores in
# the training table are measured on the training examples themselves, not held-out data.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-05-26 15:29:15,405] [INFO] Set up nlp object from config
[2023-05-26 15:29:15,423] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-05-26 15:29:15,428] [INFO] Created vocabulary
[2023-05-26 15:29:19,029] [INFO] Added vectors: en_core_web_lg
[2023-05-26 15:29:22,746] [INFO] Finished initializing nlp object
[2023-05-26 15:29:23,127] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     52.00    0.00    0.00    0.00    0.00
 78     200          3.86    762.26  100.00  100.00  100.00    1.00
178     400          0.00      0.00  100.00  100.00  100.00    1.00
278     600    

In [None]:
# Load the best pipeline saved under the training output directory.
BEST_MODEL_DIR = r"./output/model-best"
nlp1 = spacy.load(BEST_MODEL_DIR)

# Run the trained pipeline on a sample request.
sample_text = "Can you send a message to 8769786868 with the text 'Hello, how are you?'"
doc = nlp1(sample_text)

# Last expression of the cell: display the predicted entity spans.
doc.ents

(you send a message to 8769786868 with the text, Hello, how are you?)

In [None]:
# List every predicted entity in `doc`, one "text label" pair per line.
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]
for ent_text, ent_label in entity_rows:
    print(ent_text, ent_label)

you send a message to 8769786868 with the text MESSAGE
Hello, how are you? MESSAGE


In [None]:
# Probe the trained pipeline with a request that starts with "Send a message to ...".
doc = nlp1("Send a message to 8769786868 with the message 'Hello, how are you?'")  # input sample text

# Print each predicted entity with its label. The bare `doc.ents` that used to sit
# here was dropped: only a cell's last expression is displayed, so it had no effect.
for entity in doc.ents:
    print(entity.text, entity.label_)

8769786868 RECIPIENT_PHONE
Hello, how are you? MESSAGE


In [None]:
# Probe the trained pipeline with the message placed before the phone number.
# NOTE(review): "Send a the message" looks like a typo in the probe sentence; it is left
# unchanged so the recorded output below still corresponds to this input.
doc = nlp1("Send a the message 'Hello, how are you?' to 6789433789")  # input sample text

# Print each predicted entity with its label. The bare `doc.ents` that used to sit
# here was dropped: only a cell's last expression is displayed, so it had no effect.
for entity in doc.ents:
    print(entity.text, entity.label_)

Hello, how are you? MESSAGE
to 6789433789 MESSAGE


In [None]:
# Probe with a sentence that also appears verbatim in the training data, so a correct
# result here does not show how the model behaves on unseen text.
doc = nlp1("Send a message to +15551234567 with the message: 'Hello'")  # input sample text

# Print each predicted entity with its label. The bare `doc.ents` that used to sit
# here was dropped: only a cell's last expression is displayed, so it had no effect.
for entity in doc.ents:
    print(entity.text, entity.label_)


+15551234567 RECIPIENT_PHONE
Hello MESSAGE


In [None]:
# Archive the trained model-best directory into model2.zip.
# NOTE(review): uses absolute /content paths while training wrote to the relative
# ./output; this presumably relies on the working directory being /content. Confirm.
!zip -r /content/model2.zip /content/output/model-best

  adding: content/output/model-best/ (stored 0%)
  adding: content/output/model-best/tokenizer (deflated 81%)
  adding: content/output/model-best/vocab/ (stored 0%)
  adding: content/output/model-best/vocab/vectors (deflated 8%)
  adding: content/output/model-best/vocab/strings.json (deflated 77%)
  adding: content/output/model-best/vocab/vectors.cfg (stored 0%)
  adding: content/output/model-best/vocab/lookups.bin (stored 0%)
  adding: content/output/model-best/vocab/key2row (deflated 16%)
  adding: content/output/model-best/tok2vec/ (stored 0%)
  adding: content/output/model-best/tok2vec/cfg (stored 0%)
  adding: content/output/model-best/tok2vec/model (deflated 8%)
  adding: content/output/model-best/ner/ (stored 0%)
  adding: content/output/model-best/ner/moves (deflated 55%)
  adding: content/output/model-best/ner/cfg (deflated 33%)
  adding: content/output/model-best/ner/model (deflated 8%)
  adding: content/output/model-best/config.cfg (deflated 60%)
  adding: content/output/mod

In [None]:
# Colab-only: hand the zipped model created in the previous cell to the browser as a download.
from google.colab import files
files.download("/content/model2.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>