In [None]:
# Applications of Named Entity Recognition (NER) in real life:
# 1. Search
# 2. Recommendations
# 3. Customer Care

In [1]:
import spacy

# Load the "en_core_web_sm" pipeline once; the cells below all reuse `nlp`.
# The model package must already be installed for spacy.load to find it.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Names of the components in the loaded pipeline; "ner" is the one used below.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [6]:
# Run the pipeline on a sample sentence, then list each detected entity with
# its label and spaCy's human-readable explanation of that label.
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

for ent in doc.ents:
    print(f"{ent.text} | {ent.label_} | {spacy.explain(ent.label_)}")

Tesla Inc | ORG | Companies, agencies, institutions, etc.
Twitter Inc | ORG | Companies, agencies, institutions, etc.
$45 billion | MONEY | Monetary values, including unit


In [7]:
# Visualize the entities: displacy's "ent" style renders the text of `doc`
# with the detected entity spans marked up and labelled.
from spacy import displacy

displacy.render(doc, style="ent")

In [8]:
# List the entity labels that the loaded pipeline's `ner` component can assign
# (this is the label set of the loaded model, as shown in the output below).
nlp.pipe_labels['ner']
# Label reference for the English models: https://spacy.io/models/en

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [12]:
# A harder case: the same name refers to a person and to a company.
# In the saved output below, the company is tagged PERSON, i.e. the model
# gets it wrong.
# NOTE(review): that saved output reads "Bloomberg L.P", which does not occur
# in this sentence, so it looks like it came from an earlier version of the
# cell -- re-run to refresh it.
# For comparison, try a Hugging Face model on the same sentence:
# https://huggingface.co/dslim/bert-base-NER?text=Michael+Bloomberg+founded+Bloomberg+in+1982
doc = nlp("Michael Bloomberg founded Bloomberg in 1982")

for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep=" | ")

Michael Bloomberg | PERSON | People, including fictional
Bloomberg L.P | PERSON | People, including fictional
1982 | DATE | Absolute or relative dates or periods


In [13]:
# Same sentence as the first example, this time also printing where each
# entity sits in the text: start_char / end_char are character offsets.
# (Original note: go through 3 sample examples for NER here as well.)
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")
for ent in doc.ents:
    print(f"{ent.text}  |  {ent.label_}  |  {ent.start_char} | {ent.end_char}")

Tesla Inc  |  ORG  |  0 | 9
Twitter Inc  |  ORG  |  30 | 41
$45 billion  |  MONEY  |  46 | 57


In [14]:
# ========== Setting custom entities ============
# Baseline before overriding anything: with the "Inc" suffixes dropped, the
# saved output below shows Twitter labelled PRODUCT rather than ORG.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla  |  ORG
Twitter  |  PRODUCT
$45 billion  |  MONEY


In [16]:
# Indexing a Doc with a single integer yields one Token; show its type.
first_token = doc[0]
type(first_token)

spacy.tokens.token.Token

In [20]:
# Slice the tokens at indices 2 through 4 (the end index 5 is excluded).
doc[2:5]

going to acquire

In [19]:
# A slice of a Doc is a Span, not a plain list of tokens.
type(doc[2:5])

spacy.tokens.span.Span

In [23]:
from spacy.tokens import Span

# Span(doc, start, end) takes token indices; the end index is excluded.
# Tokens [0, 1) => "Tesla"
s1 = Span(doc, 0, 1, label="ORG")
# Tokens [5, 6) => "Twitter"
s2 = Span(doc, 5, 6, label="ORG")

# List every span you want to set inside the brackets. With
# default="unmodified", tokens outside these spans keep their current
# annotation ("$45 billion" is still MONEY in the next cell's output).
doc.set_ents([s1, s2], default="unmodified")

In [24]:
# List the entities again to confirm the manually set ORG spans took effect.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="  |  ")

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY
