<h2 align="center">Named Entity Recognition (NER)</h2>

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [2]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
doc = nlp("Tata Services going to acquire Zudio for $25 billion")

for ent in doc.ents:
    print(ent.text, " | ",ent.label_," | ", spacy.explain(ent.label_))

Tata Services  |  ORG  |  Companies, agencies, institutions, etc.
Zudio  |  GPE  |  Countries, cities, states
$25 billion  |  MONEY  |  Monetary values, including unit


In [6]:
from spacy import displacy

displacy.render(doc, style="ent")

In [9]:
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [12]:
doc = nlp("Michael Bloomberg founded Bloomberg Inc in 1982")

displacy.render(doc, style="ent")

<h3>Setting custom entities<h3></h3>

In [13]:
#concepts of span
doc = nlp("Tata Services going to acquire Zudio for $25 billion")

doc [2:5]


going to acquire

In [14]:
type(doc [2:5])

spacy.tokens.span.Span

In [15]:
from spacy.tokens import Span

s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

In [16]:
s1

Tata

In [17]:
s2

Zudio

In [18]:
doc.set_ents([s1, s2], default="unmodified")

In [19]:
for ent in doc.ents:
    print(ent.text, " | ",ent.label_," | ", spacy.explain(ent.label_))

Tata Services  |  ORG  |  Companies, agencies, institutions, etc.
Zudio  |  ORG  |  Companies, agencies, institutions, etc.
$25 billion  |  MONEY  |  Monetary values, including unit


In [None]:
#Build Own NER

#We use entity ruler for this

#https://spacy.io/api/entityruler/
#https://python-textbook.pythonhumanities.com/03_spacy/03_02_02_entityruler.html#introducing-complex-rules-and-variance-to-the-entityruler-advanced

In [None]:
#approch 3: machine Learning - CRF ( Condtional random Fields) - BERT

**Excersie: 1**

* Extract all the Geographical (cities, Countries, states) names from a given text

In [36]:
text = """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc = nlp(text)

In [37]:
for ent in doc.ents:
    print(ent.text, " | ",ent.label_," | ", spacy.explain(ent.label_))

Kiran  |  ORG  |  Companies, agencies, institutions, etc.
India  |  GPE  |  Countries, cities, states
Google  |  ORG  |  Companies, agencies, institutions, etc.
Google  |  ORG  |  Companies, agencies, institutions, etc.
Delhi  |  GPE  |  Countries, cities, states
Chaat  |  ORG  |  Companies, agencies, institutions, etc.
Gujarat  |  GPE  |  Countries, cities, states
Dal Dhokli  |  PERSON  |  People, including fictional
Tamilnadu  |  GPE  |  Countries, cities, states
Pongal  |  GPE  |  Countries, cities, states
Andhrapradesh  |  GPE  |  Countries, cities, states
Biryani  |  PERSON  |  People, including fictional
Assam  |  GPE  |  Countries, cities, states
Papaya Khar  |  PERSON  |  People, including fictional
Bihar  |  GPE  |  Countries, cities, states
Litti Chowkha  |  PERSON  |  People, including fictional


In [38]:
doc[45]

Pongal

In [39]:
#s1 = Span(doc, 45, 46, label="ORG")
#I need to remove ponagla as a food,

Geographical_areas= []
for ent in doc.ents:
    if ent.label_ == "GPE":
        Geographical_areas.append(ent)

Geographical_areas = [str(area) for area in Geographical_areas]
print("Geographical ares in given text:")
print(", ".join(Geographical_areas))

Geographical ares in given text:
India, Delhi, Gujarat, Tamilnadu, Pongal, Andhrapradesh, Assam, Bihar


**Excersie: 2**
* Extract all the birth dates of cricketers in the given Text

In [40]:
text = """Sachin Tendulkar was born on 24 April 1973, Virat Kholi was born on 5 November 1988, Dhoni was born on 7 July 1981
and finally Ricky ponting was born on 19 December 1974."""

doc = nlp(text)

In [41]:
for ent in doc.ents:
    print(ent.text, " | ",ent.label_," | ", spacy.explain(ent.label_))

Sachin Tendulkar  |  PERSON  |  People, including fictional
24 April 1973  |  DATE  |  Absolute or relative dates or periods
Virat Kholi  |  LOC  |  Non-GPE locations, mountain ranges, bodies of water
5 November 1988  |  DATE  |  Absolute or relative dates or periods
Dhoni  |  PERSON  |  People, including fictional
7 July 1981  |  DATE  |  Absolute or relative dates or periods
Ricky  |  PERSON  |  People, including fictional
19 December 1974  |  DATE  |  Absolute or relative dates or periods


In [43]:
date_of_births= []
for ent in doc.ents:
    if ent.label_ == "DATE":
        date_of_births.append(ent)

print("Date of Births in given text:", date_of_births)
print("Number of DOB in text", len(date_of_births))


Date of Births in given text: [24 April 1973, 5 November 1988, 7 July 1981, 19 December 1974]
Number of DOB in text 4


<h2>Creating Customized entity for Food</h2>

In [47]:
import spacy
from spacy.training import Example

# Load a pre-trained spaCy model
nlp = spacy.blank("en")

# Add named entity recognizer to the pipeline
ner = nlp.add_pipe("ner")

# Define food types as a new entity label
ner.add_label("FOOD")

1

In [49]:
# Example training data
TRAIN_DATA = [
    ("I love pizza.", {"entities": [(7, 12, "FOOD")]}),
    ("Sushi is delicious.", {"entities": [(0, 5, "FOOD")]}),
    ("Chaat.", {"entities": [(0, 5, "FOOD")]}),
    ("Dal Dhokli,", {"entities": [(0, 10, "FOOD")]}),
    ("Pongal.", {"entities": [(0, 6, "FOOD")]}),
    ("Biryani.", {"entities": [(0, 7, "FOOD")]}),
    ("Papaya Khar.", {"entities": [(0, 11, "FOOD")]}),
    ("Chowkha.", {"entities": [(0, 7, "FOOD")]}),
]

In [50]:
TRAIN_DATA

[('I love pizza.', {'entities': [(7, 12, 'FOOD')]}),
 ('Sushi is delicious.', {'entities': [(0, 5, 'FOOD')]}),
 ('Chaat.', {'entities': [(0, 5, 'FOOD')]}),
 ('Dal Dhokli,', {'entities': [(0, 10, 'FOOD')]}),
 ('Pongal.', {'entities': [(0, 6, 'FOOD')]}),
 ('Biryani.', {'entities': [(0, 7, 'FOOD')]}),
 ('Papaya Khar.', {'entities': [(0, 11, 'FOOD')]}),
 ('Chowkha.', {'entities': [(0, 7, 'FOOD')]})]

In [51]:
# Train the NER model
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    nlp.update([example], drop=0.5, losses={})


KeyError: "[E022] Could not find a transition with the name 'O' in the NER model."

In [None]:
text =  """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

In [53]:
nlp = spacy.load("en_core_web_sm")


text =  """Kiran want to know the famous foods in each state of India. So, he opened Google and search for this question. Google showed that
in Delhi it is Chaat, in Gujarat it is Dal Dhokli, in Tamilnadu it is Pongal, in Andhrapradesh it is Biryani, in Assam it is Papaya Khar,
in Bihar it is Litti Chowkha and so on for all other states"""

doc = nlp(text)

In [60]:
doc[32]

Chaat

In [63]:
doc[38:40]

Dal Dhokli

In [64]:
doc[45]

Pongal

In [66]:
doc[51]

Biryani

In [70]:
doc[57:59]

Papaya Khar

In [74]:
doc[65:67]

Litti Chowkha

In [75]:
from spacy.tokens import Span

s1 = Span(doc, 32, 33, label="FOOD")
s2 = Span(doc, 38, 40, label="FOOD")
s3 = Span(doc, 45, 46, label="FOOD")
s4 = Span(doc, 51, 52, label="FOOD")
s5 = Span(doc, 57, 59, label="FOOD")
s6 = Span(doc, 65, 67, label="FOOD")

In [76]:
doc.set_ents([s1, s2, s3, s4, s5, s6], default="unmodified")

In [77]:
for ent in doc.ents:
    print(ent.text, " | ",ent.label_," | ", spacy.explain(ent.label_))

Kiran  |  ORG  |  Companies, agencies, institutions, etc.
India  |  GPE  |  Countries, cities, states
Google  |  ORG  |  Companies, agencies, institutions, etc.
Google  |  ORG  |  Companies, agencies, institutions, etc.
Delhi  |  GPE  |  Countries, cities, states
Chaat  |  FOOD  |  None
Gujarat  |  GPE  |  Countries, cities, states
Dal Dhokli  |  FOOD  |  None
Tamilnadu  |  GPE  |  Countries, cities, states
Pongal  |  FOOD  |  None
Andhrapradesh  |  GPE  |  Countries, cities, states
Biryani  |  FOOD  |  None
Assam  |  GPE  |  Countries, cities, states
Papaya Khar  |  FOOD  |  None
Bihar  |  GPE  |  Countries, cities, states
Litti Chowkha  |  FOOD  |  None




In [78]:
Geographical_areas= []
for ent in doc.ents:
    if ent.label_ == "GPE":
        Geographical_areas.append(ent)

Geographical_areas = [str(area) for area in Geographical_areas]
print("Geographical ares in given text:")
print(", ".join(Geographical_areas))

Geographical ares in given text:
India, Delhi, Gujarat, Tamilnadu, Andhrapradesh, Assam, Bihar


Customized Named entity

https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718