<a href="https://colab.research.google.com/github/SarthakKeshari/LearningNLP/blob/main/Named_Entity_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the 'en_core_web_sm' spaCy model; `nlp` is the callable the cells below use to parse text.
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
#Write a function to display basic entity info:
def show_ents(doc, text_width=10, label_width=5):
  """Print one line per named entity found in ``doc``.

  Each line has the form ``<text> - <label> - <explanation>``, where the
  explanation is whatever ``spacy.explain`` returns for the entity label.
  If ``doc.ents`` is empty, a single 'No named entity found' line is printed.

  Args:
    doc: Object exposing an ``ents`` sequence whose items have ``text`` and
      ``label_`` attributes (in this notebook, the result of ``nlp(...)``).
    text_width: Minimum column width used to pad the entity text.
      Defaults to 10, the value that used to be hard-coded.
    label_width: Minimum column width used to pad the entity label.
      Defaults to 5, the value that used to be hard-coded.

  Returns:
    None. All output is written with ``print``.
  """
  if doc.ents:
    for ent in doc.ents:
      # The widths are minimums: longer values are printed in full and push
      # the following columns to the right, so pass larger widths to keep
      # the columns aligned for long entities or labels.
      print(f'{ent.text:{text_width}} - {ent.label_:{label_width}} - {spacy.explain(ent.label_)}')
  else:
    print('No named entity found')

In [6]:
# A sentence for which the model reports no entities (see the output below).
doc = nlp('Hello, My name is Sarthak')

show_ents(doc)

No named entity found


In [7]:
# A sentence containing a country name and a year.
doc = nlp('Hello, I am in India since 1999')

show_ents(doc)

India      - GPE   - Countries, cities, states
1999       - DATE  - Absolute or relative dates or periods


In [14]:
# NOTE(review): 'busy' looks like a typo for 'buy'; the string is left as-is
# because the character offsets printed by this cell depend on the exact text.
doc = nlp(u'Can I please borrow Rs. 500 from Jim to busy some Google stock?')

# For each entity print: text, start/end token index, start/end character
# offset, and the entity label.
for ent in doc.ents:
  print(ent.text,ent.start,ent.end,ent.start_char,ent.end_char,ent.label_)

500 6 7 24 27 CARDINAL
Jim 8 9 33 36 PERSON
Google 12 13 50 56 ORG


In [15]:
# Adding a named entity to a Span

# First show what the model finds on its own for this sentence.
doc = nlp(u'Sarthak to build an Indian factory for $20000')

show_ents(doc)

Indian     - NORP  - Nationalities or religious or political groups
20000      - MONEY - Monetary values, including unit


In [18]:
from spacy.tokens import Span

# Get the hash value of the PERSON entity label
NAME = doc.vocab.strings[u'PERSON']

# Create a Span for the new entity covering token range 0-1 (the first token of doc)
new_ent = Span(doc,0,1,label=NAME)

# Add the entity to the existing Doc object, keeping the entities already present
doc.ents = list(doc.ents) + [new_ent]

show_ents(doc)

Sarthak    - PERSON - People, including fictional
Indian     - NORP  - Nationalities or religious or political groups
20000      - MONEY - Monetary values, including unit


In [19]:
#Adding Named Entities to All Matching Spans

# NOTE(review): the two adjacent string literals are concatenated with no
# space between them, so the text reads "...cleaner.If successful..." - confirm
# whether that is intended.
doc = nlp(u'Our company plans to introduce a new vacuum cleaner.'
u'If successful, the vacuum cleaner will be our first product')

show_ents(doc)

first      - ORDINAL - "first", "second", etc.


In [23]:
# Import PhraseMatcher and create matcher object

from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create the desired phrase patterns: each phrase string is passed through nlp,
# so the matcher is given the parsed objects rather than raw strings

phrase_list = ['vacuum cleaner','vacuum-cleaner']
phrase_patterns = [nlp(text) for text in phrase_list]

In [24]:
# Apply patterns to our matcher object under the key 'newproduct'
# NOTE(review): the positional None is presumably the on_match callback slot of
# the older add() signature - confirm against the installed spaCy version.

matcher.add('newproduct',None,*phrase_patterns)

# Apply the matcher to our doc object; each match is a 3-tuple whose second and
# third items are used later as the span bounds

matches = matcher(doc)

matches

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]

In [25]:
# Turn every phrase match into a PRODUCT-labelled Span and append those spans
# to the entities already present on the doc

from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT']

# Each match is a 3-tuple; only the second and third items (span bounds) are needed
new_ents = [Span(doc,start,end,label=PROD) for _,start,end in matches]

doc.ents = [*doc.ents, *new_ents]

show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first      - ORDINAL - "first", "second", etc.


In [26]:
# Counting Entities

# Two monetary amounts appear in this sentence: one in digits, one in words.
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

show_ents(doc)

29.50      - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


In [27]:
# Number of entities in doc whose label is MONEY
sum(1 for ent in doc.ents if ent.label_=='MONEY')

2

In [33]:
# Similar text, now with a newline character between the two sentences.
doc = nlp(u'Originally priced at $29.5.\n the sweater was marked down to five dollars.')

show_ents(doc)

# Count of entities labelled MONEY
len([ent for ent in doc.ents if ent.label_=='MONEY'])

29.5       - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


2

In [34]:
#Quick function to remove ents formed on whitespaces

def remove_whitespace_entities(doc):
  """Drop entities whose text consists only of whitespace.

  Reassigns ``doc.ents`` to a list of the remaining entities (original order
  preserved) and returns the same ``doc`` object.
  """
  kept = []
  for span in doc.ents:
    if span.text.isspace():
      continue
    kept.append(span)
  doc.ents = kept
  return doc

#Insert this into the pipeline AFTER the ner component.
#Guarded so that re-running this cell does not try to register the same
#component name a second time (add_pipe raises when the name already exists).
#NOTE(review): passing the function object directly looks like the older
#add_pipe calling convention - confirm against the installed spaCy version.
if 'remove_whitespace_entities' not in nlp.pipe_names:
  nlp.add_pipe(remove_whitespace_entities,after='ner')

In [35]:
# Parse the newline example again, now that the whitespace filter has been added to the pipeline.
doc = nlp(u'Originally priced at $29.5.\n the sweater was marked down to five dollars.')

show_ents(doc)

29.5       - MONEY - Monetary values, including unit
five dollars - MONEY - Monetary values, including unit


NOUN CHUNKS

In [36]:
doc = nlp(u'Autonomous cars shift insurance liability toward manufacturers')

# For each noun chunk: chunk text, root token text, root dependency label,
# and the text of the root's head token
for chunk in doc.noun_chunks:
  print(' - '.join((chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)))

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


In [39]:
# Number of noun chunks in doc
print(sum(1 for _ in doc.noun_chunks))

3


Visualizing Named Entities

In [43]:
# NOTE(review): spacy is imported and the model loaded a second time here, so
# `nlp` is rebound to a freshly loaded pipeline - confirm this reset is intended.
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy import displacy

# NOTE(review): the two adjacent string literals are joined with no space,
# giving "...$6 million.By contrast..." - confirm whether that is intended.
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.'
u'By contrast, Sony sold only 7 thousand Walkman music players')

# Render the doc's entities inline in the notebook
displacy.render(doc,style='ent',jupyter=True)

In [44]:
# Render each sentence separately; the sentence text is parsed again so that
# displacy is given an object built from that sentence alone
for sent in doc.sents:
  displacy.render(nlp(sent.text),style='ent',jupyter=True)

In [45]:
# Variant text for which a later cell ends up printing the second sentence as plain text
doc2 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.'
u'By contrast, my kids sold a lot of lemonade')

# Render every sentence unconditionally
for sent in doc2.sents:
  displacy.render(nlp(sent.text),style='ent',jupyter=True)

  "__main__", mod_spec)


In [47]:
# Render sentences that contain entities; print the plain text of those that do not
for sent in doc2.sents:
  docx = nlp(sent.text)
  if not docx.ents:
    print(docx.text)
    continue
  displacy.render(docx,style='ent',jupyter=True)

By contrast, my kids sold a lot of lemonade


Viewing specific Entities

In [48]:
# Pass an 'ents' option listing the entity labels of interest to the renderer
options = {'ents':['ORG','PRODUCT']}

displacy.render(doc,style='ent',jupyter=True,options=options)

Customizing Colors and Effects

In [53]:
# Map each entity label to a CSS gradient string
colors = {'ORG': 'linear-gradient(90deg,#abcdef,#012345)','PRODUCT':'radial-gradient(lightgreen,white)'}

# Combine the label filter with the custom colors in one options dict
options = {'ents':['ORG','PRODUCT'],'colors':colors}

displacy.render(doc,style='ent',jupyter=True,options=options)