## Named Entity Recognition with spaCy & NLTK
Named entity recognition (NER) is often the first step in information extraction. It seeks to locate named entities in text and classify them into pre-defined categories such as the names of persons, organizations, locations, expressions of time, quantities, monetary values, percentages, etc.

Named Entity Recognition is an NLP process that deals with identifying and classifying named entities. Raw or structured text is taken as input, and the named entities in it are identified, segmented, and assigned to predefined classes such as persons, organizations, places, money, time, etc.

In [7]:
# import libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
#nltk.download('averaged_perceptron_tagger')
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/karen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [3]:
#apply word tokenization and part-of-speech tagging to the sentence
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Args:
        sent: The raw sentence text.

    Returns:
        Whatever ``nltk.pos_tag`` returns for the word tokens of ``sent``;
        the cell output below shows a list of ``(token, tag)`` pairs.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

Next, implement noun phrase chunking to identify named entities, using a regular expression consisting of rules that indicate how sentences should be chunked.

Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [4]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [5]:
#chunking
# build a chunk parser from the NP rule in `pattern` and apply it to the
# POS-tagged sentence; the printed tree wraps each match in an (NP ...) node
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [6]:
# flatten the chunk tree into (token, POS tag, IOB chunk tag) triples:
# B-NP starts a noun phrase, I-NP continues it, O is outside any chunk
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [None]:
# run NLTK's named-entity chunker on the tokenized, POS-tagged sentence
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print (ne_tree)
# NOTE(review): this cell has no recorded output, so the claim below is
# not verifiable from the notebook as saved - re-run to confirm
# Google is recognized as a person

### Use spaCy

In [13]:
import spacy
from spacy import displacy
from collections import Counter
nlp = spacy.load("en_core_web_sm")
#python3 -m spacy download en_core_web_sm

In [62]:
# run the spaCy pipeline on the same example sentence used for NLTK above
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

# list every detected entity span together with its label
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [15]:
# per-token view: (token, IOB flag, entity type); the type is an empty
# string for tokens flagged 'O' (outside any entity)
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'O', ''),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [48]:
#Extracting named entity from an article
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly rather than running NER over an HTTP error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove non-content elements so their text does not end up in the result.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# fetch the article text, run the spaCy pipeline over it, and show how many
# entities were found
ny_bb = url_to_string('https://www.cnbc.com/2022/10/11/covid-news-us-is-tracking-omicron-subvariants-but-booster-should-protect.html')
article = nlp(ny_bb)
len(article.ents)

110

In [49]:
# count how many entities were found for each entity label
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'GPE': 17,
         'ORG': 37,
         'CARDINAL': 7,
         'EVENT': 1,
         'PERSON': 20,
         'DATE': 19,
         'FAC': 1,
         'PERCENT': 2,
         'MONEY': 1,
         'NORP': 1,
         'ORDINAL': 2,
         'WORK_OF_ART': 1,
         'TIME': 1})

In [63]:
items = [x.text for x in article.ents]
# show the 5 most frequent entity texts with their counts
Counter(items).most_common(5)

[('U.S.', 9),
 ('Covid', 6),
 ('White House', 5),
 ('the White House', 5),
 ('Jha', 4)]

In [52]:
# pick one sentence (fixed index 10) to look at more closely
sentences = [x for x in article.sents]
print(sentences[10])

Pfizer's new shots are available for people ages 12 and up, while adults ages 18 and older are eligible for Moderna's boosters.


In [53]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [54]:
displacy.render(nlp(str(sentences[10])), jupyter=True, style='ent')

In [55]:
displacy.render(nlp(str(sentences[10])), style='dep', jupyter = True, options = {'distance': 120})

In [57]:
# (text, coarse POS tag, lemma) for every token of the sentence that is
# neither a stop word nor punctuation
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[10]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('Pfizer', 'PROPN', 'Pfizer'),
 ('new', 'ADJ', 'new'),
 ('shots', 'NOUN', 'shot'),
 ('available', 'ADJ', 'available'),
 ('people', 'NOUN', 'people'),
 ('ages', 'NOUN', 'age'),
 ('12', 'NUM', '12'),
 ('adults', 'NOUN', 'adult'),
 ('ages', 'NOUN', 'age'),
 ('18', 'NUM', '18'),
 ('older', 'ADJ', 'old'),
 ('eligible', 'ADJ', 'eligible'),
 ('Moderna', 'PROPN', 'Moderna'),
 ('boosters', 'NOUN', 'booster')]

In [58]:
# map each entity's text in the sentence to its label
{str(ent): ent.label_ for ent in nlp(str(sentences[10])).ents}

{'Pfizer': 'PERSON', 'ages 12': 'DATE', 'ages 18': 'DATE', 'Moderna': 'PERSON'}

In [60]:
# Render the entities of the whole article. `article` is already the parsed
# Doc for the full text; str(sentences[0:]) would stringify a Python list
# instead, injecting brackets and ", " separators into the text and running
# the pipeline a second time on that altered string.
displacy.render(article, jupyter=True, style='ent')
# CNBC is not correctly recognized