In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


I took a sentence from The New York Times, “Democrats pressed past Republicans’ objections to remove the Georgia freshman from her two committee posts in a vote without precedent in the modern Congress.”


In [2]:
# Example sentence used by every NLTK cell below. Note the typographic
# apostrophe in "Republicans’": it is kept exactly as quoted.
sentence = (
    'Democrats pressed past Republicans’ objections to remove the Georgia '
    'freshman from her two committee posts in a vote without precedent '
    'in the modern Congress'
)

Then we apply word tokenization and part-of-speech tagging to the sentence.

In [3]:
def preprocess(sent):
    """Tokenize a raw sentence and tag every token with its part of speech.

    Args:
        sent: The sentence as a plain string.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word-tokenized
        sentence (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

Let’s see what we get:

In [4]:
# Tokenize and POS-tag the example sentence; the bare `sent` on the last
# line makes the notebook display the tagged list.
sent = preprocess(sentence)
sent

[('Democrats', 'NNPS'),
 ('pressed', 'VBD'),
 ('past', 'JJ'),
 ('Republicans', 'NNPS'),
 ('’', 'VBP'),
 ('objections', 'NNS'),
 ('to', 'TO'),
 ('remove', 'VB'),
 ('the', 'DT'),
 ('Georgia', 'NNP'),
 ('freshman', 'NN'),
 ('from', 'IN'),
 ('her', 'PRP$'),
 ('two', 'CD'),
 ('committee', 'NN'),
 ('posts', 'NNS'),
 ('in', 'IN'),
 ('a', 'DT'),
 ('vote', 'NN'),
 ('without', 'IN'),
 ('precedent', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('modern', 'JJ'),
 ('Congress', 'NNP')]

Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.

In [5]:
# Chunk grammar with a single rule: an NP is an optional determiner (<DT>?),
# zero or more adjectives (<JJ>*), and then one singular noun (<NN>).
# NOTE(review): `np` is also the conventional alias for numpy; consider a more
# specific name if numpy is ever imported in this notebook.
np = r'NP: {<DT>?<JJ>*<NN>}'

# Chunking


Using this pattern, we create a chunk parser and test it on our sentence.


In [6]:
# Build a regular-expression chunk parser from the NP grammar and apply it to
# the POS-tagged sentence; printing the result shows the chunk tree.
regex = nltk.RegexpParser(np)
parsent = regex.parse(sent)
print(parsent)

(S
  Democrats/NNPS
  pressed/VBD
  past/JJ
  Republicans/NNPS
  ’/VBP
  objections/NNS
  to/TO
  remove/VB
  the/DT
  Georgia/NNP
  (NP freshman/NN)
  from/IN
  her/PRP$
  two/CD
  (NP committee/NN)
  posts/NNS
  in/IN
  (NP a/DT vote/NN)
  without/IN
  (NP precedent/NN)
  in/IN
  the/DT
  modern/JJ
  Congress/NNP)


IOB representation 


In [7]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(parsent)
pprint(iob_tagged)

[('Democrats', 'NNPS', 'O'),
 ('pressed', 'VBD', 'O'),
 ('past', 'JJ', 'O'),
 ('Republicans', 'NNPS', 'O'),
 ('’', 'VBP', 'O'),
 ('objections', 'NNS', 'O'),
 ('to', 'TO', 'O'),
 ('remove', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('Georgia', 'NNP', 'O'),
 ('freshman', 'NN', 'B-NP'),
 ('from', 'IN', 'O'),
 ('her', 'PRP$', 'O'),
 ('two', 'CD', 'O'),
 ('committee', 'NN', 'B-NP'),
 ('posts', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('vote', 'NN', 'I-NP'),
 ('without', 'IN', 'O'),
 ('precedent', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('modern', 'JJ', 'O'),
 ('Congress', 'NNP', 'O')]


In this representation, there is one token per line, each with its part-of-speech tag and its IOB chunk tag (B-NP begins a noun phrase, I-NP continues it, and O marks a token outside any chunk). Based on a training corpus in this format, we can construct a tagger that can be used to label new sentences, and use the nltk.chunk.conlltags2tree() function to convert the tag sequences back into a chunk tree.


In [8]:
# Convert the IOB triples back into a chunk tree (the inverse of tree2conlltags).
tree = conlltags2tree(iob_tagged)
print(tree)

(S
  Democrats/NNPS
  pressed/VBD
  past/JJ
  Republicans/NNPS
  ’/VBP
  objections/NNS
  to/TO
  remove/VB
  the/DT
  Georgia/NNP
  (NP freshman/NN)
  from/IN
  her/PRP$
  two/CD
  (NP committee/NN)
  posts/NNS
  in/IN
  (NP a/DT vote/NN)
  without/IN
  (NP precedent/NN)
  in/IN
  the/DT
  modern/JJ
  Congress/NNP)


With the function nltk.ne_chunk(), we can recognize named entities using a classifier; the classifier adds category labels such as PERSON, ORGANIZATION, and GPE.

In [9]:
from nltk.chunk import ne_chunk
# Reuse the tokenize + POS-tag helper, then let NLTK's named-entity chunker
# label the tagged tokens.
ne_tree = ne_chunk(preprocess(sentence))
print(ne_tree)

(S
  Democrats/NNPS
  pressed/VBD
  past/JJ
  Republicans/NNPS
  ’/VBP
  objections/NNS
  to/TO
  remove/VB
  the/DT
  (GPE Georgia/NNP)
  freshman/NN
  from/IN
  her/PRP$
  two/CD
  committee/NN
  posts/NNS
  in/IN
  a/DT
  vote/NN
  without/IN
  precedent/NN
  in/IN
  the/DT
  modern/JJ
  (ORGANIZATION Congress/NNP))


# SPACY

In [10]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline once; `nlp` is reused by every spaCy cell below.
nlp = en_core_web_sm.load()

We are using the same sentence, “Democrats pressed past Republicans’ objections to remove the Georgia freshman from her two committee posts in a vote without precedent in the modern Congress.”

One of the nice things about spaCy is that we only need to call nlp once; the entire pipeline runs in the background and returns the annotated objects.

In [11]:
# Run the spaCy pipeline on the same example sentence and list the text and
# label of every named entity it found.
doc = nlp(sentence)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Democrats', 'NORP'),
 ('Republicans', 'NORP'),
 ('Georgia', 'GPE'),
 ('two', 'CARDINAL'),
 ('Congress', 'ORG')]


Democrats and Republicans are NORP (nationalities or religious or political groups), Georgia is a state, two is a numeric value and Congress is an organization. They are all correct.

In [12]:
# Print each token next to its part-of-speech tag (`pos_`).
for token in doc:
    print(token.text, token.pos_)

Democrats PROPN
pressed VERB
past ADP
Republicans PROPN
’ PART
objections NOUN
to PART
remove VERB
the DET
Georgia PROPN
freshman NOUN
from ADP
her DET
two NUM
committee NOUN
posts NOUN
in ADP
a DET
vote NOUN
without ADP
precedent NOUN
in ADP
the DET
modern ADJ
Congress PROPN


In [13]:
# Ask spaCy for the human-readable description of the NORP entity label.
spacy.explain('NORP')

'Nationalities or religious or political groups'

In [14]:
# Ask spaCy for the human-readable description of the CARDINAL entity label.
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

# Token


In the example above, we were working at the entity level. In the following example, we demonstrate token-level entity annotation using the IOB tagging scheme (B = the token begins an entity, I = the token is inside an entity, O = the token is outside any entity) to describe the entity boundaries.


In [15]:
# For every token show its IOB entity flag and entity type (empty when the
# token is not part of an entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Democrats, 'B', 'NORP'),
 (pressed, 'O', ''),
 (past, 'O', ''),
 (Republicans, 'B', 'NORP'),
 (’, 'O', ''),
 (objections, 'O', ''),
 (to, 'O', ''),
 (remove, 'O', ''),
 (the, 'O', ''),
 (Georgia, 'B', 'GPE'),
 (freshman, 'O', ''),
 (from, 'O', ''),
 (her, 'O', ''),
 (two, 'B', 'CARDINAL'),
 (committee, 'O', ''),
 (posts, 'O', ''),
 (in, 'O', ''),
 (a, 'O', ''),
 (vote, 'O', ''),
 (without, 'O', ''),
 (precedent, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (modern, 'O', ''),
 (Congress, 'B', 'ORG')]


# Extracting named entity from an article


Now let’s get serious with spaCy and extract named entities from a full New York Times article: the piece on Biden’s foreign policy (dated February 4, 2021 in its URL) that is downloaded below.

In [16]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout requests waits indefinitely, which would hang the
    # notebook on an unresponsive server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently running NER over an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove non-content elements so their text does not end up in the article.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Download the article text, run the spaCy pipeline over it, and display how
# many named entities were found.
ny_bb = url_to_string('https://www.nytimes.com/2021/02/04/us/politics/biden-foreign-policy.html?action=click&module=Spotlight&pgtype=Homepage')
article = nlp(ny_bb)
len(article.ents)

272

There are 272 entities in the article and they are represented as 12 unique labels:


In [23]:
# Tally how many entities carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'DATE': 34,
         'PRODUCT': 2,
         'ORG': 35,
         'GPE': 75,
         'PERSON': 54,
         'CARDINAL': 11,
         'NORP': 49,
         'MONEY': 2,
         'ORDINAL': 6,
         'LOC': 2,
         'TIME': 1,
         'FAC': 1})

The following are the three most frequent entities.


In [24]:
# Count entity surface strings and show the three that occur most often.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('American', 26), ('Biden', 25), ('Yemen', 15)]

Let’s pick one sentence (the one at index 20) to learn more.

In [25]:
# Materialize the article's sentence spans as a list so they can be indexed,
# then print the one at index 20.
sentences = list(article.sents)
print(sentences[20])

Soon after Iran-allied Houthi forces took over Yemen’s capital, Sana, in the fall of 2014, the Saudis and their gulf allies began airstrikes and then bought billions of dollars in American weaponry, with the goal of ousting the Houthi rebels from northern Yemen.


Let’s run displacy.render to display this sentence with its named entities highlighted.

In [26]:
# Re-parse the selected sentence on its own and render its entities inline.
displacy.render(nlp(str(sentences[20])), style='ent', jupyter=True)

One misclassification here is Sana: it is Yemen’s capital, but it is labeled as a PERSON.

Using spaCy’s built-in displaCy visualizer, here’s what the above sentence and its dependencies look like:

In [31]:
# Render the dependency parse of the selected sentence inline, passing a
# `distance` option of 100 to the renderer.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 100},
)


Next, we extract the verbatim text, the part-of-speech tag, and the lemma of each token in this sentence, skipping stop words and punctuation.

In [34]:
# (verbatim text, POS tag, lemma) for every token of the selected sentence
# that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not (token.is_stop or token.pos_ == 'PUNCT')
]

[('Soon', 'ADV', 'soon'),
 ('Iran', 'PROPN', 'Iran'),
 ('allied', 'PROPN', 'allied'),
 ('Houthi', 'PROPN', 'Houthi'),
 ('forces', 'NOUN', 'force'),
 ('took', 'VERB', 'take'),
 ('Yemen', 'PROPN', 'Yemen'),
 ('capital', 'NOUN', 'capital'),
 ('Sana', 'PROPN', 'Sana'),
 ('fall', 'NOUN', 'fall'),
 ('2014', 'NUM', '2014'),
 ('Saudis', 'PROPN', 'Saudis'),
 ('gulf', 'PROPN', 'gulf'),
 ('allies', 'NOUN', 'ally'),
 ('began', 'VERB', 'begin'),
 ('airstrikes', 'NOUN', 'airstrike'),
 ('bought', 'VERB', 'buy'),
 ('billions', 'NOUN', 'billion'),
 ('dollars', 'NOUN', 'dollar'),
 ('American', 'ADJ', 'american'),
 ('weaponry', 'NOUN', 'weaponry'),
 ('goal', 'NOUN', 'goal'),
 ('ousting', 'VERB', 'oust'),
 ('Houthi', 'PROPN', 'Houthi'),
 ('rebels', 'NOUN', 'rebel'),
 ('northern', 'ADJ', 'northern'),
 ('Yemen', 'PROPN', 'Yemen')]

In [35]:
# Map each entity's text to its label; a repeated entity text keeps the label
# of its last occurrence.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Iran': 'GPE',
 'Houthi': 'ORG',
 'Yemen': 'GPE',
 'Sana': 'PERSON',
 'the fall of 2014': 'DATE',
 'Saudis': 'NORP',
 'billions of dollars': 'MONEY',
 'American': 'NORP'}

The named entities are extracted correctly, except for “Sana”.

In [36]:
# Token-level view of the selected sentence: token, IOB entity flag, entity type.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(Soon, 'O', ''), (after, 'O', ''), (Iran, 'B', 'GPE'), (-, 'O', ''), (allied, 'O', ''), (Houthi, 'B', 'ORG'), (forces, 'O', ''), (took, 'O', ''), (over, 'O', ''), (Yemen, 'B', 'GPE'), (’s, 'O', ''), (capital, 'O', ''), (,, 'O', ''), (Sana, 'B', 'PERSON'), (,, 'O', ''), (in, 'O', ''), (the, 'B', 'DATE'), (fall, 'I', 'DATE'), (of, 'I', 'DATE'), (2014, 'I', 'DATE'), (,, 'O', ''), (the, 'O', ''), (Saudis, 'B', 'NORP'), (and, 'O', ''), (their, 'O', ''), (gulf, 'O', ''), (allies, 'O', ''), (began, 'O', ''), (airstrikes, 'O', ''), (and, 'O', ''), (then, 'O', ''), (bought, 'O', ''), (billions, 'B', 'MONEY'), (of, 'I', 'MONEY'), (dollars, 'I', 'MONEY'), (in, 'O', ''), (American, 'B', 'NORP'), (weaponry, 'O', ''), (,, 'O', ''), (with, 'O', ''), (the, 'O', ''), (goal, 'O', ''), (of, 'O', ''), (ousting, 'O', ''), (the, 'O', ''), (Houthi, 'B', 'ORG'), (rebels, 'O', ''), (from, 'O', ''), (northern, 'O', ''), (Yemen, 'B', 'GPE'), (., 'O', '')]


Finally, we visualize the entities of the entire article.


In [37]:
# Render the entities of the already-parsed `article` Doc. Stringifying the
# `sentences` list instead would feed the list's repr (surrounding brackets
# and ", " separators) to the pipeline and re-run it over the whole text.
displacy.render(article, jupyter=True, style='ent')