## Example with NLTK


In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [5]:
# Sample news sentence that the examples tokenize, tag and chunk
ex_text = (
    "European authorities fined Google a record $5.1 billion on Wednesday "
    "for abusing its power in the mobile phone market "
    "and ordered the company to alter its practices"
)

In [7]:
def preprocess(text):
    """Tokenize ``text`` and attach a part-of-speech tag to each token.

    The text is split with ``nltk.word_tokenize`` and the resulting tokens
    are passed to ``nltk.pos_tag``; the tagger's result is returned as-is.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))

In [9]:
# Tokenize and POS-tag the sample sentence; `sent` is reused by the chunking cell further down
sent = preprocess(ex_text)
print("list of tuples containing the individual words in the sentence and their associated part-of-speech.")
# Bare expression so the notebook displays the tagged tokens
sent

list of tuples containing the individual words in the sentence and their associated part-of-speech.


[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [10]:
# Chunk noun phrases with a regular-expression grammar.
# The rule says that a noun phrase, NP, should be formed whenever the chunker finds an optional
# determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.
pattern = 'NP:{<DT>?<JJ>*<NN>}'
# Create a chunk parser from the pattern and apply it to the tagged sample sentence
chunk_parser = nltk.RegexpParser(pattern)
cs = chunk_parser.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [11]:
# IOB (Inside-Outside-Beginning) tags are a standard way to represent the chunks of a tree as one tag per token
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples and pretty-print them
iob_tag = tree2conlltags(cs)
pprint(iob_tag)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [12]:
# With the function nltk.ne_chunk(), we can recognize named entities using a classifier;
# the classifier adds category labels such as PERSON, ORGANIZATION, and GPE.
# preprocess() performs the same word_tokenize + pos_tag steps, so it supplies the tagged tokens.
ne_tree = nltk.ne_chunk(preprocess(ex_text))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


## Example with spaCy

In [14]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the spaCy pipeline shipped in the en_core_web_sm package; `nlp` is used for all spaCy processing below
nlp = en_core_web_sm.load()

In [17]:
# Run the spaCy pipeline on the same sample sentence used in the NLTK example
doc = nlp(ex_text)
# Each entity span exposes its text and its predicted label
pprint([(ent.text, ent.label_) for ent in doc.ents])
print()
# The label is NORP (not "NORD"), matching what doc.ents reports for 'European'
print("Description of entities:\nNORP:nationalities or religious or political groups, ORG: organization")

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]

Description of entities:
NORD:nationalities or religious or political groups, ORG: organization


In [18]:
# Token-level entity annotation: ent_iob_ is the IOB code ('B' = begins an entity,
# 'I' = inside an entity, 'O' = outside any entity) and ent_type_ is the entity label
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


## Extract named entities from an article

In [19]:
from bs4 import BeautifulSoup
import requests
import re

In [21]:
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; forwarded to ``requests.get``
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        The page text with ``script``, ``style`` and ``aside`` elements
        removed and every run of newlines/tabs replaced by one space.

    Raises:
        requests.HTTPError: If the server answers with an error status,
            instead of silently parsing the error page as if it were the article.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses rather than extracting entities from an error page
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the readable article text
    for tag in soup(["script", "style", "aside"]):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# NOTE: despite its name, article_url holds the text returned by url_to_string, not the URL itself
article_url = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
# Run the spaCy pipeline over the article text
article = nlp(article_url)
print("Total entities in article:", len(article.ents))

Total entities in article: 154


In [22]:
# Count how many entities were found for each entity label
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 38,
         'PERSON': 77,
         'DATE': 23,
         'GPE': 9,
         'NORP': 2,
         'CARDINAL': 3,
         'LOC': 1,
         'ORDINAL': 1})

In [24]:
# Five most frequent entity strings in the article
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13), ('Russia', 6), ('Clinton', 5)]

In [29]:
# Materialize the article's sentences and show one of them (index 15) as an example
sentence = list(article.sents)
print(sentence[15])

The report was critical of Mr. Strzok’s conduct in sending the texts, and the bureau’s Office of Professional Responsibility said that Mr. Strzok should be suspended for 60 days and demoted.


In [30]:
# Re-parse the example sentence on its own and render its named entities inline in the notebook
displacy.render(nlp(str(sentence[15])), jupyter=True, style='ent')

In [31]:
# Visualize the example sentence and its dependency parse;
# 'distance' is passed through displaCy's options (word spacing in the plot, per the displaCy docs)
displacy.render(nlp(str(sentence[15])), style='dep', jupyter=True, options={'distance':120})

In [33]:
# Text, part-of-speech tag and lemma for each token of the example sentence,
# skipping stop words and punctuation
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentence[15]))
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('report', 'NOUN', 'report'),
 ('critical', 'ADJ', 'critical'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('conduct', 'NOUN', 'conduct'),
 ('sending', 'VERB', 'send'),
 ('texts', 'NOUN', 'text'),
 ('bureau', 'NOUN', 'bureau'),
 ('Office', 'PROPN', 'Office'),
 ('Professional', 'PROPN', 'Professional'),
 ('Responsibility', 'PROPN', 'Responsibility'),
 ('said', 'VERB', 'say'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('suspended', 'VERB', 'suspend'),
 ('60', 'NUM', '60'),
 ('days', 'NOUN', 'day'),
 ('demoted', 'VERB', 'demote')]

In [41]:
# Visualization of entities for the whole article.
# Render the already-parsed `article` Doc directly: str(sentence) would stringify the Python
# list of sentence spans (adding brackets and comma separators) and force a second parse.
print("Entire article for entity visualization:")
displacy.render(article, jupyter=True, style='ent')

Entire article for entity visualization:
