### Named-entity recognition (NER): classify named entities in text into pre-defined categories such as the names of persons,
### organizations, locations, time expressions, quantities, monetary values, percentages, etc.

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [47]:
# Example sentence used by the NLTK and spaCy demonstrations below.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [48]:
# Show the raw example sentence.
print(ex)

European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices


In [49]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Args:
        sent: The raw sentence text.

    Returns:
        Whatever ``nltk.pos_tag`` produces for the tokens of ``sent`` --
        a list of ``(token, POS tag)`` pairs.
    """
    tokens = nltk.word_tokenize(sent)  ## split the sentence into word tokens
    return nltk.pos_tag(tokens)

In [50]:
# Tokenize and POS-tag the example sentence.
sent = preprocess(ex)

In [51]:
# Inspect the (token, POS tag) pairs.
print(sent)

[('European', 'JJ'), ('authorities', 'NNS'), ('fined', 'VBD'), ('Google', 'NNP'), ('a', 'DT'), ('record', 'NN'), ('$', '$'), ('5.1', 'CD'), ('billion', 'CD'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('in', 'IN'), ('the', 'DT'), ('mobile', 'JJ'), ('phone', 'NN'), ('market', 'NN'), ('and', 'CC'), ('ordered', 'VBD'), ('the', 'DT'), ('company', 'NN'), ('to', 'TO'), ('alter', 'VB'), ('its', 'PRP$'), ('practices', 'NNS')]


In [52]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT), followed
# by any number of adjectives (JJ), followed by one noun tagged NN.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [53]:
# Build a regular-expression chunker from the grammar and apply it to the
# POS-tagged sentence; the result is a tree with NP subtrees.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [54]:
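# IOB tags: B-<type> marks the first token of a chunk, I-<type> a token inside a chunk, and O a token outside any chunk.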

In [55]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (token, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [61]:
# Run NLTK's named-entity chunker on the POS-tagged tokens of the example
# sentence; recognised entities appear as labelled subtrees in the printed tree.
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [63]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_md
# Load the spaCy pipeline shipped in the `en_core_web_md` package.
nlp = en_core_web_md.load()

In [64]:
# Run the spaCy pipeline over the example sentence.
doc = nlp(ex)

In [68]:
# Echo the example sentence for reference.
ex

'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [66]:
# (entity text, entity label) for every entity spaCy found in the sentence.
# NORP --> nationalities or religious or political groups
print([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [67]:
# "B" means the token begins an entity, "I" means it is inside an entity,
# "O" means it is outside an entity, and "" means no entity tag is set.

In [69]:
# Per-token view: (token, IOB code, entity type) for every token in the sentence.
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'O', ''), (record, 'O', ''), ($, 'B', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [73]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it the call can block
            indefinitely on an unresponsive host.

    Returns:
        The text of the page with all ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newline/tab
        characters replaced by a single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error page as the article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose content is not part of the readable article text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the example New York Times article as plain text.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')


179

In [74]:
# Display the scraped article text.
ny_bb

" F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times SectionsSEARCHSkip to contentSkip to site indexPoliticsLog InLog InToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredAdvertisementSupported byF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredImagePeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.CreditCreditT.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigations, has been fired for violating bureau policies, Mr. Strzok’s lawyer said Monday.Mr. Trump and his allies seized on the texts — exchanged during the 2016 campaign with a former

In [82]:
# Display the parsed article.
# NOTE(review): `article` is assigned in a cell that appears further down
# (`article = nlp(ny_bb)`); on a fresh top-to-bottom run this cell would
# presumably raise NameError -- confirm and reorder if so.
article

 F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times SectionsSEARCHSkip to contentSkip to site indexPoliticsLog InLog InToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredAdvertisementSupported byF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredImagePeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.CreditCreditT.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigations, has been fired for violating bureau policies, Mr. Strzok’s lawyer said Monday.Mr. Trump and his allies seized on the texts — exchanged during the 2016 campaign with a former 

In [87]:
article = nlp(ny_bb)  ## run the spaCy pipeline on the scraped article text
article.ents  ## the named-entity spans detected in the article

(Peter Strzok,
 Trump,
 Texts,
 The New York Times,
 InLog InToday,
 Peter Strzok,
 Trump,
 Texts,
 Peter Strzok,
 Trump,
 Texts,
 Strzok,
 F.B.I.,
 Trump,
 CreditCreditT.J. Kirkpatrick,
 New York,
 Adam Goldman,
 Michael S. SchmidtAug,
 Peter Strzok,
 F.B.I.,
 Trump,
 Hillary Clinton,
 Russia,
 Strzok,
 Monday,
 Trump,
 2016,
 F.B.I.,
 Lisa Page,
 Russia,
 Strzok,
 20 years,
 F.B.I.,
 the early months,
 Strzok,
 F.B.I.,
 Trump,
 Strzok,
 last summer,
 Robert S. Mueller III,
 Strzok,
 Twitter,
 Monday,
 Trump,
 June,
 Strzok,
 F.B.I.,
 Hillary Clinton’s,
 2016,
 Strzok,
 Office of Professional Responsibility,
 Strzok,
 60 days,
 Strzok,
 House,
 July,
 Strzok,
 F.B.I.,
 David Bowdich,
 the Office of Professional Responsibility,
 Strzok,
 F.B.I.,
 Strzok,
 Strzok,
 Trump,
 F.B.I.,
 Bowdich,
 F.B.I.,
 Christopher A. Wray,
 Aitan Goelman,
 Strzok,
 Strzok,
 Congress,
 F.B.I.,
 Goelman,
 Americans,
 Goelman,
 Strzok,
 Strzok,
 Page,
 Trump,
 Page,
 Trump,
 Strzok,
 Michael E. Horowitz,
 St

In [97]:
labels = [x.label_ for x in article.ents]  ## entity-type label (e.g. 'PERSON', 'ORG') of every detected entity
print(labels)
print(Counter(labels)) ## number of detected entities per label

['PERSON', 'PERSON', 'WORK_OF_ART', 'ORG', 'PERSON', 'PERSON', 'PERSON', 'WORK_OF_ART', 'PERSON', 'PERSON', 'WORK_OF_ART', 'PERSON', 'ORG', 'PERSON', 'PERSON', 'GPE', 'PERSON', 'PERSON', 'PERSON', 'ORG', 'PERSON', 'PERSON', 'GPE', 'PERSON', 'DATE', 'PERSON', 'DATE', 'ORG', 'PERSON', 'GPE', 'PERSON', 'DATE', 'ORG', 'DATE', 'PERSON', 'ORG', 'PERSON', 'PERSON', 'DATE', 'PERSON', 'PERSON', 'ORG', 'DATE', 'PERSON', 'DATE', 'PERSON', 'ORG', 'PERSON', 'DATE', 'PERSON', 'ORG', 'PERSON', 'DATE', 'PERSON', 'ORG', 'DATE', 'PERSON', 'ORG', 'PERSON', 'ORG', 'PERSON', 'ORG', 'PERSON', 'PERSON', 'PERSON', 'ORG', 'PERSON', 'ORG', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', 'PERSON', 'NORP', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'DATE', 'DATE', 'PERSON', 'CARDINAL', 'DATE', 'PERSON', 'PERSON', 'ORG', 'PERSON', 'PRODUCT', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERSON', 'GPE', 'PERSON', 'PERSON', 'PERSON', 'PERS

<img src="sample.png">

In [101]:
items = [x.text for x in article.ents]  ## text of every detected entity, duplicates included
Counter(items).most_common(3)  ## the three most frequent entity strings with their counts

[('Strzok', 33), ('F.B.I.', 18), ('Trump', 16)]

In [102]:
# Collect the sentence spans into a list so they can be indexed.
sentences = list(article.sents)
print(sentences[20])

victory traces back to June, when Mr. Strzok’s conduct was laid out in a wide-ranging inspector general’s report on how the F.B.I. handled the investigation of Hillary Clinton’s emails in the run-up to the 2016 election.


In [105]:
# `article` is already the parsed document for this text, so render it
# directly instead of running the whole pipeline a second time.
displacy.render(article, jupyter=True, style='ent')

In [106]:
# (verbatim text, part-of-speech tag, lemma) for each token of sentence 20,
# skipping stop words and punctuation. One comprehension is enough; no
# intermediate list of filtered tokens is needed.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[20]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('victory', 'NOUN', 'victory'),
 ('traces', 'VERB', 'trace'),
 ('June', 'PROPN', 'June'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('’s', 'PART', '’s'),
 ('conduct', 'NOUN', 'conduct'),
 ('laid', 'VERB', 'lay'),
 ('wide', 'ADV', 'wide'),
 ('ranging', 'VERB', 'range'),
 ('inspector', 'NOUN', 'inspector'),
 ('general', 'NOUN', 'general'),
 ('’s', 'PART', '’s'),
 ('report', 'NOUN', 'report'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('handled', 'VERB', 'handle'),
 ('investigation', 'NOUN', 'investigation'),
 ('Hillary', 'PROPN', 'Hillary'),
 ('Clinton', 'PROPN', 'Clinton'),
 ('’s', 'PART', '’s'),
 ('emails', 'NOUN', 'email'),
 ('run', 'NOUN', 'run'),
 ('2016', 'NUM', '2016'),
 ('election', 'NOUN', 'election')]

In [107]:
# Map each entity's text to its label for sentence 20; an entity text that
# occurs more than once collapses to a single key.
{str(x): x.label_ for x in nlp(str(sentences[20])).ents}

{'June': 'DATE',
 'Strzok': 'PERSON',
 'F.B.I.': 'ORG',
 'Hillary Clinton’s': 'PERSON',
 '2016': 'DATE'}