In [0]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


In [0]:
import matplotlib.pyplot as plt
# Switch matplotlib to the 'agg' backend.
# NOTE(review): no plotting call is visible in this notebook - confirm this cell is still needed.
plt.switch_backend('agg')

In [0]:
# Download four NLTK data packages, one after another, then import the NE chunker.
for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)
from nltk.chunk import ne_chunk

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [0]:
# Example sentence fed to the NLTK tokenizing, tagging and chunking cells below.
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [0]:
# Tokenize the example sentence, then pair every token with its part-of-speech tag.
sent = pos_tag(word_tokenize(ex))
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [0]:
# Noun-phrase (NP) chunk rule: an optional determiner (DT), any number of
# adjectives (JJ), then a singular or mass noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [0]:
# Build a chunk parser from the grammar and apply it to the POS-tagged sentence.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
# The printed tree shows each matched NP chunk as a bracketed subtree under S.
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [0]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples:
# B-NP starts a chunk, I-NP continues it, O is outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


1.	CC	Coordinating conjunction
	2.	CD	Cardinal number
	3.	DT	Determiner
	4.	EX	Existential there
	5.	FW	Foreign word
	6.	IN	Preposition or subordinating conjunction
	7.	JJ	Adjective
	8.	JJR	Adjective, comparative
	9.	JJS	Adjective, superlative
	10.	LS	List item marker
	11.	MD	Modal
	12.	NN	Noun, singular or mass
	13.	NNS	Noun, plural
	14.	NNP	Proper noun, singular
	15.	NNPS	Proper noun, plural
	16.	PDT	Predeterminer
	17.	POS	Possessive ending
	18.	PRP	Personal pronoun
	19.	PRP\$	Possessive pronoun
	20.	RB	Adverb
	21.	RBR	Adverb, comparative
	22.	RBS	Adverb, superlative
	23.	RP	Particle
	24.	SYM	Symbol
	25.	TO	to
	26.	UH	Interjection
	27.	VB	Verb, base form
	28.	VBD	Verb, past tense
	29.	VBG	Verb, gerund or present participle
	30.	VBN	Verb, past participle
	31.	VBP	Verb, non-3rd person singular present
	32.	VBZ	Verb, 3rd person singular present
	33.	WDT	Wh-determiner
	34.	WP	Wh-pronoun
	35.	WP\$	Possessive wh-pronoun
	36.	WRB	Wh-adverb 
  
  
  

In [0]:
# Same example sentence as `ex` (defined in an earlier cell); reuse it instead of
# repeating the long literal so the two copies cannot drift apart.
sentence = ex

In [0]:
# Tokenize -> POS-tag -> named-entity chunk, then print the resulting tree.
ne_tree = ne_chunk(
    pos_tag(
        word_tokenize(sentence)
    )
)
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


# Using spaCy’s named entity recognition

In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the `en_core_web_sm` model package; `nlp` is the callable applied to
# raw text in the remaining cells.
nlp = en_core_web_sm.load()

In [0]:
# Run the spaCy pipeline on the same example sentence used in the NLTK section
# (`sentence`, defined in an earlier cell) rather than repeating the literal.
doc = nlp(sentence)

In [0]:
# One line per token: surface text, coarse POS tag, dependency label.
for tok in doc:
    print(f"{tok.text} {tok.pos_} {tok.dep_}")

European ADJ amod
authorities NOUN nsubj
fined VERB ROOT
Google PROPN dative
a DET det
record NOUN dobj
$ SYM quantmod
5.1 NUM compound
billion NUM nummod
on ADP prep
Wednesday PROPN pobj
for ADP prep
abusing VERB pcomp
its ADJ poss
power NOUN dobj
in ADP prep
the DET det
mobile ADJ amod
phone NOUN compound
market NOUN pobj
and CCONJ cc
ordered VERB conj
the DET det
company NOUN dobj
to PART aux
alter VERB xcomp
its ADJ poss
practices NOUN dobj


In [0]:
# Per-token entity annotation: (token, IOB flag, entity type).
pprint([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in doc
])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


# Extracting named entities from an online article

In [0]:
from bs4 import BeautifulSoup
import requests
import re

In [0]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        Text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a
        single space.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an
        HTTP error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove non-content elements so their text never reaches get_text().
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [0]:
# Fetch the article text, run the spaCy pipeline on it, and show how many
# entities were found.
# NOTE(review): requires network access; the count depends on the live page content.
ny_bb = url_to_string('https://www.nytimes.com/2019/01/02/business/dealbook/wall-street-2019-predictions.html?action=click&module=Briefings&pgtype=Homepage')
article = nlp(ny_bb)
len(article.ents)

287

In [0]:
# Tally how many entities of each label type the article contains.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 12,
         'DATE': 57,
         'EVENT': 3,
         'GPE': 50,
         'LOC': 9,
         'MONEY': 6,
         'NORP': 23,
         'ORDINAL': 3,
         'ORG': 65,
         'PERCENT': 6,
         'PERSON': 48,
         'PRODUCT': 2,
         'QUANTITY': 1,
         'TIME': 2})

In [0]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('China', 12), ('2019', 8), ('2018', 7)]

In [0]:
# Materialize the article's sentences as a list and print one sample.
sentences = list(article.sents)
print(sentences[20])

But as the trade war with China escalated, his proclamations began to make investors jumpy.


In [0]:
# Re-parse the sample sentence and render its entities with displaCy.
displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)

In [0]:
# Re-parse the sample sentence and render its dependency parse with displaCy.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [0]:
# (text, coarse POS tag, lemma) for every token of the sample sentence that is
# neither a stop word nor punctuation. One comprehension with a filter replaces
# the original nested list, which built a throwaway intermediate list.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[20]))
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('But', 'CCONJ', 'but'),
 ('trade', 'NOUN', 'trade'),
 ('war', 'NOUN', 'war'),
 ('China', 'PROPN', 'china'),
 ('escalated', 'VERB', 'escalate'),
 ('proclamations', 'NOUN', 'proclamation'),
 ('began', 'VERB', 'begin'),
 ('investors', 'NOUN', 'investor'),
 ('jumpy', 'ADJ', 'jumpy')]

In [0]:
# Map each entity string in the sample sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'China': 'GPE'}

In [0]:
# Per-token entity annotation of the sample sentence: (token, IOB flag, entity type).
pprint([
    (tok, tok.ent_iob_, tok.ent_type_)
    for tok in sentences[20]
])

[(But, 'O', ''),
 (as, 'O', ''),
 (the, 'O', ''),
 (trade, 'O', ''),
 (war, 'O', ''),
 (with, 'O', ''),
 (China, 'B', 'GPE'),
 (escalated, 'O', ''),
 (,, 'O', ''),
 (his, 'O', ''),
 (proclamations, 'O', ''),
 (began, 'O', ''),
 (to, 'O', ''),
 (make, 'O', ''),
 (investors, 'O', ''),
 (jumpy, 'O', ''),
 (., 'O', '')]


In [0]:
# Render the entity highlights for the whole parsed article.
displacy.render(article, jupyter=True, style='ent')