In [1]:
import spacy
from spacy import displacy
from collections import Counter
from pprint import pprint
# Load the 'en_core_web_sm' spaCy pipeline once; later cells call `nlp(...)` to parse text.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Sample passage that the next cells parse to demonstrate entity and tag extraction.
text = """For nearly a decade, they argue, Facebook has made “serial defensive acquisitions” to protect its dominant position in the market for social networks, according to slides they have shown government officials. Scooping up nascent rivals, they assert, can allow Facebook to charge advertisers higher prices and can give users worse experience."""

In [3]:
# Parse the sample passage, then list every detected entity span next to its label.
file = nlp(text)
pprint([(ent.text, ent.label_) for ent in file.ents])

[('nearly a decade', 'DATE'), ('Facebook', 'ORG'), ('Facebook', 'ORG')]


In [4]:
# Print each token followed by spaCy's plain-English explanation of its tag.
for token in file:
    tag_meaning = spacy.explain(token.tag_)
    print("{} : {}".format(token.text, tag_meaning))

For : conjunction, subordinating or preposition
nearly : adverb
a : determiner
decade : noun, singular or mass
, : punctuation mark, comma
they : pronoun, personal
argue : verb, non-3rd person singular present
, : punctuation mark, comma
Facebook : noun, proper singular
has : verb, 3rd person singular present
made : verb, past participle
“ : opening quotation mark
serial : adjective
defensive : adjective
acquisitions : noun, plural
” : closing quotation mark
to : infinitival to
protect : verb, base form
its : pronoun, possessive
dominant : adjective
position : noun, singular or mass
in : conjunction, subordinating or preposition
the : determiner
market : noun, singular or mass
for : conjunction, subordinating or preposition
social : adjective
networks : noun, plural
, : punctuation mark, comma
according : verb, gerund or present participle
to : conjunction, subordinating or preposition
slides : noun, plural
they : pronoun, personal
have : verb, non-3rd person singular present
shown : v

In [5]:
# For every token of the parsed passage, show its IOB entity flag and entity type.
pprint([(tok, tok.ent_iob_, tok.ent_type_) for tok in file])

[(For, 'O', ''),
 (nearly, 'B', 'DATE'),
 (a, 'I', 'DATE'),
 (decade, 'I', 'DATE'),
 (,, 'O', ''),
 (they, 'O', ''),
 (argue, 'O', ''),
 (,, 'O', ''),
 (Facebook, 'B', 'ORG'),
 (has, 'O', ''),
 (made, 'O', ''),
 (“, 'O', ''),
 (serial, 'O', ''),
 (defensive, 'O', ''),
 (acquisitions, 'O', ''),
 (”, 'O', ''),
 (to, 'O', ''),
 (protect, 'O', ''),
 (its, 'O', ''),
 (dominant, 'O', ''),
 (position, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (market, 'O', ''),
 (for, 'O', ''),
 (social, 'O', ''),
 (networks, 'O', ''),
 (,, 'O', ''),
 (according, 'O', ''),
 (to, 'O', ''),
 (slides, 'O', ''),
 (they, 'O', ''),
 (have, 'O', ''),
 (shown, 'O', ''),
 (government, 'O', ''),
 (officials, 'O', ''),
 (., 'O', ''),
 (Scooping, 'O', ''),
 (up, 'O', ''),
 (nascent, 'O', ''),
 (rivals, 'O', ''),
 (,, 'O', ''),
 (they, 'O', ''),
 (assert, 'O', ''),
 (,, 'O', ''),
 (can, 'O', ''),
 (allow, 'O', ''),
 (Facebook, 'B', 'ORG'),
 (to, 'O', ''),
 (charge, 'O', ''),
 (advertisers, 'O', ''),
 (higher, 'O', ''),


In [6]:
# Banner naming the article whose code the following cells are based on.
# NOTE(review): the message contains typos ("Extracing", "form"); it is runtime text, so it is left unchanged here.
print("Extracing named entity form an article - Source code : https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da")

from bs4 import BeautifulSoup
import requests
import re

Extracing named entity form an article - Source code : https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da


In [7]:
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The text of the parsed page with ``<script>``, ``<style>`` and
        ``<aside>`` elements removed, and every run of newline/tab characters
        replaced by a single space.

    Raises:
        requests.exceptions.HTTPError: If the server responds with a 4xx/5xx
            status code.
        requests.exceptions.RequestException: For connection problems,
            including exceeding ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses rather than handing the text of an
    # error page to the downstream NLP pipeline.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose contents should not appear in the extracted text.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return re.sub(r'[\n\t]+', ' ', soup.get_text())


In [8]:
# Fetch the article at this URL and keep its extracted text for the analysis below.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')

In [9]:
# Run the pipeline over the downloaded article text; `article` is reused by the cells below.
article = nlp(ny_bb)
# Cell output: number of entity spans found in the article.
len(article.ents)

192

In [10]:
# Tally how many entity spans carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 83,
         'GPE': 34,
         'ORG': 26,
         'DATE': 32,
         'CARDINAL': 6,
         'EVENT': 1,
         'NORP': 5,
         'ORDINAL': 1,
         'LAW': 1,
         'WORK_OF_ART': 1,
         'PRODUCT': 2})

In [11]:
# Find the five entity strings that appear most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10), ('Russia', 6), ('Clinton', 5)]

In [12]:
# Materialise the article's sentence spans into a list so one can be picked by index.
sentences = list(article.sents)
print(sentences[22])

The decision to fire Special Agent Strzok is not only a departure from typical bureau practice, but also contradicts Director Wray’s testimony to Congress and his assurances that the F.B.I. intended to follow its regular process in this and all personnel matters,” Mr. Goelman said.“This decision should be deeply troubling to all Americans,” Mr. Goelman added.


In [13]:
# Re-parse the selected sentence on its own and render its entities inline in the notebook.
displacy.render(
    nlp(str(sentences[22])),
    style='ent',
    jupyter=True,
)

In [14]:
# Render the dependency parse of the selected sentence, passing a custom 'distance' option to displaCy.
displacy.render(
    nlp(str(sentences[22])),
    jupyter=True,
    style='dep',
    options={'distance': 120},
)

In [15]:
# Surface form, coarse POS tag and lemma of each token that is neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[22]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[('The', 'DET', 'the'),
 ('decision', 'NOUN', 'decision'),
 ('fire', 'VERB', 'fire'),
 ('Special', 'PROPN', 'special'),
 ('Agent', 'PROPN', 'agent'),
 ('Strzok', 'PROPN', 'strzok'),
 ('departure', 'NOUN', 'departure'),
 ('typical', 'ADJ', 'typical'),
 ('bureau', 'NOUN', 'bureau'),
 ('practice', 'NOUN', 'practice'),
 ('contradicts', 'VERB', 'contradict'),
 ('Director', 'PROPN', 'director'),
 ('Wray', 'PROPN', 'wray'),
 ('’s', 'PART', '’s'),
 ('testimony', 'NOUN', 'testimony'),
 ('Congress', 'PROPN', 'congress'),
 ('assurances', 'NOUN', 'assurance'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('intended', 'VERB', 'intend'),
 ('follow', 'VERB', 'follow'),
 ('regular', 'ADJ', 'regular'),
 ('process', 'NOUN', 'process'),
 ('personnel', 'NOUN', 'personnel'),
 ('matters', 'NOUN', 'matter'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Goelman', 'PROPN', 'goelman'),
 ('said.“This', 'DET', 'said.“this'),
 ('decision', 'NOUN', 'decision'),
 ('deeply', 'ADV', 'deeply'),
 ('troubling', 'ADJ', 'troubling'),
 ('Americans', '

In [16]:
# Map each entity's text to its label for the re-parsed sentence.
{str(ent): ent.label_ for ent in nlp(str(sentences[22])).ents}

{'Special Agent Strzok': 'ORG',
 'Wray': 'PERSON',
 'Congress': 'ORG',
 'F.B.I.': 'GPE',
 'Goelman': 'PERSON',
 'Americans': 'NORP'}

In [17]:
# Print the IOB entity flag and entity type of every token in the selected sentence.
print([(tok, tok.ent_iob_, tok.ent_type_) for tok in sentences[22]])

[(The, 'O', ''), (decision, 'O', ''), (to, 'O', ''), (fire, 'O', ''), (Special, 'B', 'ORG'), (Agent, 'I', 'ORG'), (Strzok, 'I', 'ORG'), (is, 'O', ''), (not, 'O', ''), (only, 'O', ''), (a, 'O', ''), (departure, 'O', ''), (from, 'O', ''), (typical, 'O', ''), (bureau, 'O', ''), (practice, 'O', ''), (,, 'O', ''), (but, 'O', ''), (also, 'O', ''), (contradicts, 'O', ''), (Director, 'O', ''), (Wray, 'B', 'PERSON'), (’s, 'O', ''), (testimony, 'O', ''), (to, 'O', ''), (Congress, 'B', 'ORG'), (and, 'O', ''), (his, 'O', ''), (assurances, 'O', ''), (that, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'GPE'), (intended, 'O', ''), (to, 'O', ''), (follow, 'O', ''), (its, 'O', ''), (regular, 'O', ''), (process, 'O', ''), (in, 'O', ''), (this, 'O', ''), (and, 'O', ''), (all, 'O', ''), (personnel, 'O', ''), (matters, 'O', ''), (,, 'O', ''), (”, 'O', ''), (Mr., 'O', ''), (Goelman, 'B', 'PERSON'), (said.“This, 'O', ''), (decision, 'O', ''), (should, 'O', ''), (be, 'O', ''), (deeply, 'O', ''), (troubling, 'O', '

In [18]:
# Render the whole parsed article with its entities highlighted inline in the notebook.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)