In [1]:
import nltk
import spacy

In [2]:
from spacy import displacy
from collections import Counter
from pprint import pprint
nlp=spacy.load('en_core_web_sm')

In [3]:
from bs4 import BeautifulSoup
import requests
import re

######  scrape the webpage in BeautifulSoup 

In [44]:
#Using the web scraping technique with BeautifulSoup
def _scrape_webpage(url):
       
    request = requests.get(url)
    html = request.text
    soup = BeautifulSoup(html,'html5lib')
    
    for script in soup(["script","style","aside"]):
        script.extract()
    
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [45]:
news_web =_scrape_webpage('https://www.nytimes.com/2020/03/31/world/coronavirus-live-news-updates.html?action=click&module=Spotlight&pgtype=Homepage')
text = nlp(news_web)
print(news_web[:5000])

     Grim Models Project High U.S. Toll - The New York Times                                                                                           SectionsSEARCHSkip to contentSkip to site indexWorldLog InLog InToday’s PaperWorld|White House Projects Grim Toll From Coronavirushttps://nyti.ms/3aEdYAY                               The Coronavirus Outbreak                                                                                                             •                                   Latest Updates                                                                               Guide to Financial Help                                                                    Virus Tracker                                                                    Common Questions                                                                    What You Can Do                                                                    Newsletter                                         Advertisement

### Count all the named entities in the document

In [6]:
#Number of named entities
len(text.ents)


667

In [7]:
text

     Grim Models Project High U.S. Toll - The New York Times                                                                                           SectionsSEARCHSkip to contentSkip to site indexWorldLog InLog InToday’s PaperWorld|White House Projects Grim Toll From Coronavirushttps://nyti.ms/3aEdYAY                               The Coronavirus Outbreak                                                                                                             •                                   Latest Updates                                                                               Guide to Financial Help                                                                    Virus Tracker                                                                    Common Questions                                                                    What You Can Do                                                                    Newsletter                                         Advertisement

In [8]:
labels = [x.label_ for x in text.ents]
Counter(labels)

Counter({'ORG': 70,
         'PERSON': 146,
         'CARDINAL': 79,
         'NORP': 55,
         'PERCENT': 9,
         'DATE': 95,
         'GPE': 150,
         'FAC': 2,
         'ORDINAL': 11,
         'TIME': 18,
         'PRODUCT': 2,
         'QUANTITY': 5,
         'MONEY': 8,
         'LOC': 11,
         'LAW': 4,
         'WORK_OF_ART': 2})

### Count the most frequent tokens for the entire document

In [9]:
items = [x.text for x in text.ents]
Counter(items).most_common(15)

[('Trump', 28),
 ('Tuesday', 20),
 ('China', 15),
 ('Americans', 14),
 ('U.S.', 10),
 ('American', 10),
 ('the United States', 9),
 ('Iran', 8),
 ('first', 7),
 ('one', 7),
 ('Europe', 7),
 ('New York', 6),
 ('Las Vegas', 5),
 ('two weeks', 5),
 ('Putin', 5)]

####  Pick a random integer K using Python random module, then pick three consecutive sentences starting with Kth, and print
####  these sentences. Note that you must make sure all picked sentences are in the document.

In [22]:
k = 30 
sentences = [x for x in text.sents]
print(sentences[k])

Anthony S. Fauci, the nation’s leading infectious disease expert, and Dr. Deborah L. Birx, who is coordinating the coronavirus response, displayed that grim projection at the White House on Tuesday, calling it “our real number” but pledging to do everything possible to reduce those numbers even further.


### Extract part-of-speech and lemmatize these consecutive sentences

In [23]:
[(x.orth_, x.pos_, x.lemma_) for x in [y
                                    for y
                                    in nlp(str(sentences[k]))
                                     if not y.is_stop and y.pos_ != 'PUNCT']] 

[('Anthony', 'PROPN', 'Anthony'),
 ('S.', 'PROPN', 'S.'),
 ('Fauci', 'PROPN', 'Fauci'),
 ('nation', 'NOUN', 'nation'),
 ('leading', 'VERB', 'lead'),
 ('infectious', 'ADJ', 'infectious'),
 ('disease', 'NOUN', 'disease'),
 ('expert', 'NOUN', 'expert'),
 ('Dr.', 'PROPN', 'Dr.'),
 ('Deborah', 'PROPN', 'Deborah'),
 ('L.', 'PROPN', 'L.'),
 ('Birx', 'PROPN', 'Birx'),
 ('coordinating', 'VERB', 'coordinate'),
 ('coronavirus', 'PROPN', 'coronavirus'),
 ('response', 'NOUN', 'response'),
 ('displayed', 'VERB', 'display'),
 ('grim', 'ADJ', 'grim'),
 ('projection', 'NOUN', 'projection'),
 ('White', 'PROPN', 'White'),
 ('House', 'PROPN', 'House'),
 ('Tuesday', 'PROPN', 'Tuesday'),
 ('calling', 'VERB', 'call'),
 ('real', 'ADJ', 'real'),
 ('number', 'NOUN', 'number'),
 ('pledging', 'VERB', 'pledge'),
 ('possible', 'ADJ', 'possible'),
 ('reduce', 'VERB', 'reduce'),
 ('numbers', 'NOUN', 'number')]

##### Get and print the named entity annotation for each token

In [27]:
dict([(str(x), x.label_) for x in nlp(str(sentences[k])).ents])

{'Anthony S. Fauci': 'PERSON',
 'Deborah L. Birx': 'PERSON',
 'the White House': 'FAC',
 'Tuesday': 'DATE'}

In [29]:
pprint([(x, x.ent_iob_, x.ent_type_) for x in sentences[k]])

[(Anthony, 'B', 'PERSON'),
 (S., 'I', 'PERSON'),
 (Fauci, 'I', 'PERSON'),
 (,, 'O', ''),
 (the, 'O', ''),
 (nation, 'O', ''),
 (’s, 'O', ''),
 (leading, 'O', ''),
 (infectious, 'O', ''),
 (disease, 'O', ''),
 (expert, 'O', ''),
 (,, 'O', ''),
 (and, 'O', ''),
 (Dr., 'O', ''),
 (Deborah, 'B', 'PERSON'),
 (L., 'I', 'PERSON'),
 (Birx, 'I', 'PERSON'),
 (,, 'O', ''),
 (who, 'O', ''),
 (is, 'O', ''),
 (coordinating, 'O', ''),
 (the, 'O', ''),
 (coronavirus, 'O', ''),
 (response, 'O', ''),
 (,, 'O', ''),
 (displayed, 'O', ''),
 (that, 'O', ''),
 (grim, 'O', ''),
 (projection, 'O', ''),
 (at, 'O', ''),
 (the, 'B', 'FAC'),
 (White, 'I', 'FAC'),
 (House, 'I', 'FAC'),
 (on, 'O', ''),
 (Tuesday, 'B', 'DATE'),
 (,, 'O', ''),
 (calling, 'O', ''),
 (it, 'O', ''),
 (“, 'O', ''),
 (our, 'O', ''),
 (real, 'O', ''),
 (number, 'O', ''),
 (”, 'O', ''),
 (but, 'O', ''),
 (pledging, 'O', ''),
 (to, 'O', ''),
 (do, 'O', ''),
 (everything, 'O', ''),
 (possible, 'O', ''),
 (to, 'O', ''),
 (reduce, 'O', ''),
 (t

#### Visualizing the entities of kth sentence

In [31]:
displacy.render(nlp(str(sentences[k])), jupyter=True, style = 'ent')

###### Visualizing all the entities 

In [32]:
displacy.render(nlp(str(sentences)), jupyter=True, style = 'ent')

#### Code for NER in spacy

In [33]:
text =nlp("President Trump strikes a somber note as he warns of a “painful two weeks ahead.”As many as 25 percent of people infected with the new coronavirus may not show symptoms.Ignoring party lines, governors challenge Trump’s assertions on availability of supplies.‘We are in a cage’: Spanish town lives under a lockdown within a lockdown.Americans are putting pride aside to seek aid.Saving its empty hotel rooms for hospital overflow, Las Vegas opens a homeless shelter in a parking lot.ImageMedical workers transferred the bodies of people who had died after contracting the virus to a temporary morgue in Brooklyn on Monday.Credit...Justin Lane/EPA, via ShutterstockModels predicting expected spread of the virus in the U.S. paint a grim picture.")
print(text)

President Trump strikes a somber note as he warns of a “painful two weeks ahead.”As many as 25 percent of people infected with the new coronavirus may not show symptoms.Ignoring party lines, governors challenge Trump’s assertions on availability of supplies.‘We are in a cage’: Spanish town lives under a lockdown within a lockdown.Americans are putting pride aside to seek aid.Saving its empty hotel rooms for hospital overflow, Las Vegas opens a homeless shelter in a parking lot.ImageMedical workers transferred the bodies of people who had died after contracting the virus to a temporary morgue in Brooklyn on Monday.Credit...Justin Lane/EPA, via ShutterstockModels predicting expected spread of the virus in the U.S. paint a grim picture.


In [37]:
#Annotate the token entity
pprint([(x, x.ent_iob_, x.ent_type) for x in text])

[(President, 'O', 0),
 (Trump, 'B', 380),
 (strikes, 'O', 0),
 (a, 'O', 0),
 (somber, 'O', 0),
 (note, 'O', 0),
 (as, 'O', 0),
 (he, 'O', 0),
 (warns, 'O', 0),
 (of, 'O', 0),
 (a, 'O', 0),
 (“, 'O', 0),
 (painful, 'O', 0),
 (two, 'B', 391),
 (weeks, 'I', 391),
 (ahead, 'I', 391),
 (., 'O', 0),
 (”As, 'O', 0),
 (many, 'O', 0),
 (as, 'O', 0),
 (25, 'B', 393),
 (percent, 'I', 393),
 (of, 'O', 0),
 (people, 'O', 0),
 (infected, 'O', 0),
 (with, 'O', 0),
 (the, 'O', 0),
 (new, 'O', 0),
 (coronavirus, 'O', 0),
 (may, 'O', 0),
 (not, 'O', 0),
 (show, 'O', 0),
 (symptoms, 'O', 0),
 (., 'O', 0),
 (Ignoring, 'O', 0),
 (party, 'O', 0),
 (lines, 'O', 0),
 (,, 'O', 0),
 (governors, 'O', 0),
 (challenge, 'O', 0),
 (Trump, 'B', 380),
 (’s, 'O', 0),
 (assertions, 'O', 0),
 (on, 'O', 0),
 (availability, 'O', 0),
 (of, 'O', 0),
 (supplies, 'O', 0),
 (., 'O', 0),
 (‘We, 'B', 383),
 (are, 'O', 0),
 (in, 'O', 0),
 (a, 'O', 0),
 (cage, 'O', 0),
 (’, 'O', 0),
 (:, 'O', 0),
 (Spanish, 'B', 381),
 (town, 'O', 

In [38]:
# Get the named entity
pprint([(x.text, x.label_) for x in text.ents])

[('Trump', 'PERSON'),
 ('two weeks ahead', 'DATE'),
 ('25 percent', 'PERCENT'),
 ('Trump', 'PERSON'),
 ('‘We', 'ORG'),
 ('Spanish', 'NORP'),
 ('Americans', 'NORP'),
 ('Las Vegas', 'GPE'),
 ('ImageMedical', 'ORG'),
 ('Brooklyn', 'GPE'),
 ('Monday', 'DATE'),
 ('Justin Lane/EPA', 'ORG'),
 ('ShutterstockModels', 'ORG'),
 ('U.S.', 'GPE')]


In [39]:
#Detailed information for named entity
for ent in text.ents:
    print(ent.text, ent.start_char, ent.end_char,
         ent.label_, spacy.explain(ent.label_))

Trump 10 15 PERSON People, including fictional
two weeks ahead 64 79 DATE Absolute or relative dates or periods
25 percent 92 102 PERCENT Percentage, including "%"
Trump 211 216 PERSON People, including fictional
‘We 258 261 ORG Companies, agencies, institutions, etc.
Spanish 278 285 NORP Nationalities or religious or political groups
Americans 332 341 NORP Nationalities or religious or political groups
Las Vegas 430 439 GPE Countries, cities, states
ImageMedical 482 494 ORG Companies, agencies, institutions, etc.
Brooklyn 602 610 GPE Countries, cities, states
Monday 614 620 DATE Absolute or relative dates or periods
Justin Lane/EPA 630 645 ORG Companies, agencies, institutions, etc.
ShutterstockModels 651 669 ORG Companies, agencies, institutions, etc.
U.S. 717 721 GPE Countries, cities, states


In [40]:
# visualization for named entity
displacy.render(text, jupyter=True, style='ent')

In [42]:
displacy.render(text, style='dep', jupyter=True, options ={'distance':75})

In [43]:
options={'distance':75, 'compact':'True', 'color':'yellow', 'bg':'#09a3d5', 'font':'Times'}
displacy.render(text, style='dep', options=options, jupyter='True')

###  De identification

In [62]:
def _replace_person_name(token):
    if token.ent_iob != 0 and token.ent_type_ == 'PERSON':
        return '[REDACTED]'
    return token.string

In [65]:
def _redact_names(nlp_doc):
    for ent in nlp_doc.ents:
        ent.merge()
        tokens = map(_replace_person_name, nlp_doc)
        return ''.join(tokens)

In [66]:
redacted = _redact_names(text)
print(redacted)

     Grim Models Project High U.S. Toll - The New York Times                                                                                           SectionsSEARCHSkip to contentSkip to site indexWorldLog InLog InToday’s PaperWorld|White House Projects Grim Toll From Coronavirushttps://nyti.ms/3aEdYAY                               The Coronavirus Outbreak                                                                                                             •                                   Latest Updates                                                                               Guide to Financial Help                                                                    Virus Tracker                                                                    Common Questions                                                                    What You Can Do                                                                    Newsletter                                         Advertisement

In [67]:
displacy.render(nlp(redacted), jupyter=True, style='ent')