## Named Entity Recognition and De-Identification with SpaCy

#### Importing Libraries


In [2]:
import requests
from bs4 import BeautifulSoup
import spacy
from spacy import displacy
import random
import en_core_web_sm
from collections import Counter
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import pandas as pd
import nltk


### TASK-01

##### 1.1 Scrapring webpage 

In [3]:
r = requests.get("https://www.nbcnews.com/news/world/hong-kong-finance-summit-covid-rcna55498")
soup = BeautifulSoup(r.content,  "html.parser")
soup = soup.find('div',class_='article-body__content')
text = soup.get_text()
print(text[:500],'.....')

HONG KONG — After mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, Hong Kong is ready to turn the page. “Social disturbance is clearly in the past,” the city’s leader, John Lee, said Wednesday at the Four Seasons Hotel, where about 200 finance industry executives from around the world were gathered for a summit. “It has given way to stability, to growing business and community confidence in Hong Kong’s fu .....


##### 1.2 NER in SpaCy

###### 1.2.1 Named Entities 

In [4]:
nlp = en_core_web_sm.load()
spacy_obj= nlp(text)



In [5]:
## Entity counts
print('Entity counts:')
# Count every named entity
labels = [x.label_ for x in spacy_obj.ents]
from collections import Counter
counts = Counter(labels)
print(counts)


Entity counts:
Counter({'GPE': 51, 'DATE': 33, 'PERSON': 30, 'CARDINAL': 15, 'ORG': 12, 'NORP': 10, 'LOC': 2, 'PERCENT': 2, 'FAC': 1, 'ORDINAL': 1})


In [6]:
## Entities
print('Entitites:')
for x in spacy_obj.ents:
  print(dict(text=x.text, label=x.label_, start_char=x.start_char, end_char=x.start_char+len(x.text)))

Entitites:
{'text': 'HONG KONG', 'label': 'GPE', 'start_char': 0, 'end_char': 9}
{'text': '2019', 'label': 'DATE', 'start_char': 33, 'end_char': 37}
{'text': 'Hong Kong', 'label': 'GPE', 'start_char': 159, 'end_char': 168}
{'text': 'John Lee', 'label': 'PERSON', 'start_char': 260, 'end_char': 268}
{'text': 'Wednesday', 'label': 'DATE', 'start_char': 275, 'end_char': 284}
{'text': 'the Four Seasons Hotel', 'label': 'ORG', 'start_char': 288, 'end_char': 310}
{'text': 'about 200', 'label': 'CARDINAL', 'start_char': 318, 'end_char': 327}
{'text': 'Hong Kong’s', 'label': 'GPE', 'start_char': 486, 'end_char': 497}
{'text': 'Chinese', 'label': 'NORP', 'start_char': 600, 'end_char': 607}
{'text': 'Hong Kong’s', 'label': 'GPE', 'start_char': 738, 'end_char': 749}
{'text': 'China', 'label': 'GPE', 'start_char': 771, 'end_char': 776}
{'text': 'American', 'label': 'NORP', 'start_char': 902, 'end_char': 910}
{'text': 'China', 'label': 'GPE', 'start_char': 1004, 'end_char': 1009}
{'text': 'U.S.', 'l

In [7]:
## Token labels
print('Token-labels:')
for x in spacy_obj:
  print(dict(text=x, prefix=x.ent_iob_, entity_type=x.ent_type_))

Token-labels:
{'text': HONG, 'prefix': 'B', 'entity_type': 'GPE'}
{'text': KONG, 'prefix': 'I', 'entity_type': 'GPE'}
{'text': —, 'prefix': 'O', 'entity_type': ''}
{'text': After, 'prefix': 'O', 'entity_type': ''}
{'text': mass, 'prefix': 'O', 'entity_type': ''}
{'text': unrest, 'prefix': 'O', 'entity_type': ''}
{'text': in, 'prefix': 'O', 'entity_type': ''}
{'text': 2019, 'prefix': 'B', 'entity_type': 'DATE'}
{'text': ,, 'prefix': 'O', 'entity_type': ''}
{'text': a, 'prefix': 'O', 'entity_type': ''}
{'text': pandemic, 'prefix': 'O', 'entity_type': ''}
{'text': that, 'prefix': 'O', 'entity_type': ''}
{'text': left, 'prefix': 'O', 'entity_type': ''}
{'text': it, 'prefix': 'O', 'entity_type': ''}
{'text': isolated, 'prefix': 'O', 'entity_type': ''}
{'text': from, 'prefix': 'O', 'entity_type': ''}
{'text': the, 'prefix': 'O', 'entity_type': ''}
{'text': world, 'prefix': 'O', 'entity_type': ''}
{'text': and, 'prefix': 'O', 'entity_type': ''}
{'text': the, 'prefix': 'O', 'entity_type': ''}


###### 1.2.2 Most frequent tokens

In [8]:
tokens = [token.text for token in spacy_obj]
token_counts = Counter(tokens)

print(token_counts)

Counter({',': 80, 'the': 73, '.': 57, 'to': 41, 'and': 39, 'a': 36, 'Kong': 34, 'in': 33, 'Hong': 32, 'of': 29, 'that': 20, '“': 19, '’s': 18, 'for': 17, '”': 16, 'as': 15, 'on': 15, 'said': 14, 'is': 13, '\xa0': 12, 'have': 12, 'business': 11, '-': 11, 'from': 10, 'law': 10, 'summit': 9, 'was': 9, 'security': 8, 'has': 8, 'with': 8, 'are': 8, 'an': 8, 'national': 7, 'Lee': 7, 'about': 7, 'been': 7, 'China': 7, 'he': 7, 'which': 7, 'it': 6, 'at': 6, 'restrictions': 6, 'people': 6, 'not': 6, 'world': 5, 'Chinese': 5, 'financial': 5, 'by': 5, '/': 5, 'Getty': 5, 'out': 5, 'Covid': 5, 'last': 5, 'report': 5, 'more': 5, '2019': 4, 'city': 4, ';': 4, 'its': 4, 'them': 4, 'who': 4, 'say': 4, 'they': 4, 'center': 4, 'Paul': 4, 'international': 4, 'there': 4, 'after': 4, 'rule': 4, 'government': 4, 'month': 4, 'The': 4, 'global': 4, 'than': 4, 'be': 4, '—': 3, '200': 3, 'finance': 3, 'stability': 3, 'some': 3, 'many': 3, 'democracy': 3, 'very': 3, 'top': 3, 'Yeung': 3, 'Bloomberg': 3, 'via': 3

###### 1.2.3 Picking a random integer K using Python random module, then pick three consecutive sentences starting with Kth sentence, and print these sentences

In [9]:


sentences_original = sent_tokenize(text)
# for i,k in range(0, len(sentences_original)): 
k = random.randint(0, len(sentences_original)-3)
a = []
for j, sentence in enumerate(sentences_original):
    if j == k:
        
        kth = sentences_original[k],
        kkth = sentences_original[k+1],
        kkkth = sentences_original[k+2]
        a.append(sentences_original[k]+sentences_original[k+1] + sentences_original[k+2])
        print(k,kth),
        print(k+1,kkth),
        print(k+2,kkth)
     

6 ('And as the subject of U.S. sanctions himself, Lee is unable to hold an account with many of the very banks he was addressing.John Lee, Hong Kong’s top leader, center, at the finance summit on Wednesday.',)
7 ('Paul Yeung / Bloomberg via Getty ImagesNone of that seems to have dampened the mood at this week’s events, which started on Monday with a fintech conference and conclude this weekend with the return of a major international sporting event, the Rugby Sevens tournament.Iñaki Amate, chairman of the European Chamber of Commerce in Hong Kong, was upbeat about the event and the pro-business atmosphere he encountered.',)
8 ('Paul Yeung / Bloomberg via Getty ImagesNone of that seems to have dampened the mood at this week’s events, which started on Monday with a fintech conference and conclude this weekend with the return of a major international sporting event, the Rugby Sevens tournament.Iñaki Amate, chairman of the European Chamber of Commerce in Hong Kong, was upbeat about the eve

###### 1.2.4 Extract Part of speech and lemmatize

In [10]:
a_string = ' '.join([str(item) for item in a])
text1 = word_tokenize(a_string)
print('POS TAG :',nltk.pos_tag(text1))
tokenizer = RegexpTokenizer(r'\w+')
mytext = tokenizer.tokenize(a_string.lower())
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in mytext]
print("lemmatized_words:",lemmatized_words)

POS TAG : [('And', 'CC'), ('as', 'IN'), ('the', 'DT'), ('subject', 'NN'), ('of', 'IN'), ('U.S.', 'NNP'), ('sanctions', 'NNS'), ('himself', 'PRP'), (',', ','), ('Lee', 'NNP'), ('is', 'VBZ'), ('unable', 'JJ'), ('to', 'TO'), ('hold', 'VB'), ('an', 'DT'), ('account', 'NN'), ('with', 'IN'), ('many', 'JJ'), ('of', 'IN'), ('the', 'DT'), ('very', 'RB'), ('banks', 'NNS'), ('he', 'PRP'), ('was', 'VBD'), ('addressing.John', 'JJ'), ('Lee', 'NNP'), (',', ','), ('Hong', 'NNP'), ('Kong', 'NNP'), ('’', 'NNP'), ('s', 'JJ'), ('top', 'JJ'), ('leader', 'NN'), (',', ','), ('center', 'NN'), (',', ','), ('at', 'IN'), ('the', 'DT'), ('finance', 'NN'), ('summit', 'NN'), ('on', 'IN'), ('Wednesday.Paul', 'NNP'), ('Yeung', 'NNP'), ('/', 'NNP'), ('Bloomberg', 'NNP'), ('via', 'IN'), ('Getty', 'NNP'), ('ImagesNone', 'NNP'), ('of', 'IN'), ('that', 'DT'), ('seems', 'VBZ'), ('to', 'TO'), ('have', 'VB'), ('dampened', 'VBN'), ('the', 'DT'), ('mood', 'NN'), ('at', 'IN'), ('this', 'DT'), ('week', 'NN'), ('’', 'NNP'), ('s',

In [11]:
def tagger(sentence, nlp=nlp):

  print('')
  print('Original sentence')
  print(sentence)
  
  spacy_obj= nlp(sentence)

  #print('Entitites:')
  rows = []
  for x in spacy_obj.ents:
    rows.append(dict(text=x.text, label=x.label_, start_char=x.start_char, end_char=x.start_char+len(x.text)))

  df = pd.DataFrame(rows)
  print(df)

  # Visualize Entities
  displacy.render(spacy_obj, jupyter=True, style='ent')

In [12]:
tagger(a_string)



Original sentence
And as the subject of U.S. sanctions himself, Lee is unable to hold an account with many of the very banks he was addressing.John Lee, Hong Kong’s top leader, center, at the finance summit on Wednesday.Paul Yeung / Bloomberg via Getty ImagesNone of that seems to have dampened the mood at this week’s events, which started on Monday with a fintech conference and conclude this weekend with the return of a major international sporting event, the Rugby Sevens tournament.Iñaki Amate, chairman of the European Chamber of Commerce in Hong Kong, was upbeat about the event and the pro-business atmosphere he encountered.“I have to say how positively surprised I was to see that there was a very good vibe,” Amate said Tuesday as he headed for a tour and welcome dinner at the M+ art museum.
                                text     label  start_char  end_char
0                               U.S.       GPE          22        26
1                                Lee    PERSON          

###### 1.2.5 Entity Annoation for kth sentence

In [13]:
k_string = ' '.join([str(item) for item in kth])

spacy_obj1= nlp(k_string)
print('Entity Annotation of kth sentence:\n')
for x in spacy_obj1:
  print(dict(text=x, prefix=x.ent_iob_, entity_type=x.ent_type_))

Entity Annotation of kth sentence:

{'text': And, 'prefix': 'O', 'entity_type': ''}
{'text': as, 'prefix': 'O', 'entity_type': ''}
{'text': the, 'prefix': 'O', 'entity_type': ''}
{'text': subject, 'prefix': 'O', 'entity_type': ''}
{'text': of, 'prefix': 'O', 'entity_type': ''}
{'text': U.S., 'prefix': 'B', 'entity_type': 'GPE'}
{'text': sanctions, 'prefix': 'O', 'entity_type': ''}
{'text': himself, 'prefix': 'O', 'entity_type': ''}
{'text': ,, 'prefix': 'O', 'entity_type': ''}
{'text': Lee, 'prefix': 'B', 'entity_type': 'PERSON'}
{'text': is, 'prefix': 'O', 'entity_type': ''}
{'text': unable, 'prefix': 'O', 'entity_type': ''}
{'text': to, 'prefix': 'O', 'entity_type': ''}
{'text': hold, 'prefix': 'O', 'entity_type': ''}
{'text': an, 'prefix': 'O', 'entity_type': ''}
{'text': account, 'prefix': 'O', 'entity_type': ''}
{'text': with, 'prefix': 'O', 'entity_type': ''}
{'text': many, 'prefix': 'O', 'entity_type': ''}
{'text': of, 'prefix': 'O', 'entity_type': ''}
{'text': the, 'prefix': 'O

###### 1.2.6.1  Visualize entities of Kth sentence

In [14]:
displacy.render(spacy_obj1,jupyter=True,style ='ent')

###### 1.2.6.1  Visualize dependencies of Kth sentence

In [15]:
displacy.serve(spacy_obj1, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


##### 1.2.7 Visualize entities of entire document

In [16]:
displacy.render(spacy_obj,jupyter=True, style="ent")

### TASK-02 - Replacing the NER - PERSON to "[REDACTED]"

In [35]:
!pip install re
import re

def replace_name(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    else:
        return token.text

# Loop through all the entities in a document and check if they are names
def scrub(text):
    doc = nlp(text)
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_name, doc)
    return " ".join(tokens)

redacted = scrub(text)
displacy.render(nlp(redacted), style="ent")


ERROR: Could not find a version that satisfies the requirement re (from versions: none)
ERROR: No matching distribution found for re
