# NER

# 1. NLTK

In [1]:
import nltk
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/saishruthi.tn@ibm.com/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/saishruthi.tn@ibm.com/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/saishruthi.tn@ibm.com/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/saishruthi.tn@ibm.com/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

Data from : https://www.cnn.com/2019/01/20/tech/facebook-sheryl-sandberg-election-interference/index.html

In [3]:
input_data = "Facebook will partner with German officials ahead of the European Union elections in May to crack down on fake accounts and misinformation.Sheryl Sandberg said the company will work with the German Federal Office for Information and Security as well as other companies and research partners to help guide policy making in Germany and across the EU on election interference."

In [4]:
input_data

'Facebook will partner with German officials ahead of the European Union elections in May to crack down on fake accounts and misinformation.Sheryl Sandberg said the company will work with the German Federal Office for Information and Security as well as other companies and research partners to help guide policy making in Germany and across the EU on election interference.'

## 1.1 Tokenize 

In [5]:
input_data_tokens = nltk.word_tokenize(input_data)

In [6]:
input_data_tokens

['Facebook',
 'will',
 'partner',
 'with',
 'German',
 'officials',
 'ahead',
 'of',
 'the',
 'European',
 'Union',
 'elections',
 'in',
 'May',
 'to',
 'crack',
 'down',
 'on',
 'fake',
 'accounts',
 'and',
 'misinformation.Sheryl',
 'Sandberg',
 'said',
 'the',
 'company',
 'will',
 'work',
 'with',
 'the',
 'German',
 'Federal',
 'Office',
 'for',
 'Information',
 'and',
 'Security',
 'as',
 'well',
 'as',
 'other',
 'companies',
 'and',
 'research',
 'partners',
 'to',
 'help',
 'guide',
 'policy',
 'making',
 'in',
 'Germany',
 'and',
 'across',
 'the',
 'EU',
 'on',
 'election',
 'interference',
 '.']

## 1.2 Extract POS and IOB tag with entity

In [7]:
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(input_data)))
iob_tagged = tree2conlltags(ne_tree)
print (iob_tagged)

[('Facebook', 'NNP', 'B-GPE'), ('will', 'MD', 'O'), ('partner', 'NN', 'O'), ('with', 'IN', 'O'), ('German', 'JJ', 'B-GPE'), ('officials', 'NNS', 'O'), ('ahead', 'RB', 'O'), ('of', 'IN', 'O'), ('the', 'DT', 'O'), ('European', 'NNP', 'B-ORGANIZATION'), ('Union', 'NNP', 'I-ORGANIZATION'), ('elections', 'NNS', 'O'), ('in', 'IN', 'O'), ('May', 'NNP', 'O'), ('to', 'TO', 'O'), ('crack', 'VB', 'O'), ('down', 'RP', 'O'), ('on', 'IN', 'O'), ('fake', 'JJ', 'O'), ('accounts', 'NNS', 'O'), ('and', 'CC', 'O'), ('misinformation.Sheryl', 'NN', 'O'), ('Sandberg', 'NNP', 'B-PERSON'), ('said', 'VBD', 'O'), ('the', 'DT', 'O'), ('company', 'NN', 'O'), ('will', 'MD', 'O'), ('work', 'VB', 'O'), ('with', 'IN', 'O'), ('the', 'DT', 'O'), ('German', 'JJ', 'B-ORGANIZATION'), ('Federal', 'NNP', 'I-ORGANIZATION'), ('Office', 'NNP', 'O'), ('for', 'IN', 'O'), ('Information', 'NNP', 'B-ORGANIZATION'), ('and', 'CC', 'O'), ('Security', 'NNP', 'B-ORGANIZATION'), ('as', 'RB', 'O'), ('well', 'RB', 'O'), ('as', 'IN', 'O'), 

## 1.3 part of speech tagging

In [8]:
input_data_pos = nltk.pos_tag(input_data_tokens)

In [9]:
input_data_pos

[('Facebook', 'NNP'),
 ('will', 'MD'),
 ('partner', 'NN'),
 ('with', 'IN'),
 ('German', 'JJ'),
 ('officials', 'NNS'),
 ('ahead', 'RB'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('European', 'NNP'),
 ('Union', 'NNP'),
 ('elections', 'NNS'),
 ('in', 'IN'),
 ('May', 'NNP'),
 ('to', 'TO'),
 ('crack', 'VB'),
 ('down', 'RP'),
 ('on', 'IN'),
 ('fake', 'JJ'),
 ('accounts', 'NNS'),
 ('and', 'CC'),
 ('misinformation.Sheryl', 'NN'),
 ('Sandberg', 'NNP'),
 ('said', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('will', 'MD'),
 ('work', 'VB'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('German', 'JJ'),
 ('Federal', 'NNP'),
 ('Office', 'NNP'),
 ('for', 'IN'),
 ('Information', 'NNP'),
 ('and', 'CC'),
 ('Security', 'NNP'),
 ('as', 'RB'),
 ('well', 'RB'),
 ('as', 'IN'),
 ('other', 'JJ'),
 ('companies', 'NNS'),
 ('and', 'CC'),
 ('research', 'NN'),
 ('partners', 'NNS'),
 ('to', 'TO'),
 ('help', 'VB'),
 ('guide', 'VB'),
 ('policy', 'NN'),
 ('making', 'NN'),
 ('in', 'IN'),
 ('Germany', 'NNP'),
 ('and', 'CC'),
 ('across', 'I

In [12]:
#Chunk the sentence
# Extracting noun phrase: formed whenever the chunker finds an optional determiner (DT), 
# followed by any number of adjectives (JJ) , and then a noun, (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'
chunk_pattern = nltk.RegexpParser(pattern)
input_data_chunk = chunk_pattern.parse(input_data_pos)
print(input_data_chunk)

(S
  Facebook/NNP
  will/MD
  (NP partner/NN)
  with/IN
  German/JJ
  officials/NNS
  ahead/RB
  of/IN
  the/DT
  European/NNP
  Union/NNP
  elections/NNS
  in/IN
  May/NNP
  to/TO
  crack/VB
  down/RP
  on/IN
  fake/JJ
  accounts/NNS
  and/CC
  (NP misinformation.Sheryl/NN)
  Sandberg/NNP
  said/VBD
  (NP the/DT company/NN)
  will/MD
  work/VB
  with/IN
  the/DT
  German/JJ
  Federal/NNP
  Office/NNP
  for/IN
  Information/NNP
  and/CC
  Security/NNP
  as/RB
  well/RB
  as/IN
  other/JJ
  companies/NNS
  and/CC
  (NP research/NN)
  partners/NNS
  to/TO
  help/VB
  guide/VB
  (NP policy/NN)
  (NP making/NN)
  in/IN
  Germany/NNP
  and/CC
  across/IN
  the/DT
  EU/NNP
  on/IN
  (NP election/NN)
  (NP interference/NN)
  ./.)


In [13]:
input_data_chunk.draw()

In [14]:
#Represent chunk in IOB format. Format: Token, POS tag, chunk tage
input_iob_tagged = tree2conlltags(input_data_chunk)
pprint(input_iob_tagged)

[('Facebook', 'NNP', 'O'),
 ('will', 'MD', 'O'),
 ('partner', 'NN', 'B-NP'),
 ('with', 'IN', 'O'),
 ('German', 'JJ', 'O'),
 ('officials', 'NNS', 'O'),
 ('ahead', 'RB', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('European', 'NNP', 'O'),
 ('Union', 'NNP', 'O'),
 ('elections', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('May', 'NNP', 'O'),
 ('to', 'TO', 'O'),
 ('crack', 'VB', 'O'),
 ('down', 'RP', 'O'),
 ('on', 'IN', 'O'),
 ('fake', 'JJ', 'O'),
 ('accounts', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('misinformation.Sheryl', 'NN', 'B-NP'),
 ('Sandberg', 'NNP', 'O'),
 ('said', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('will', 'MD', 'O'),
 ('work', 'VB', 'O'),
 ('with', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('German', 'JJ', 'O'),
 ('Federal', 'NNP', 'O'),
 ('Office', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('Information', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('Security', 'NNP', 'O'),
 ('as', 'RB', 'O'),
 ('well', 'RB', 'O'),
 ('as', 'IN', 'O'),
 ('other', 'JJ', 'O'),
 ('companies', 'NNS

In [10]:
ne_tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(input_data)))
print(ne_tree)

(S
  (GPE Facebook/NNP)
  will/MD
  partner/NN
  with/IN
  (GPE German/JJ)
  officials/NNS
  ahead/RB
  of/IN
  the/DT
  (ORGANIZATION European/NNP Union/NNP)
  elections/NNS
  in/IN
  May/NNP
  to/TO
  crack/VB
  down/RP
  on/IN
  fake/JJ
  accounts/NNS
  and/CC
  misinformation.Sheryl/NN
  (PERSON Sandberg/NNP)
  said/VBD
  the/DT
  company/NN
  will/MD
  work/VB
  with/IN
  the/DT
  (ORGANIZATION German/JJ Federal/NNP)
  Office/NNP
  for/IN
  (ORGANIZATION Information/NNP)
  and/CC
  (ORGANIZATION Security/NNP)
  as/RB
  well/RB
  as/IN
  other/JJ
  companies/NNS
  and/CC
  research/NN
  partners/NNS
  to/TO
  help/VB
  guide/VB
  policy/NN
  making/NN
  in/IN
  (GPE Germany/NNP)
  and/CC
  across/IN
  the/DT
  (GPE EU/NNP)
  on/IN
  election/NN
  interference/NN
  ./.)


# 2. SpaCy

https://spacy.io/api/annotation#section-named-entities

SpaCy models : https://github.com/explosion/spacy-models

In [15]:
import spacy
import en_core_web_sm

In [16]:
nlp_model = en_core_web_sm.load()

In [17]:
data = nlp_model('Facebook will partner with German officials ahead of the European Union elections in May to crack down on fake accounts and misinformation.Sheryl Sandberg said the company will work with the German Federal Office for Information and Security as well as other companies and research partners to help guide policy making in Germany and across the EU on election interference.')

In [18]:
#Displaying entity
pprint([(X.text, X.label_) for X in data.ents])

[('Facebook', 'ORG'),
 ('German', 'NORP'),
 ('the European Union', 'ORG'),
 ('May', 'DATE'),
 ('Sheryl Sandberg', 'PERSON'),
 ('the German Federal Office for Information and Security', 'ORG'),
 ('Germany', 'GPE'),
 ('EU', 'ORG')]


## 2.1 SpaCy POS tag

https://spacy.io/usage/linguistic-features#section-pos-tagging

https://spacy.io/api/token#attributes

In [22]:
for token in data:
    print(token.text, token.lemma_, token.pos_, token.tag_)

Facebook facebook PROPN NNP
will will VERB MD
partner partner VERB VB
with with ADP IN
German german ADJ JJ
officials official NOUN NNS
ahead ahead ADV RB
of of ADP IN
the the DET DT
European european PROPN NNP
Union union PROPN NNP
elections election NOUN NNS
in in ADP IN
May may PROPN NNP
to to PART TO
crack crack VERB VB
down down PART RP
on on ADP IN
fake fake ADJ JJ
accounts account NOUN NNS
and and CCONJ CC
misinformation misinformation NOUN NN
. . PUNCT .
Sheryl sheryl PROPN NNP
Sandberg sandberg PROPN NNP
said say VERB VBD
the the DET DT
company company NOUN NN
will will VERB MD
work work VERB VB
with with ADP IN
the the DET DT
German german PROPN NNP
Federal federal PROPN NNP
Office office PROPN NNP
for for ADP IN
Information information PROPN NNP
and and CCONJ CC
Security security PROPN NNP
as as ADV RB
well well ADV RB
as as ADP IN
other other ADJ JJ
companies company NOUN NNS
and and CCONJ CC
research research NOUN NN
partners partner NOUN NNS
to to PART TO
help help VERB V

# 2.2 Entity tag

- BILUO Scheme tag: https://spacy.io/api/annotation#section-pos-tagging
- Entity: https://spacy.io/api/annotation#section-named-entities

In [19]:
#token level entity annotation
pprint([(X, X.ent_iob_, X.ent_type_) for X in data])

[(Facebook, 'B', 'ORG'),
 (will, 'O', ''),
 (partner, 'O', ''),
 (with, 'O', ''),
 (German, 'B', 'NORP'),
 (officials, 'O', ''),
 (ahead, 'O', ''),
 (of, 'O', ''),
 (the, 'B', 'ORG'),
 (European, 'I', 'ORG'),
 (Union, 'I', 'ORG'),
 (elections, 'O', ''),
 (in, 'O', ''),
 (May, 'B', 'DATE'),
 (to, 'O', ''),
 (crack, 'O', ''),
 (down, 'O', ''),
 (on, 'O', ''),
 (fake, 'O', ''),
 (accounts, 'O', ''),
 (and, 'O', ''),
 (misinformation, 'O', ''),
 (., 'O', ''),
 (Sheryl, 'B', 'PERSON'),
 (Sandberg, 'I', 'PERSON'),
 (said, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (will, 'O', ''),
 (work, 'O', ''),
 (with, 'O', ''),
 (the, 'B', 'ORG'),
 (German, 'I', 'ORG'),
 (Federal, 'I', 'ORG'),
 (Office, 'I', 'ORG'),
 (for, 'I', 'ORG'),
 (Information, 'I', 'ORG'),
 (and, 'I', 'ORG'),
 (Security, 'I', 'ORG'),
 (as, 'O', ''),
 (well, 'O', ''),
 (as, 'O', ''),
 (other, 'O', ''),
 (companies, 'O', ''),
 (and, 'O', ''),
 (research, 'O', ''),
 (partners, 'O', ''),
 (to, 'O', ''),
 (help, 'O', ''),
 (guid

In [20]:
from spacy import displacy
displacy.render(nlp_model(str(data)), jupyter=True, style='ent')

# 3. Using NLTK with SpaCy

In [41]:
sentence = 'Facebook will partner with German officials ahead of the European Union elections in May to crack down on fake accounts and misinformation.Sheryl Sandberg said the company will work with the German Federal Office for Information and Security as well as other companies and research partners to help guide policy making in Germany and across the EU on election interference.'

In [42]:
spacy_sentence = nlp_model(sentence)

In [43]:
spacy_sentence

Facebook will partner with German officials ahead of the European Union elections in May to crack down on fake accounts and misinformation.Sheryl Sandberg said the company will work with the German Federal Office for Information and Security as well as other companies and research partners to help guide policy making in Germany and across the EU on election interference.

In [44]:
iob_tagged = []
for token in spacy_sentence:
    if token.ent_iob_ != 'O':
        iob_tagged.append((token.text, token.tag_, "{0}-{1}".format(token.ent_iob_, token.ent_type_)))
    else:
        iob_tagged.append((token.text, token.tag_,token.ent_iob_))

In [45]:
iob_tagged

[('Facebook', 'NNP', 'B-ORG'),
 ('will', 'MD', 'O'),
 ('partner', 'VB', 'O'),
 ('with', 'IN', 'O'),
 ('German', 'JJ', 'B-NORP'),
 ('officials', 'NNS', 'O'),
 ('ahead', 'RB', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'B-ORG'),
 ('European', 'NNP', 'I-ORG'),
 ('Union', 'NNP', 'I-ORG'),
 ('elections', 'NNS', 'O'),
 ('in', 'IN', 'O'),
 ('May', 'NNP', 'B-DATE'),
 ('to', 'TO', 'O'),
 ('crack', 'VB', 'O'),
 ('down', 'RP', 'O'),
 ('on', 'IN', 'O'),
 ('fake', 'JJ', 'O'),
 ('accounts', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('misinformation', 'NN', 'O'),
 ('.', '.', 'O'),
 ('Sheryl', 'NNP', 'B-PERSON'),
 ('Sandberg', 'NNP', 'I-PERSON'),
 ('said', 'VBD', 'O'),
 ('the', 'DT', 'O'),
 ('company', 'NN', 'O'),
 ('will', 'MD', 'O'),
 ('work', 'VB', 'O'),
 ('with', 'IN', 'O'),
 ('the', 'DT', 'B-ORG'),
 ('German', 'NNP', 'I-ORG'),
 ('Federal', 'NNP', 'I-ORG'),
 ('Office', 'NNP', 'I-ORG'),
 ('for', 'IN', 'I-ORG'),
 ('Information', 'NNP', 'I-ORG'),
 ('and', 'CC', 'I-ORG'),
 ('Security', 'NNP', 'I-ORG'),
 ('as',