In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from pprint import pprint

In [2]:
text = """For nearly a decade, they argue, Facebook has made “serial defensive acquisitions” to protect its dominant position in the market for social networks, according to slides they have shown government officials. Scooping up nascent rivals, they assert, can allow Facebook to charge advertisers higher prices and can give users worse experience."""

In [3]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [6]:
token_tag = preprocess(text)
pprint(token_tag)

[('For', 'IN'),
 ('nearly', 'RB'),
 ('a', 'DT'),
 ('decade', 'NN'),
 (',', ','),
 ('they', 'PRP'),
 ('argue', 'VBP'),
 (',', ','),
 ('Facebook', 'NNP'),
 ('has', 'VBZ'),
 ('made', 'VBN'),
 ('“', 'NNP'),
 ('serial', 'JJ'),
 ('defensive', 'JJ'),
 ('acquisitions', 'NNS'),
 ('”', 'VBP'),
 ('to', 'TO'),
 ('protect', 'VB'),
 ('its', 'PRP$'),
 ('dominant', 'JJ'),
 ('position', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('market', 'NN'),
 ('for', 'IN'),
 ('social', 'JJ'),
 ('networks', 'NNS'),
 (',', ','),
 ('according', 'VBG'),
 ('to', 'TO'),
 ('slides', 'NNS'),
 ('they', 'PRP'),
 ('have', 'VBP'),
 ('shown', 'VBN'),
 ('government', 'NN'),
 ('officials', 'NNS'),
 ('.', '.'),
 ('Scooping', 'VBG'),
 ('up', 'RP'),
 ('nascent', 'JJ'),
 ('rivals', 'NNS'),
 (',', ','),
 ('they', 'PRP'),
 ('assert', 'VBP'),
 (',', ','),
 ('can', 'MD'),
 ('allow', 'VB'),
 ('Facebook', 'NNP'),
 ('to', 'TO'),
 ('charge', 'VB'),
 ('advertisers', 'NNS'),
 ('higher', 'JJR'),
 ('prices', 'NNS'),
 ('and', 'CC'),
 ('can', 'MD'),


In [32]:
#Noun Phrase 
np_pattern = 'NP:{<DT>?<JJ.*>*<NN.*>+}'

In [33]:
noun_ner = nltk.RegexpParser(np_pattern)
ner_extraction = noun_ner.parse(token_tag)
print(ner_extraction)

(S
  For/IN
  nearly/RB
  (NP a/DT decade/NN)
  ,/,
  they/PRP
  argue/VBP
  ,/,
  (NP Facebook/NNP)
  has/VBZ
  made/VBN
  (NP “/NNP)
  (NP serial/JJ defensive/JJ acquisitions/NNS)
  ”/VBP
  to/TO
  protect/VB
  its/PRP$
  (NP dominant/JJ position/NN)
  in/IN
  (NP the/DT market/NN)
  for/IN
  (NP social/JJ networks/NNS)
  ,/,
  according/VBG
  to/TO
  (NP slides/NNS)
  they/PRP
  have/VBP
  shown/VBN
  (NP government/NN officials/NNS)
  ./.
  Scooping/VBG
  up/RP
  (NP nascent/JJ rivals/NNS)
  ,/,
  they/PRP
  assert/VBP
  ,/,
  can/MD
  allow/VB
  (NP Facebook/NNP)
  to/TO
  charge/VB
  (NP advertisers/NNS)
  (NP higher/JJR prices/NNS)
  and/CC
  can/MD
  give/VB
  (NP users/NNS)
  (NP worse/JJR experience/NN)
  ./.)


In [10]:
from nltk.chunk import conlltags2tree, tree2conlltags

In [34]:
ner_tags = tree2conlltags(ner_extraction)
pprint(ner_tags)

[('For', 'IN', 'O'),
 ('nearly', 'RB', 'O'),
 ('a', 'DT', 'B-NP'),
 ('decade', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('they', 'PRP', 'O'),
 ('argue', 'VBP', 'O'),
 (',', ',', 'O'),
 ('Facebook', 'NNP', 'B-NP'),
 ('has', 'VBZ', 'O'),
 ('made', 'VBN', 'O'),
 ('“', 'NNP', 'B-NP'),
 ('serial', 'JJ', 'B-NP'),
 ('defensive', 'JJ', 'I-NP'),
 ('acquisitions', 'NNS', 'I-NP'),
 ('”', 'VBP', 'O'),
 ('to', 'TO', 'O'),
 ('protect', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('dominant', 'JJ', 'B-NP'),
 ('position', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('market', 'NN', 'I-NP'),
 ('for', 'IN', 'O'),
 ('social', 'JJ', 'B-NP'),
 ('networks', 'NNS', 'I-NP'),
 (',', ',', 'O'),
 ('according', 'VBG', 'O'),
 ('to', 'TO', 'O'),
 ('slides', 'NNS', 'B-NP'),
 ('they', 'PRP', 'O'),
 ('have', 'VBP', 'O'),
 ('shown', 'VBN', 'O'),
 ('government', 'NN', 'B-NP'),
 ('officials', 'NNS', 'I-NP'),
 ('.', '.', 'O'),
 ('Scooping', 'VBG', 'O'),
 ('up', 'RP', 'O'),
 ('nascent', 'JJ', 'B-NP'),
 ('rivals', 'NNS', '

In [14]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/venkateshmurugadas/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [17]:
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/venkateshmurugadas/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [26]:
#So to do all the above in one function , there is a function named ne_chunk 

ner_chunk = nltk.ne_chunk(pos_tag(word_tokenize(text)))
for noun in str(ner_chunk).split('\n'):
    if '/NN' in noun:
        print(noun)

  decade/NN
  (PERSON Facebook/NNP)
  “/NNP
  acquisitions/NNS
  position/NN
  market/NN
  networks/NNS
  slides/NNS
  government/NN
  officials/NNS
  rivals/NNS
  (PERSON Facebook/NNP)
  advertisers/NNS
  prices/NNS
  users/NNS
  experience/NN
