In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk import RegexpParser
from nltk import ne_chunk


def text_preprocess(document):
    sentences = sent_tokenize(document)
    sentences = [word_tokenize(sent) for sent in sentences]
    sentences = [pos_tag(sent) for sent in sentences]
    return(sentences)


input_text = """
Tens of thousands of people are still expected to descend on Byron Bay this weekend, just days after a COVID-positive bachelorette party triggered alerts for a number of popular venues.
Contact tracing efforts have expanded wider across SEQ as authorities locked-down the Princess Alexandra Hospital and hinted at extended stay-at-home orders.
"""
print(text_preprocess(input_text))

[[('Tens', 'NNS'), ('of', 'IN'), ('thousands', 'NNS'), ('of', 'IN'), ('people', 'NNS'), ('are', 'VBP'), ('still', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('descend', 'VB'), ('on', 'IN'), ('Byron', 'NNP'), ('Bay', 'NNP'), ('this', 'DT'), ('weekend', 'NN'), (',', ','), ('just', 'RB'), ('days', 'NNS'), ('after', 'IN'), ('a', 'DT'), ('COVID-positive', 'JJ'), ('bachelorette', 'NN'), ('party', 'NN'), ('triggered', 'VBD'), ('alerts', 'NNS'), ('for', 'IN'), ('a', 'DT'), ('number', 'NN'), ('of', 'IN'), ('popular', 'JJ'), ('venues', 'NNS'), ('.', '.')], [('Contact', 'NNP'), ('tracing', 'VBG'), ('efforts', 'NNS'), ('have', 'VBP'), ('expanded', 'VBN'), ('wider', 'NN'), ('across', 'NN'), ('SEQ', 'NNP'), ('as', 'IN'), ('authorities', 'NNS'), ('locked-down', 'VBP'), ('the', 'DT'), ('Princess', 'NNP'), ('Alexandra', 'NNP'), ('Hospital', 'NNP'), ('and', 'CC'), ('hinted', 'VBD'), ('at', 'IN'), ('extended', 'VBN'), ('stay-at-home', 'JJ'), ('orders', 'NNS'), ('.', '.')]]


In [4]:
grammar = "NP Chunk: {<DT>?(<NN.?>*<IN>*)*<JJ>*<NN.?>*}"
cp = RegexpParser(grammar)
output_text = text_preprocess(input_text)
for tagged_text in output_text:
    result = cp.parse(tagged_text)
    print(result)
    #result.draw()
    for subtree in result.subtrees():
        if subtree.label() == 'NP Chunk': print(subtree)

(S
  (NP Chunk Tens/NNS of/IN thousands/NNS of/IN people/NNS)
  are/VBP
  still/RB
  expected/VBN
  to/TO
  descend/VB
  (NP Chunk on/IN Byron/NNP Bay/NNP)
  (NP Chunk this/DT weekend/NN)
  ,/,
  just/RB
  (NP Chunk days/NNS after/IN)
  (NP Chunk a/DT COVID-positive/JJ bachelorette/NN party/NN)
  triggered/VBD
  (NP Chunk alerts/NNS for/IN)
  (NP Chunk a/DT number/NN of/IN popular/JJ venues/NNS)
  ./.)
(NP Chunk Tens/NNS of/IN thousands/NNS of/IN people/NNS)
(NP Chunk on/IN Byron/NNP Bay/NNP)
(NP Chunk this/DT weekend/NN)
(NP Chunk days/NNS after/IN)
(NP Chunk a/DT COVID-positive/JJ bachelorette/NN party/NN)
(NP Chunk alerts/NNS for/IN)
(NP Chunk a/DT number/NN of/IN popular/JJ venues/NNS)
(S
  (NP Chunk Contact/NNP)
  tracing/VBG
  (NP Chunk efforts/NNS)
  have/VBP
  expanded/VBN
  (NP Chunk wider/NN across/NN SEQ/NNP as/IN authorities/NNS)
  locked-down/VBP
  (NP Chunk the/DT Princess/NNP Alexandra/NNP Hospital/NNP)
  and/CC
  hinted/VBD
  (NP Chunk at/IN)
  extended/VBN
  (NP Chunk 

In [5]:
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<VB.?|IN>+{      # Chink sequences of VBD and IN
  """
cp = RegexpParser(grammar)
for tagged_text in output_text:
    result = cp.parse(tagged_text)
    print(result)
    #result.draw()
    for subtree in result.subtrees():
        if subtree.label() == 'NP Chunk': print(subtree)

(S
  (NP Tens/NNS)
  of/IN
  (NP thousands/NNS)
  of/IN
  (NP people/NNS)
  are/VBP
  (NP still/RB)
  expected/VBN
  (NP to/TO)
  descend/VB
  on/IN
  (NP Byron/NNP Bay/NNP this/DT weekend/NN ,/, just/RB days/NNS)
  after/IN
  (NP a/DT COVID-positive/JJ bachelorette/NN party/NN)
  triggered/VBD
  (NP alerts/NNS)
  for/IN
  (NP a/DT number/NN)
  of/IN
  (NP popular/JJ venues/NNS ./.))
(S
  (NP Contact/NNP)
  tracing/VBG
  (NP efforts/NNS)
  have/VBP
  expanded/VBN
  (NP wider/NN across/NN SEQ/NNP)
  as/IN
  (NP authorities/NNS)
  locked-down/VBP
  (NP the/DT Princess/NNP Alexandra/NNP Hospital/NNP and/CC)
  hinted/VBD
  at/IN
  extended/VBN
  (NP stay-at-home/JJ orders/NNS ./.))


In [6]:
for tagged_text in output_text:
    result = ne_chunk(tagged_text, binary=True)
    for subtree in result.subtrees():
        if subtree.label() == 'NE': print(subtree)

(NE Byron/NNP Bay/NNP)
(NE Contact/NNP)
(NE Princess/NNP Alexandra/NNP Hospital/NNP)


In [7]:
import re
from nltk.corpus import ieer
from nltk.sem import extract_rels, rtuple 

IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in ieer.parsed_docs('NYT_19980315'):
    for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN):
        print(rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
