In [1]:
import nltk

In [2]:
def ie_preprocess(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]

In [3]:
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"), ("dog", "NN"), ("barked", "VBD"), ("at", "IN"),  ("the", "DT"), ("cat", "NN")]
grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(grammar)
result = cp.parse(sentence)
print(result)

(S
  (NP the/DT little/JJ yellow/JJ dog/NN)
  barked/VBD
  at/IN
  (NP the/DT cat/NN))


In [2]:
from nltk.corpus import conll2000 as cl

In [8]:
print(cl.chunked_sents("train.txt", chunk_types=['NP'])[99])  
# chunk_types parameter can narrow down the range of chunk

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


In [9]:
cp = nltk.RegexpParser("")
test_sents = cl.chunked_sents("test.txt", chunk_types=['NP'])
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


This means 43.4% of the words are not in NP chunks.

In [10]:
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  87.7%%
    Precision:     70.6%%
    Recall:        67.8%%
    F-Measure:     69.2%%


In [11]:
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\
                     for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)\
                    in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [3]:
train_sents = cl.chunked_sents("train.txt", chunk_types=['NP'])
test_sents = cl.chunked_sents("test.txt", chunk_types=['NP'])
# unigram_chunker = UnigramChunker(train_sents)
# print(unigram_chunker.evaluate(test_sents))

In [14]:
# To check what this chunker actually does
postags = sorted(set(pos for sent in train_sents for (word, pos)\
                    in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))

[('#', 'B-NP'), ('$', 'B-NP'), ("''", 'O'), ('(', 'O'), (')', 'O'), (',', 'O'), ('.', 'O'), (':', 'O'), ('CC', 'O'), ('CD', 'I-NP'), ('DT', 'B-NP'), ('EX', 'B-NP'), ('FW', 'I-NP'), ('IN', 'O'), ('JJ', 'I-NP'), ('JJR', 'B-NP'), ('JJS', 'I-NP'), ('MD', 'O'), ('NN', 'I-NP'), ('NNP', 'I-NP'), ('NNPS', 'I-NP'), ('NNS', 'I-NP'), ('PDT', 'B-NP'), ('POS', 'B-NP'), ('PRP', 'B-NP'), ('PRP$', 'B-NP'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'B-NP'), ('RP', 'O'), ('SYM', 'O'), ('TO', 'O'), ('UH', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'B-NP'), ('WP', 'B-NP'), ('WP$', 'B-NP'), ('WRB', 'O'), ('``', 'O')]


In [19]:
class BigramChunker(nltk.ChunkParserI):
    
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]\
                     for sent in train_sents]
        self.t1 = nltk.UnigramTagger(train_data)
        self.tagger = nltk.BigramTagger(train_data, backoff=self.t1)
        
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)\
                    in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [20]:
bigram_chunker = BigramChunker(train_sents)

In [23]:
print(bigram_chunker.evaluate(test_sents))

ChunkParse score:
    IOB Accuracy:  93.4%%
    Precision:     82.3%%
    Recall:        87.0%%
    F-Measure:     84.6%%


In [4]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):
    
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(\
            train_set, algorithm='megam', trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w, t), c) \
            for w, t, c in nltk.chunk.tree2conlltags(sent)]\
            for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)


In [5]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos" : pos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))

LookupError: 

===========================================================================
NLTK was unable to find the megam file!
Use software specific configuration paramaters or set the MEGAM environment variable.

  For more information on megam, see:
    <http://users.umiacs.umd.edu/~hal/megam/index.html>
===========================================================================

-> Windows: I no longer have access to a Windows machine...sorry, download the source.

says the hompage...

In [6]:
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), \
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]

In [8]:
print(cp.parse(sentence))

(S
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


In [9]:
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"), ("saw", "VBD"), 
            ("the", "DT"), ("cat", "NN"), ("sit", "VB"), ("on", "IN"), 
            ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))

(S
  (NP John/NNP)
  thinks/VBZ
  (NP Mary/NN)
  saw/VBD
  (CLAUSE
    (NP the/DT cat/NN)
    (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))


-> This parser misses VP for "saw" and "thinks"

In [11]:
cp = nltk.RegexpParser(grammar, loop=3)
print(cp.parse(sentence))

(S
  (CLAUSE
    (NP John/NNP)
    (VP
      thinks/VBZ
      (CLAUSE
        (NP Mary/NN)
        (VP
          saw/VBD
          (CLAUSE
            (NP the/DT cat/NN)
            (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))))


In [6]:
tree1 = nltk.Tree("NP", ["Alice"])
print(tree1)

(NP Alice)


In [5]:
tree2 = nltk.Tree("NP", ["The", "rabbit"])
print(tree2)

(NP The rabbit)


In [7]:
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)

(S (NP Alice) (VP chased (NP The rabbit)))


In [12]:
print(tree4[1][1])

(NP The rabbit)


In [13]:
tree4.draw()

In [14]:
def traverse(t):
    try:
        t.label()
    except AttributeError:
        print(t, end=" ")
    else:
        print("(", t.label(), end=" ")
        for child in t:
            traverse(child)
        print(")", end=" ")
        

In [19]:
sent = nltk.corpus.treebank.tagged_sents()[22]
print(nltk.ne_chunk(sent, binary=False))

(S
  The/DT
  (GPE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (PERSON Brooke/NNP T./NNP Mossman/NNP)
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (ORGANIZATION University/NNP)
  of/IN
  (PERSON Vermont/NNP College/NNP)
  of/IN
  (GPE Medicine/NNP)
  ./.)


In [20]:
import re

In [21]:
IN = re.compile(r".*\bin\b(?!\b.+ing\b)")

In [22]:
for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"):
    for rel in nltk.sem.extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']


In [23]:
from nltk.corpus import conll2002

In [27]:
vnv = """
(
is/V|
was/V|
werd/V|
wordt/V|
)
.*
van/Prep
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents("ned.train"):
    for r in nltk.sem.extract_rels("PER", "ORG", doc, corpus="conll2002", pattern=VAN):
        print(nltk.sem.clause(r, relsym="VAN"))

VAN('marco_pantani', 'mercatone_uno')
VAN('larmuseau', 'abc_containerline')
VAN('horst_köhler', 'imf')
VAN('simonet', 'binnenlandse_zaken')
VAN('guy_quaden', 'nationale_bank')
VAN('de_bauw', 'buitenlandse_zaken')
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('rosenfeld', 'abc_containerline')
VAN('carlo_gepts', 'vt4')
VAN('lone_leth_larsen', 'deens_cultureel_centrum')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('jean-louis_peninou', 'international_boundaries_research')
VAN('lieven', 'honda')
VAN('talal_g_shamoon', 'intertrust_technologies_corporation')
VAN('albert_frère', 'tractebel')
VAN('robert_spatz', 'okc-beweging')
VAN('bart_bode', 'broederlijk_delen')
VAN('guido_westerwelle', 'fdp')
VAN('martin_bril', 'vrij_nederland')
VAN('frank_rijkaard', 'vrij_nederland')
VAN('filip', 'telecommunicatie')
VAN('maurice_buckmaster', 'special_operations_executive')
VAN('mukamba', 'commissie-lumumba')
VAN('versnick', 'buitenlandse_zaken')
VAN('mukamba', 'miba')
VAN('bart_bode', 'br

In [29]:
for doc in conll2002.chunked_sents("ned.train"):
    for rel in nltk.sem.extract_rels("PER", "ORG", doc, corpus="conll2002", pattern=VAN):
        print(nltk.sem.rtuple(rel, lcon=True, rcon=True))

...'De/Art ploegmaat/N van/Prep')[PER: 'Marco/N Pantani/N'] 'en/Conj kopman/N van/Prep' [ORG: 'Mercatone/N Uno/N']('in/Prep deze/Pron'...
...'In/Prep dezelfde/Pron periode/N was/V')[PER: 'Larmuseau/N'] 'ook/Adv lid/N van/Prep de/Art interkabinettengroep/N rond/Prep' [ORG: 'ABC/N Containerline/N'](',/Punc die/Pron specifiek/Adj was/V opgericht/V'...
...'Dit/Pron heeft/V')[PER: 'Horst/N Köhler/Conj'] ',/Punc de/Art in/Prep mei/N aangetreden/V topman/N van/Prep het/Art' [ORG: 'IMF/N'](',/Punc gisteren/Adv gezegd/V in/Prep'...
...'')[PER: 'Simonet/N'] 'heeft/V de/Art bekommernissen/N overgemaakt/V aan/Prep minister/N van/Prep' [ORG: 'Binnenlandse/N Zaken/N'](''...
...'De/Art woorden/N komen/V van/Prep gouverneur/N')[PER: 'Guy/N Quaden/N'] 'van/Prep de/Art' [ORG: 'Nationale/Adj Bank/N'](',/Punc en/Conj ze/Pron werden/V woensdag/N'...
...'')[PER: 'De/Art Bauw/N'] '(/Punc 34/Num )/Punc was/V vroeger/Adj adjunct-woordvoerder/N van/Prep het/Art ministerie/N van/Prep' [ORG: 'Buitenlandse/N Zaken