In [1]:
import nltk

In [2]:
# Toy grammar for the classic ambiguous sentence: the PP "in my pajamas" can
# attach either to the VP (VP -> VP PP) or to the object NP (NP -> Det N PP).
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N |  Det N PP | 'I'
VP -> V NP |VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")

# Tokenise on whitespace and print every parse the chart parser finds.
sent = 'I shot an elephant in my pajamas'.split()
parser = nltk.ChartParser(groucho_grammar)
trees = parser.parse(sent)
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V shot) (NP (Det an) (N elephant)))
    (PP (P in) (NP (Det my) (N pajamas)))))
(S
  (NP I)
  (VP
    (V shot)
    (NP (Det an) (N elephant) (PP (P in) (NP (Det my) (N pajamas))))))


In [3]:
# Small CFG with proper names, determiners and prepositions. A PP may be a
# direct child of VP (VP -> V NP PP) or sit inside an NP (NP -> Det N PP).
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> 'saw' | 'ate' | 'walked'
NP ->  'John' | 'Mary' | 'Bob' | Det N | Det N PP
Det -> 'a' | 'an' | 'the' | 'my'
N -> 'man' | 'dog' | 'cat' | 'telescope' | 'park'
P -> 'in' | 'on' | 'by' | 'with'
""")

In [4]:
def parse(sent, grammar):
    """Print every parse tree of ``sent`` under ``grammar``.

    Args:
        sent: the sentence as a sequence of tokens.
        grammar: grammar handed to ``nltk.parse.RecursiveDescentParser``.

    Returns:
        None; each tree is written to stdout as it is produced.
    """
    recursive_descent = nltk.parse.RecursiveDescentParser(grammar)
    for parse_tree in recursive_descent.parse(sent):
        print(parse_tree)
# Parse a sentence whose trailing PP can attach to the NP or to the VP.
sent = "the dog saw a man in the park".split()
parse(sent, grammar1)

(S
  (NP (Det the) (N dog))
  (VP
    (V saw)
    (NP (Det a) (N man) (PP (P in) (NP (Det the) (N park))))))
(S
  (NP (Det the) (N dog))
  (VP
    (V saw)
    (NP (Det a) (N man))
    (PP (P in) (NP (Det the) (N park)))))


In [5]:
# Recursive grammar: Nom -> Adj Nom allows stacked adjectives, and VP -> V S
# allows a sentence to be embedded inside a verb phrase.
grammar2 = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det Nom | propN
Nom -> Adj Nom | N
VP -> V Adj | V NP | V S | V NP PP
PP -> P NP
propN -> 'Buster' | 'Chatterer' | 'Joe'
Det -> 'the' | 'a'
N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'
Adj -> 'angry' | 'frightened' | 'little' | 'tall'
V -> 'chased' | 'saw' | 'said' | 'thought' | 'was' | 'put'
P -> 'on'
""")

In [6]:
# Exercises the recursive Nom -> Adj Nom rule (stacked adjectives).
sent = 'the angry bear chased the frightened little squirrel'.split()
parse(sent, grammar2)

(S
  (NP (Det the) (Nom (Adj angry) (Nom (N bear))))
  (VP
    (V chased)
    (NP
      (Det the)
      (Nom (Adj frightened) (Nom (Adj little) (Nom (N squirrel)))))))


In [7]:
# Exercises the VP -> V S rule (sentences nested inside verb phrases).
sent = 'Chatterer said Buster thought the tree was tall'.split()
parse(sent, grammar2)

(S
  (NP (propN Chatterer))
  (VP
    (V said)
    (S
      (NP (propN Buster))
      (VP
        (V thought)
        (S (NP (Det the) (Nom (N tree))) (VP (V was) (Adj tall)))))))


In [8]:
# NOTE(review): presumably launches NLTK's interactive recursive-descent parser
# demo; confirm whether it opens a GUI window and blocks a "Run All".
nltk.app.rdparser()

In [9]:
# NOTE(review): presumably launches NLTK's interactive shift-reduce parser
# demo; confirm whether it opens a GUI window and blocks a "Run All".
nltk.app.srparser()



In [10]:
def init_wfst(tokens, grammar):
    """Build the initial well-formed substring table (WFST) for ``tokens``.

    The table is an (n+1) x (n+1) list of lists (n = ``len(tokens)``) filled
    with ``None``. For each token ``i`` the cell ``[i][i+1]`` receives the
    left-hand side of the first production returned by
    ``grammar.productions(rhs=token)``.

    Args:
        tokens: sequence of word tokens.
        grammar: object exposing ``productions(rhs=...)`` whose results
            provide ``lhs()``.

    Returns:
        The freshly created table.

    Raises:
        IndexError: if the grammar has no production for some token. The
            exception type matches what the bare ``productions[0]`` lookup
            used to raise, but the message now names the offending token.
    """
    numtokens = len(tokens)
    # One independent row per start position; the comprehension avoids row aliasing.
    wfst = [[None] * (numtokens + 1) for _ in range(numtokens + 1)]
    for i, token in enumerate(tokens):
        productions = grammar.productions(rhs=token)
        if not productions:
            raise IndexError(
                "no production in grammar covers token %r at position %d" % (token, i)
            )
        # Only the first matching production is recorded.
        wfst[i][i + 1] = productions[0].lhs()
    return wfst
def complete_wfst(wfst, tokens, grammar, trace=False):
    """Fill the cells of ``wfst`` that span two or more tokens.

    For every span width from 2 up to ``len(tokens)``, every start position
    and every split point, the categories stored for the two sub-spans are
    looked up as a right-hand side; when a production matches, its left-hand
    side is stored for the whole span. A later match for the same span
    overwrites an earlier one, and only one left-hand side is kept per
    right-hand side.

    ``wfst`` is modified in place and the same object is returned. With
    ``trace`` set, one line is printed for each cell that is filled.
    """
    # Map each production's RHS tuple to its LHS.
    rhs_to_lhs = {prod.rhs(): prod.lhs() for prod in grammar.productions()}
    n = len(tokens)
    trace_format = "[%s] %3s [%s] %3s [%s] ==> [%s] %3s [%s]"

    for width in range(2, n + 1):
        for start in range(n + 1 - width):
            end = start + width
            for mid in range(start + 1, end):
                left = wfst[start][mid]
                right = wfst[mid][end]
                if not (left and right):
                    continue
                pair = (left, right)
                if pair not in rhs_to_lhs:
                    continue
                parent = rhs_to_lhs[pair]
                wfst[start][end] = parent
                if trace:
                    print(trace_format % (start, left, mid, right, end, start, parent, end))
    return wfst

In [11]:
def display(wfst, tokens):
    """Print ``wfst`` as a text grid.

    The header row lists the end positions 1..len(wfst)-1; each following row
    is prefixed with its start position. Empty (falsy) cells are shown as
    '.'. ``tokens`` is accepted but not used.
    """
    size = len(wfst)
    end_positions = range(1, size)
    header_cells = ["%-4d" % end for end in end_positions]
    print("WFST" + " ".join(header_cells))
    for start in range(size - 1):
        row_cells = "".join("%-5s" % (wfst[start][end] or '.') for end in end_positions)
        print((" %d  " % start) + row_cells)

In [12]:
# Build and show the initial table: only the single-token cells are filled.
tokens = "I shot an elephant in my pajamas".split()
wfst0 = init_wfst(tokens, groucho_grammar)
display(wfst0, tokens)

WFST1    2    3    4    5    6    7   
 0  NP   .    .    .    .    .    .    
 1  .    V    .    .    .    .    .    
 2  .    .    Det  .    .    .    .    
 3  .    .    .    N    .    .    .    
 4  .    .    .    .    P    .    .    
 5  .    .    .    .    .    Det  .    
 6  .    .    .    .    .    .    N    


In [13]:
# complete_wfst fills the table it is given in place, so pass a row-wise copy:
# wfst0 keeps the initial table and this cell can be re-run with the same result.
wfst1 = complete_wfst([row[:] for row in wfst0], tokens, groucho_grammar)
display(wfst1, tokens)

WFST1    2    3    4    5    6    7   
 0  NP   .    .    S    .    .    S    
 1  .    V    .    VP   .    .    VP   
 2  .    .    Det  NP   .    .    .    
 3  .    .    .    N    .    .    .    
 4  .    .    .    .    P    .    PP   
 5  .    .    .    .    .    Det  NP   
 6  .    .    .    .    .    .    N    


In [14]:
# Same completion with tracing enabled. A row-wise copy is passed because
# complete_wfst writes into the table it receives; wfst0 is left untouched.
wfst1 = complete_wfst([row[:] for row in wfst0], tokens, groucho_grammar, trace=True)

[2] Det [3]   N [4] ==> [2]  NP [4]
[5] Det [6]   N [7] ==> [5]  NP [7]
[1]   V [2]  NP [4] ==> [1]  VP [4]
[4]   P [5]  NP [7] ==> [4]  PP [7]
[0]  NP [1]  VP [4] ==> [0]   S [4]
[1]  VP [4]  PP [7] ==> [1]  VP [7]
[0]  NP [1]  VP [7] ==> [0]   S [7]


In [15]:
# Dependency grammar for the same sentence: each rule lists the words a head
# may govern. 'in' may depend on 'shot' or on 'elephant'.
groucho_dep_grammar = nltk.DependencyGrammar.fromstring("""
'shot' -> 'I' | 'elephant' | 'in'
'elephant' -> 'an' | 'in'
'in' -> 'pajamas'
'pajamas' -> 'my'
""")

In [16]:
# Show the dependency grammar's printed form (one head -> dependent pair per line).
print(groucho_dep_grammar)

Dependency grammar with 7 productions
  'shot' -> 'I'
  'shot' -> 'elephant'
  'shot' -> 'in'
  'elephant' -> 'an'
  'elephant' -> 'in'
  'in' -> 'pajamas'
  'pajamas' -> 'my'


In [17]:
# Print every projective dependency parse of the sentence.
pdp = nltk.ProjectiveDependencyParser(groucho_dep_grammar)
sent = "I shot an elephant in my pajamas".split()
trees = pdp.parse(sent)
for tree in trees:
    print(tree)

(shot I (elephant an (in (pajamas my))))
(shot I (elephant an) (in (pajamas my)))


In [18]:
from nltk.corpus import treebank
# First parsed sentence of the file 'wsj_0001.mrg' in the treebank corpus.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
print(t)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


In [None]:
def filter(tree):
    """Return True when ``tree`` is labelled 'VP' and has a subtree child labelled 'S'.

    Only direct children that are ``nltk.Tree`` instances are inspected.

    NOTE(review): this definition shadows the builtin ``filter`` for the rest
    of the notebook; consider a more specific name.
    """
    if tree.label() != 'VP':
        return False
    return any(
        isinstance(child, nltk.Tree) and child.label() == 'S'
        for child in tree
    )
# Collect every matching subtree across all parsed treebank sentences.
[
    subtree
    for tree in treebank.parsed_sents()
    for subtree in tree.subtrees(filter)
]

[Tree('VP', [Tree('VBN', ['named']), Tree('S', [Tree('NP-SBJ', [Tree('-NONE-', ['*-1'])]), Tree('NP-PRD', [Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])]), Tree('PP', [Tree('IN', ['of']), Tree('NP', [Tree('DT', ['this']), Tree('JJ', ['British']), Tree('JJ', ['industrial']), Tree('NN', ['conglomerate'])])])])])]),
 Tree('VP', [Tree('VBD', ['said']), Tree(',', [',']), Tree('``', ['``']), Tree('S', [Tree('NP-SBJ', [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP-PRD', [Tree('DT', ['an']), Tree('JJ', ['old']), Tree('NN', ['story'])])])])]),
 Tree('VP', [Tree('VBD', ['said']), Tree('S', [Tree('-NONE-', ['*T*-1'])])]),
 Tree('VP', [Tree('VBN', ['expected']), Tree('S', [Tree('-NONE-', ['*?*'])])]),
 Tree('VP', [Tree('VBD', ['said']), Tree('S', [Tree('-NONE-', ['*T*-1'])])]),
 Tree('VP', [Tree('VBZ', ['appears']), Tree('S', [Tree('NP-SBJ', [Tree('-NONE-', ['*-1'])]), Tree('VP', [Tree('TO', ['to']), Tree('VP', [Tree('VB', ['be']), Tree('

In [None]:
 nltk.download('ppattach')
entries=nltk.corpus.ppattach.attachments('training')
table=nltk.defaultdict(lambda:nltk.defaultdict(set))
for entry in entries:
    key=entry.noun1+'-'+entry.prep+'-'+entry.noun2
    table[key][entry.attachment].add(entry.verb)
for key in sorted(table):
    if len(table[key])>1:
        print(key,'N:',sorted(table[key]['N']),'V:',sorted(table[key]['V']))

[nltk_data] Downloading package ppattach to
[nltk_data]     C:\Users\Charmander\AppData\Roaming\nltk_data...
[nltk_data]   Package ppattach is already up-to-date!


In [None]:
import nltk
# Draw the parsed sentence at index 3450 of the Sinica treebank corpus.
# NOTE(review): .draw() presumably opens a GUI window — confirm it works in
# the environment this notebook is run in.
nltk.corpus.sinica_treebank.parsed_sents()[3450].draw()

In [None]:
def give(t):
    """Return True for a 'VP' node that looks like a use of give/gave with two objects.

    The node must have more than two children, its second child must be
    labelled 'NP', its third child 'PP-DTV' or 'NP', and the leaves of its
    first child must contain 'give' or 'gave'.
    """
    if t.label() != 'VP' or len(t) <= 2:
        return False
    if t[1].label() != 'NP':
        return False
    if t[2].label() not in ('PP-DTV', 'NP'):
        return False
    head_words = t[0].leaves()
    return 'give' in head_words or 'gave' in head_words
def sent(t):
    """Join the leaves of ``t`` with spaces, skipping any leaf whose first
    character is '*', '-' or '0'.

    NOTE(review): earlier cells bind the name ``sent`` to a token list; this
    definition rebinds it to a function.
    """
    kept = []
    for word in t.leaves():
        if word[0] in '*-0':
            continue
        kept.append(word)
    return ' '.join(kept)
def print_node(t, width):
    """Print a one-line summary of the first three children of ``t``.

    The line holds the text of the first child, then the label and text of
    the second and third children. Lines longer than ``width`` characters are
    cut to ``width`` and suffixed with '...'.
    """
    head, first_arg, second_arg = t[0], t[1], t[2]
    line = "%s %s: %s / %s: %s" % (
        sent(head),
        first_arg.label(),
        sent(first_arg),
        second_arg.label(),
        sent(second_arg),
    )
    if len(line) > width:
        line = line[:width] + '...'
    print(line)
# Scan every parsed treebank sentence and summarise each subtree accepted by
# give(), capping lines at 72 characters.
for tree in nltk.corpus.treebank.parsed_sents():
    for t in tree.subtrees(give):
        print_node(t, 72)

In [None]:
# Probabilistic CFG: the bracketed number is the probability of each
# expansion; the probabilities for a given left-hand side add up to 1.0.
grammar = nltk.PCFG.fromstring("""
S -> NP VP [1.0]
VP -> TV NP [0.4]
VP -> IV [0.3]
VP -> DatV NP NP [0.3]
TV -> 'saw' [1.0]
IV -> 'ate' [1.0]
DatV -> 'gave' [1.0]
NP -> 'telescopes' [0.8]
NP -> 'Jack' [0.2]
""")

In [None]:
# Show the printed form of the probabilistic grammar defined above.
print(grammar)

In [None]:
# Parse with the Viterbi parser and print the string form of each result.
viterbi_parser = nltk.ViterbiParser(grammar)
print([str(parsed) for parsed in viterbi_parser.parse('Jack saw telescopes'.split())])