# Homework: Competitive Grammar Writing

In [1]:
from pcfg_parse_gen import Pcfg, PcfgGenerator, CkyParse
import nltk

def print_tree(tree_string):
    """Pretty-print a bracketed parse string as a text tree.

    Leading and trailing whitespace is stripped before the string is handed
    to ``nltk.Tree.fromstring``; the resulting tree is rendered with its
    ``pretty_print`` method. Nothing is returned.
    """
    parsed = nltk.Tree.fromstring(tree_string.strip())
    parsed.pretty_print()

def draw_tree(tree_string):
    """Draw a bracketed parse string using the tree's ``draw`` method.

    Leading and trailing whitespace is stripped before the string is handed
    to ``nltk.Tree.fromstring``. Nothing is returned.
    """
    parsed = nltk.Tree.fromstring(tree_string.strip())
    parsed.draw()

## Parsing sentences with your grammar

While you are developing your grammar, you should use it to parse both the example sentences and sentences sampled from your own and other grammars.

In [2]:
# Load the grammar used for parsing from the S1, S2 and vocabulary rule files.
parse_gram = Pcfg(["S1.gr","S2.gr","Vocab.gr"])

#reading grammar file: S1.gr
#reading grammar file: S2.gr
#reading grammar file: Vocab.gr


In [5]:
# Build a CKY parser over parse_gram, parse one example sentence, report the
# value the parser returns as cross entropy, and pretty-print every tree.
parser = CkyParse(parse_gram, beamsize=1e-5)
example_sentences = ["Arthur is the king ."]
ce, trees = parser.parse_sentences(example_sentences)
print("-cross entropy: {}".format(ce))
for parse_str in trees:
    print_tree(parse_str)

(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT is) (NP (Det the) (Nbar (Noun king) ))) (Punc .))) )
-cross entropy: -3.7863679540999504
             TOP              
              |                
              S1              
   ___________|___             
  |              _VP          
  |            ___|________    
  |           VP           |  
  |       ____|___         |   
  |      |        NP       |  
  |      |     ___|___     |   
  NP     |    |      Nbar  |  
  |      |    |       |    |   
Proper VerbT Det     Noun Punc
  |      |    |       |    |   
Arthur   is  the     king  .  



#parsing: ['Arthur', 'is', 'the', 'king', '.']
#-cross entropy (bits/word): -3.78637


In [6]:
# Parse a second example sentence with the same parser and show the result.
round_table_sentences = ["five strangers are at the Round Table ."]
ce, trees = parser.parse_sentences(round_table_sentences)
print("-cross entropy: {}".format(ce))
for parse_str in trees:
    print_tree(parse_str)

(TOP (S2 (_Misc (Misc five) (_Misc (Misc strangers) (_Misc (Misc are) (_Prep (Prep at) (_Det (Det the) (_Misc (Misc (_Round Round) (_Table Table)) (_Misc (Misc .) ))))))) ) )
-cross entropy: -9.807330330570931
        TOP                                             
         |                                               
         S2                                             
         |                                               
       _Misc                                            
  _______|_______                                        
 |             _Misc                                    
 |        _______|__________                             
 |       |                _Misc                         
 |       |        __________|_____                       
 |       |       |              _Prep                   
 |       |       |     ___________|_____                 
 |       |       |    |                _Det             
 |       |       |    |      ___________|__

#parsing: ['five', 'strangers', 'are', 'at', 'the', 'Round', 'Table', '.']
#-cross entropy (bits/word): -9.80733


Use `parse_file` to parse a file of sentences.

In [7]:
# Parse the sentences stored in example_sentences.txt. `ce` is the value
# printed as "-cross entropy"; `trees` is returned too but not used in this cell.
ce, trees = parser.parse_file('example_sentences.txt')
print("-cross entropy: {}".format(ce))

(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT is) (NP (Det the) (Nbar (Noun king) ))) (Punc .))) )
(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT rides) (NP (Det the) (Nbar (Nbar (Noun horse) ) (PP (Prep near) (NP (Det the) (Nbar (Noun castle) )))))) (Punc .))) )
(TOP (S2 (_Misc (Misc riding) (_Misc (Misc to) (_Misc (Misc Camelot) (_VerbT (VerbT is) (_Misc (Misc hard) (_Misc (Misc .) )))))) ) )
(TOP (S2 (_Misc (Misc do) (_Misc (Misc coconuts) (_Misc (Misc speak) (_Misc (Misc ?) )))) ) )
(TOP (S2 (_Misc (Misc what) (_Misc (Misc does) (_Proper (Proper Arthur) (_Misc (Misc ride) (_Misc (Misc ?) ))))) ) )
(TOP (S2 (_Misc (Misc who) (_Misc (Misc does) (_Proper (Proper Arthur) (_Misc (Misc suggest) (_Misc (Misc she) (_Misc (Misc carry) (_Misc (Misc ?) ))))))) ) )
(TOP (S2 (_Misc (Misc why) (_Misc (Misc does) (_Misc (Misc England) (_Misc (Misc have) (_Det (Det a) (_Noun (Noun king) (_Misc (Misc ?) ))))))) ) )
(TOP (S2 (_Misc (Misc are) (_Misc (Misc they) (_Misc (Misc suggesting) (_Proper (Pr

#parsing: ['Arthur', 'is', 'the', 'king', '.']
#parsing: ['Arthur', 'rides', 'the', 'horse', 'near', 'the', 'castle', '.']
#parsing: ['riding', 'to', 'Camelot', 'is', 'hard', '.']
#parsing: ['do', 'coconuts', 'speak', '?']
#parsing: ['what', 'does', 'Arthur', 'ride', '?']
#parsing: ['who', 'does', 'Arthur', 'suggest', 'she', 'carry', '?']
#parsing: ['why', 'does', 'England', 'have', 'a', 'king', '?']
#parsing: ['are', 'they', 'suggesting', 'Arthur', 'ride', 'to', 'Camelot', '?']
#parsing: ['five', 'strangers', 'are', 'at', 'the', 'Round', 'Table', '.']
#parsing: ['Guinevere', 'might', 'have', 'known', '.']
#parsing: ['Guinevere', 'should', 'be', 'riding', 'with', 'Patsy', '.']
#parsing: ['it', 'is', 'Sir', 'Lancelot', 'who', 'knows', 'Zoot', '!']
#parsing: ['either', 'Arthur', 'knows', 'or', 'Patsy', 'does', '.']
#parsing: ['neither', 'Sir', 'Lancelot', 'nor', 'Guinevere', 'will', 'speak', 'of', 'it', '.']
#parsing: ['the', 'Holy', 'Grail', 'was', 'covered', 'by', 'a', 'yellow', 'fruit

## Generating sentences with your grammar

While you are developing your grammar, you should generate sentences with it
to check what the grammar is doing. Try to write your grammar so that it
generates sentences that are hard to parse.

In [8]:
# Grammar used for generation: only the S1 and vocabulary rule files are loaded
# here (S2.gr is not included).
gen_gram = Pcfg(["S1.gr","Vocab.gr"])

#reading grammar file: S1.gr
#reading grammar file: Vocab.gr


In [9]:
# Generate 20 sentences from gen_gram and print each one as a space-joined line.
# NOTE(review): the sentences are presumably sampled at random, so the output
# will differ between runs unless the generator is seeded — confirm.
gen = PcfgGenerator(gen_gram)
num_samples = 20
for _ in range(num_samples):
    words = gen.generate()
    print(" ".join(words))

Sir Knight has this weight
no sun covers this sun .
another fruit rides this winter .
each pound carries a pound
no horse has Zoot
that winter rides every fruit .
that fruit drinks every sovereign .
every home covers every sovereign
another castle has that sovereign .
that sovereign has every defeater
this servant carries another servant .
that master is another king
another sovereign drinks another sun .
each coconut drinks any home
any home drinks any castle .
the weight rides no sovereign .
that servant is another coconut .
another castle at each sun drinks a horse .
any land carries a pound .
every sun drinks no castle


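## Using NLTK for POS tagging

The `Misc` entries in `Vocab.gr` are retagged with NLTK's part-of-speech tagger and the result is written to `NewVocab.gr`.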

In [16]:
# 'punkt' provides the tokenizer models. nltk.pos_tag, which is used further
# down to retag the Misc vocabulary, additionally needs the perceptron tagger
# model; without it pos_tag fails with a LookupError on a fresh environment.
# NOTE(review): recent NLTK releases name this resource
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [113]:
import re

# Kept as notebook-level names: DELIMITERS is printed by a later cell, and
# VOCAB_RULES is never filled in here but may be read elsewhere.
VOCAB_RULES = []
DELIMITERS = [" " * i for i in range(10)]

# A rule line looks like "<weight> <tag> <symbol> ...". The groups capture the
# digits directly before the tag, the tag itself, and the first symbol after it.
# Spaces or tabs are accepted as column separators.
RULE_PATTERN = re.compile(
    r"([0-9]*)[ \t]+([A-Za-z_]*)[ \t]+([A-Za-z_0-9?.,';!\"\-:]+)"
)

# Copy Vocab.gr to NewVocab.gr, replacing the tag of every `Misc` rule with the
# tag NLTK assigns to the rule's word. The `with` block guarantees both files
# are closed (and NewVocab.gr fully flushed) even if tagging raises.
with open("Vocab.gr") as vocab_file, open("NewVocab.gr", "w") as new_vocab_file:
    for line in vocab_file:
        new_line = line

        # Only lines that start with a weight (digit or '.') are rules;
        # comments, blank lines and anything else are copied through unchanged.
        rule = RULE_PATTERN.search(line) if re.match(r"[0-9.]", line) else None

        if rule is not None and rule.group(2) == "Misc":
            token = rule.group(3)
            # pos_tag expects a list of tokens; wrap the single word.
            new_pos = nltk.pos_tag([token])[0][1]
            # Replace only the tag column, so a word or trailing comment that
            # happens to contain "Misc" is left untouched.
            new_line = line[:rule.start(2)] + new_pos + line[rule.end(2):]

        # Lines already end with a newline; avoid printing a second one.
        print(new_line, end="")
        new_vocab_file.write(new_line)

            
#print(VOCAB_RULES)

# This is a hand-generated set of preterminal rules.

# It specifies a simple tag set which will be handled by the S2

# grammar and which can (though doesn't necessarily need to be)

# handled by the S1 grammar.

#

# The weights given here may be changed, but no new vocabulary

# items outside of allowed_words.txt should be added.



# Miscellaneous items that don't fit into any of our simple tags are

# given to Misc.  You will want to develop your own tags for these!



# Punctuations

1   Punc    .



# Singular and mass nouns.  Notice that only the

# ones that start with consonants get our Noun tag.

1    Noun    castle

1    Noun    king

1    Noun    defeater

1    Noun    sovereign

1    Noun    servant

1    Noun    corner

1    Noun    land

1    Noun    quest

1    Noun    chalice

1    Noun    master

1    Noun    horse

1    Noun    fruit

1    Noun    swallow

1    Noun    sun

1    Noun    winter

1    Noun    coconut

1    Noun    pound

1    Noun    husk

1    Noun  

In [None]:
         
            # NOTE(review): leftover scratch fragment (never executed: In [None]).
            # As written it cannot run on its own: the lines are indented at the
            # top level of the cell, and `contents` is only defined inside the
            # vocabulary-retagging loop of another cell.
            weight = contents[0]
            original_pos = contents[1]
            token = contents[2]
            
            if original_pos=="Misc" : 
                # NOTE(review): `token` is passed as a bare string here, whereas the
                # retagging cell passes `[token]`; a bare string is presumably tagged
                # character by character — confirm before reusing this.
                new_pos = nltk.pos_tag(token)
                print(new_pos)

In [64]:
# Debug output: show the DELIMITERS list that is built in the
# vocabulary-retagging cell (strings of 0 to 9 spaces).
print(DELIMITERS)

['', ' ', '  ', '   ', '    ', '     ', '      ', '       ', '        ', '         ']
