# Homework: Competitive Grammar Writing

In [75]:
from pcfg_parse_gen import Pcfg, PcfgGenerator, CkyParse
import nltk

def print_tree(tree_string):
    tree_string = tree_string.strip()
    tree = nltk.Tree.fromstring(tree_string)
    tree.pretty_print()

def draw_tree(tree_string):
    tree_string = tree_string.strip()
    tree = nltk.Tree.fromstring(tree_string)
    tree.draw()

## Parsing sentences with your grammar

While you are developing your grammar you should parse with your grammar both example sentences and samples from your and other grammars.

In [76]:
parse_gram = Pcfg(["S1.gr","S2.gr","Vocab.gr"])

#reading grammar file: S1.gr
#reading grammar file: S2.gr
#reading grammar file: Vocab.gr


In [77]:
parser = CkyParse(parse_gram, beamsize=0.00001)
ce, trees = parser.parse_sentences(["Arthur is the king ."])
print("-cross entropy: {}".format(ce))
for tree_string in trees:
    print_tree(tree_string)

(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT is) (NP (Det the) (Nbar (Noun king) ))) (Punc .))) )
-cross entropy: -3.7863679540999504
             TOP              
              |                
              S1              
   ___________|___             
  |              _VP          
  |            ___|________    
  |           VP           |  
  |       ____|___         |   
  |      |        NP       |  
  |      |     ___|___     |   
  NP     |    |      Nbar  |  
  |      |    |       |    |   
Proper VerbT Det     Noun Punc
  |      |    |       |    |   
Arthur   is  the     king  .  



#parsing: ['Arthur', 'is', 'the', 'king', '.']
#-cross entropy (bits/word): -3.78637


In [78]:
ce, trees = parser.parse_sentences(["five strangers are at the Round Table ."])
print("-cross entropy: {}".format(ce))
for tree_string in trees:
    print_tree(tree_string)

(TOP (S2 (_Misc (Misc five) (_Misc (Misc strangers) (_Misc (Misc are) (_Prep (Prep at) (_Det (Det the) (_Misc (Misc (_Round Round) (_Table Table)) (_Misc (Misc .) ))))))) ) )
-cross entropy: -9.807330330570931
        TOP                                             
         |                                               
         S2                                             
         |                                               
       _Misc                                            
  _______|_______                                        
 |             _Misc                                    
 |        _______|__________                             
 |       |                _Misc                         
 |       |        __________|_____                       
 |       |       |              _Prep                   
 |       |       |     ___________|_____                 
 |       |       |    |                _Det             
 |       |       |    |      ___________|__

#parsing: ['five', 'strangers', 'are', 'at', 'the', 'Round', 'Table', '.']
#-cross entropy (bits/word): -9.80733


Use `parse_file` to parse a file of sentences.

In [79]:
ce, trees = parser.parse_file('example_sentences.txt')
print("-cross entropy: {}".format(ce))

(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT is) (NP (Det the) (Nbar (Noun king) ))) (Punc .))) )
(TOP (S1 (NP (Proper Arthur) ) (_VP (VP (VerbT rides) (NP (Det the) (Nbar (Nbar (Noun horse) ) (PP (Prep near) (NP (Det the) (Nbar (Noun castle) )))))) (Punc .))) )
(TOP (S2 (_Misc (Misc riding) (_Misc (Misc to) (_Misc (Misc Camelot) (_VerbT (VerbT is) (_Misc (Misc hard) (_Misc (Misc .) )))))) ) )
(TOP (S2 (_Misc (Misc do) (_Misc (Misc coconuts) (_Misc (Misc speak) (_Misc (Misc ?) )))) ) )
(TOP (S2 (_Misc (Misc what) (_Misc (Misc does) (_Proper (Proper Arthur) (_Misc (Misc ride) (_Misc (Misc ?) ))))) ) )
(TOP (S2 (_Misc (Misc who) (_Misc (Misc does) (_Proper (Proper Arthur) (_Misc (Misc suggest) (_Misc (Misc she) (_Misc (Misc carry) (_Misc (Misc ?) ))))))) ) )
(TOP (S2 (_Misc (Misc why) (_Misc (Misc does) (_Misc (Misc England) (_Misc (Misc have) (_Det (Det a) (_Noun (Noun king) (_Misc (Misc ?) ))))))) ) )
(TOP (S2 (_Misc (Misc are) (_Misc (Misc they) (_Misc (Misc suggesting) (_Proper (Pr

#parsing: ['Arthur', 'is', 'the', 'king', '.']
#parsing: ['Arthur', 'rides', 'the', 'horse', 'near', 'the', 'castle', '.']
#parsing: ['riding', 'to', 'Camelot', 'is', 'hard', '.']
#parsing: ['do', 'coconuts', 'speak', '?']
#parsing: ['what', 'does', 'Arthur', 'ride', '?']
#parsing: ['who', 'does', 'Arthur', 'suggest', 'she', 'carry', '?']
#parsing: ['why', 'does', 'England', 'have', 'a', 'king', '?']
#parsing: ['are', 'they', 'suggesting', 'Arthur', 'ride', 'to', 'Camelot', '?']
#parsing: ['five', 'strangers', 'are', 'at', 'the', 'Round', 'Table', '.']
#parsing: ['Guinevere', 'might', 'have', 'known', '.']
#parsing: ['Guinevere', 'should', 'be', 'riding', 'with', 'Patsy', '.']
#parsing: ['it', 'is', 'Sir', 'Lancelot', 'who', 'knows', 'Zoot', '!']
#parsing: ['either', 'Arthur', 'knows', 'or', 'Patsy', 'does', '.']
#parsing: ['neither', 'Sir', 'Lancelot', 'nor', 'Guinevere', 'will', 'speak', 'of', 'it', '.']
#parsing: ['the', 'Holy', 'Grail', 'was', 'covered', 'by', 'a', 'yellow', 'fruit

## Generating sentences with your grammar

While you are developing your grammar you should generate sentences with your grammar
to check what your grammar is doing. Try to write your grammars to that it will 
generate hard to parse sentences.

In [80]:
gen_gram = Pcfg(["S1.gr","Vocab.gr"])

#reading grammar file: S1.gr
#reading grammar file: Vocab.gr


In [81]:
gen = PcfgGenerator(gen_gram)
for _ in range(20):
    print(" ".join(gen.generate()))

another sovereign has no sovereign
Sir Knight has that weight
any chalice is the quest
see
a swallow carries that coconut .
Arthur carries Patsy
this home carries every swallow .
no swallow is each servant on this fruit with the pound
no master at another land carries each horse
that story has another sovereign .
a castle carries no defeater .
that story drinks Guinevere
a castle is each coconut across any sun .
no chalice rides no swallow
this swallow has that winter
another story drinks each husk into every swallow
this winter below any coconut covers that horse .
no servant has no sun .
the servant is that husk on a swallow .
that chalice carries each horse


using nltk for pos tagging 

In [82]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [108]:
## CREATING NewVocab.gr

## have pos for already existing tokens
## add new tokens from allowed_words and get pos for them also.

import re 
vocab_file = open("Vocab.gr")
VOCAB_RULES=[]
new_vocab_file=open("NewVocab.gr",'w')
NEW_POS=[]

NEW_POS_LINE=[]

ALL_POS=[]

DELIMITERS=[" "*i for i in range(10)]

for line in vocab_file : 
    if line[0] != "#" and re.match(r'[0-9|\.]',line[0]) and line[0] != " " : ## A VALID GRAMMAR 
            
            REGEX=r"([0-9]*)\ +([A-Za-z_]*)\ +([A-Za-z_0-9\?\.\,\'\;'\!\"\--\-\:]*)"
            contents = re.findall(REGEX,line)[0]
            
            weight = contents[0]
            original_pos = contents[1]
            token = contents[2]
            
            if original_pos == "Misc" : 
                new_pos = nltk.pos_tag([token])[0][1]  
                new_line=re.sub(r'Misc',new_pos,line)
                NEW_POS.append(new_pos)
                ALL_POS.append(new_pos)
                #print(new_pos)
                #print(new_line)
                new_vocab_file.write(new_line)
                NEW_POS_LINE.append(new_line)
            else : 
                #print(line)
                new_vocab_file.write(line)
                NEW_POS_LINE.append(line)
                ALL_POS.append(original_pos)
    else : 
        #print(line)
        new_vocab_file.write(line)
        NEW_POS_LINE.append(line)


new_vocab_file.write("# NEW VOCABULARY FROM allowed_words.txt\n")

allowed_word_file = open("allowed_words.txt","r")       

for token in allowed_word_file : 
    
    token = token.strip()
    pos_token = nltk.pos_tag([token])[0][1]
    token_weight = 1
    #print("{}".format(pos_token))
    new_vocab_file.write("{}   {}   {}\n".format(token_weight,pos_token,token))
    
        

new_vocab_file.close()

## make the set of all part of speeches 
POS_set=set(ALL_POS)


In [126]:
## -- PENDING  : BIGRAM FOR S2

from nltk import bigrams, trigrams 
from collections import defaultdict
example_sents_file = open('example_sentences.txt','r')
sent_corpus = example_sents_file.readlines()
tokenize_text = [nltk.word_tokenize(sent) for sent in sent_corpus]
bigram_corpus = [list(bigrams(sent)) for sent in tokenize_text]


model = defaultdict(int)
pos_model = defaultdict(int)


## COMPUTING BIGRAMS
for i in range(len(bigram_corpus)) : 
    for w1, w2 in bigram_corpus[i]:
        model[w1,w2]+= 1
        
        pos_w1 = nltk.pos_tag([w1])[0][1]
        pos_w2 = nltk.pos_tag([w2])[0][1]
        
        
        ## ACTUALLY GET THE POS FROM THE VOCAB.GR
        pos_model[pos_w1,pos_w2]+=1
        

In [3]:
## ESTIMATING SINGLE TOKEN FREQUENCIES IN example_sents_file
## POPULATE THE df's freq from more sampling.

from nltk.probability import FreqDist
import numpy as np 
import pandas as pd 
example_sents_file = open('example_sentences.txt','r')
corpus = example_sents_file.readlines()
tokenized_corpus = [nltk.word_tokenize(sent) for sent in corpus ]
FLAT_TOKEN_LIST=[]
for a in tokenized_corpus : 
    for b in a : 
        FLAT_TOKEN_LIST.append(b)
        
vocab_dist = FreqDist(FLAT_TOKEN_LIST)
vocab_dist_df = pd.DataFrame(data=list(vocab_dist.values()),columns=['freq'])
vocab_dist_df['token'] = list(vocab_dist.keys())
vocab_dist_df['pos'] = vocab_dist_df.apply(lambda x: nltk.pos_tag([x['token']])[0][1],axis=1)

total_count = vocab_dist_df['freq'].sum()
vocab_dist_df['prob'] = vocab_dist_df.apply(lambda x: int(x['freq'])/total_count,axis=1 )


In [113]:
## CREATING S2_new.gr


## SCRIPT FOR NEW RULES IN S2.gr
## THE IDEA IS TO GENERATE a -> b c 
## THE WEIGHTS ARE KEPT ONE FOR SYMPLICITY NOW.

A_RULES = ["_{}".format(non_terminal) for non_terminal in POS_set ]
A_RULES_SPACE = []
A_RULES_SPACE.append("")

for i in range(len(A_RULES)) : 
    A_RULES_SPACE.append(A_RULES[i])


S2_ALL_RULES=[]
s2_new_file=open("S2_new.gr","w")


for a in A_RULES : 
    s2_new_file.write("1\tS2\t{}\n".format(a))
    
## CREATE S2_new.gr with 1 a b c rules
for a in A_RULES : 
    for b in POS_set : 
        for c in A_RULES_SPACE : 
            #print("1 {}\t{}\t{}".format(a,b,c))
            
            if c == "" : 
                S2_ALL_RULES.append("1\t{}\t{}\n".format(a,b))
                s2_new_file.write("1\t{}\t{}\n".format(a,b))
            else : 
                S2_ALL_RULES.append("1\t{}\t{}\t{}\n".format(a,b,c))
                s2_new_file.write("1\t{}\t{}\t{}\n".format(a,b,c))


    
    
s2_new_file.close()
#print(S2_ALL_RULES)

In [None]:

def update_vocab_weights(df,vocab_file) : 
    for line in vocab_file : 
        if line[0] != "#" and re.match(r'[0-9|\.]',line[0]) and line[0] != " " : ## A VALID GRAMMAR 
                
            REGEX=r"([0-9]*)\ +([A-Za-z_]*)\ +([A-Za-z_0-9\?\.\,\'\;'\!\"\--\-\:]*)"
            contents = re.findall(REGEX,line)[0]
            
            old_weight = contents[0]
            original_pos = contents[1]
            token = contents[2]
            
            ## SEARCH IN THE DF
            new_weight = df[df['token']==token]['freq']
            if new_weight is None or new_weight==0 : 
                new_weight = old_weight
            
            NEW_LINE = "{}\t{}\t{}"
                
        else : 
            
            vocab_file.write(line)
            
            

## Creating rules from devset.trees

In [12]:

import nltk
from nltk import Tree

def getTreesFromDevset(strFilename):
    treelist=[]
    onesentence=""
    with open(strFilename) as file:
        lines = file.readlines()
        # print(lines, "TYPE", type(lines))
        for currentline in lines:
            # print ("=WITHout PART:= " , currentline)
            currentline = currentline.partition("\n")[0]
            # print ("=WITH PART:= " , currentline)
            if currentline[0].isspace():
                onesentence = onesentence + " " + currentline.strip()
            else:
                if len(onesentence) > 0:
                    treelist[-1] = treelist[-1] + onesentence
                    treelist.append(currentline)
                    # treeset = treeset + onesentence + '\n' + currentline 
                    onesentence = ""
                else:
                    treelist.append(currentline) # = treeset + onesentence + '\n' + currentline 

    # eof (last tree) -- tedious algorithm but whatever (for now)
    if len(onesentence) > 0:
        treelist[-1] = treelist[-1] + onesentence
        onesentence = ""

    return treelist

# usage:  makeRulesFromTreeList(getTreesFromDevset("devset.trees"))
def makeRulesFromTreeList(lstGroupOfTrees, isDuplicate=False):
    lstRules=[]
    for item in lstGroupOfTrees:
        tr = Tree.fromstring(item)
        # lstRules.append(tr.productions())
        lstRules = lstRules + tr.productions()


    # make rules unique(no duplicates)
    if not isDuplicate:
        return list(set(lstRules))
    else:
        return lstRules

## Get rules using functions above:            
rules = makeRulesFromTreeList(getTreesFromDevset("devset.trees"))
for rule in rules:
    print(rule)
    
# NOTE: still not in CNF


NN -> 'simple'
VP -> VBN PP PP
INTJ -> UH .
RB -> 'Exactly'
S -> S , NP .
VB -> 'sp'
S -> INTJ , S , S .
NN -> 'anyone'
JJ -> 'reasonable'
NN -> 'working'
NN -> "d'"
RB -> 'Perhaps'
NNP -> 'Launcelot'
NN -> 'time'
VBN -> 'retold'
RB -> 'No'
NP -> VBG NNP
PRP$ -> 'her'
VP -> VB PP NP
RB -> 'Quite'
NNP -> 'Lake'
VBG -> 'perpetuating'
VBN -> 'stone'
VB -> 'use'
INTJ -> INTJ , INTJ
IN -> 'that'
VB -> 'carry'
VP -> VP CC RB VP
NP -> DT NNP .
VBP -> 'take'
SQ -> SQ , INTJ
ADJP -> VBD
RB -> 'individually'
INTJ -> JJ VP .
NNS -> 'climes'
NN -> 'winter'
SQ -> IN ADVP .
VP -> VB NP ADVP
VP -> VBP RB VP
VB -> 'think'
NNP -> 'B'
SQ -> VBZ NP SQ
S -> INTJ , INTJ , NP VP
RB -> 'again'
VBD -> 'knight'
NNS -> 'scales'
JJ -> 'strange'
NN -> 'water'
NP -> VBP
JJ -> 'watery'
NN -> 'inherent'
DT -> 'The'
JJ -> 'wooden'
ADVP -> RB :
NP -> NNP :
NN -> '"'
S -> NP `` NP ''
VP -> VBN NP PP
NNS -> 'wings'
PP -> IN PP
S -> S , NP VP .
VBP -> 'mean'
VB -> 'fly'
NN -> 'lady'
VP -> VB S NN SBAR
VP -> VB S , SQ
PP 

The above rules needs clean up:
- removing empty rules

After removing empty rules:

The terminal and nonterminal rules above have to be separated:
    - the output above is put into a textfile to be parsed by the code below:

In [16]:
# Separating rules and terminals from the devset.trees
# returns a tuple of lists (terminal list and rules list)
# NOTE: rules is still not in CNF
def getRulesAndTerminals(strFilename, isWriteToFile=False):
    terminals=[]
    rules=[]
    with open(strFilename) as file:
        lines = file.readlines()
        for currentline in lines:
            currentline = currentline.partition("\n")[0]
            if currentline[-1] == "'" or currentline[-1] == "\"":
                terminals.append(currentline)
            else:
                rules.append(currentline)


    terminals = list(set(terminals))
    rules = list(set(rules))
    # sort to be easier to read
    terminals.sort()
    rules.sort()

    file_devset_terminals="DevSetTerminals.txt"
    file_devset_rules="DevSetRules_NONCNF.txt"
    if isWriteToFile:
        with open(file_devset_terminals, "w") as file:
            for item in terminals:
                line = item + '\n'
                file.write(line)

        with open(file_devset_rules, "w") as file:
            for item in rules:
                line = item + '\n'
                file.write(line)


    return (terminals, rules)


When run:

In [21]:
# NOTE: rules is still not in CNF
terminals_and_rules = getRulesAndTerminals("rulesFromDevSetTrees_withoutdupl.txt")

print("*NOTE: Rules are still not in CNF*")
print("Rules count:", len(terminals_and_rules[1]))
print("Terminals count:", len(terminals_and_rules[0]))
print("Total:", (len(terminals_and_rules[1])+(len(terminals_and_rules[0]))))

print("\n-Devset RULES:-")
for item in terminals_and_rules[1]:
    print(item)
    
print("\n-Devset TERMINALS:-")
for item in terminals_and_rules[0]:
    print(item)


*NOTE: Rules are still not in CNF*
Rules count: 697
Terminals count: 1099
Total: 1796

-Devset RULES:-
1    ADJP -> ADJP : ADJP
1    ADJP -> ADJP CC DT JJS
1    ADJP -> ADJP NN
1    ADJP -> ADJP PP
1    ADJP -> CD : NNS
1    ADJP -> CD NN
1    ADJP -> DT NN
1    ADJP -> JJ
1    ADJP -> JJ , JJ .
1    ADJP -> JJ .
1    ADJP -> JJ CC JJ
1    ADJP -> JJ PP
1    ADJP -> JJ S
1    ADJP -> JJR
1    ADJP -> NN
1    ADJP -> NN PP
1    ADJP -> NP RB JJ
1    ADJP -> RB
1    ADJP -> RB JJ
1    ADJP -> RB RB
1    ADJP -> RB VB PP
1    ADJP -> RBR
1    ADJP -> VB
1    ADJP -> VBD
1    ADJP -> VBD RB
1    ADJP -> VBD SBAR
1    ADJP -> VBN
1    ADJP -> VBN JJ
1    ADJP -> VBP
1    ADJP -> VBP SBAR
1    ADVP -> ADVP NP
1    ADVP -> ADVP VP
1    ADVP -> DT
1    ADVP -> DT NN
1    ADVP -> DT RBR
1    ADVP -> EX
1    ADVP -> IN
1    ADVP -> IN DT
1    ADVP -> JJ
1    ADVP -> NN
1    ADVP -> NNP
1    ADVP -> RB
1    ADVP -> RB .
1    ADVP -> RB :
1    ADVP -> RB IN DT
1    ADVP -> RB RB
1    ADVP -> RP
1 