In [1]:
import re
import random
from collections import ChainMap

#### Create the grammar as separate terminal and non-terminal hashes

In [2]:
# Regular expressions used to classify grammar symbols and right-hand sides.
# Matches a string made only of uppercase ASCII letters, e.g. "ROOT" or "NP".
non_terminal_pattern = r"^[A-Z]+$"
# Matches two or more ASCII letters in any case, e.g. "Noun"
# (all-uppercase strings of length >= 2 match this pattern as well).
pre_terminal_pattern = r"^[A-Za-z]{2,}$"
# Matches a string made only of lowercase ASCII letters and whitespace,
# e.g. "chief of staff".
terminal_pattern = r"(^[a-z\s]+)$"

def _spit_probable_choice(items, weights):
    return random.choices(items, weights, k=1)[0]

# Read the raw grammar through a context manager so the file handle is closed
# as soon as the text has been loaded (the bare open().read() left it open).
with open("grammar.gr", "rb") as grammar_fh:
    grammar_text = grammar_fh.read().decode("utf-8")


def _is_non_terminal(text):
    """Return True when ``text`` matches ``non_terminal_pattern``, else False."""
    found = re.search(non_terminal_pattern, text)
    return found is not None

def _is_pre_terminal(text):
    """Return True when ``text`` matches ``pre_terminal_pattern``, else False."""
    found = re.search(pre_terminal_pattern, text)
    return found is not None

def _is_terminal(text):
    """Return True when ``text`` matches ``terminal_pattern``, else False."""
    found = re.search(terminal_pattern, text)
    return found is not None

def _create_grammar_hash(grammar_text):
    """
    Parse raw grammar text into terminal, non-terminal and merged rule tables.

    Every usable line is read as three tab-separated fields,
    ``weight<TAB>lhs<TAB>rhs``. Lines of length <= 1 and lines that start with
    "#", whitespace, "(" or ")" are skipped, and anything after a "#" on a
    rule line is discarded.

    Args:
        grammar_text (str): full contents of a grammar file

    Returns:
        tuple: ``(terminal_hash, non_terminal_hash, merged)`` where
            - terminal_hash maps lhs -> {rhs string: weight} for right-hand
              sides accepted by ``_is_terminal``,
            - non_terminal_hash maps lhs -> {tuple of rhs symbols: weight} for
              every other right-hand side,
            - merged maps lhs -> the union of both kinds of expansions.

    Raises:
        IndexError: if a rule line has fewer than three tab-separated fields
        ValueError: if the weight field cannot be converted to float
    """
    def _is_comment(line):
        return bool(re.search(r"^[#\s()]", line))

    non_terminal_hash = {}
    terminal_hash = {}
    for line in grammar_text.split("\n"):
        if len(line) <= 1 or _is_comment(line):
            continue
        fields = line.split("#")[0].strip().split("\t")
        rhs = fields[2]
        lhs = fields[1]
        weight = float(fields[0])
        if _is_terminal(rhs):
            terminal_hash.setdefault(lhs, {})[rhs] = weight
        else:
            # Non-terminal expansions are keyed by the tuple of their symbols.
            non_terminal_hash.setdefault(lhs, {})[tuple(rhs.split())] = weight

    # Merge per left-hand side. A plain dict.update() of the two tables would
    # replace the whole inner dict and silently drop the non-terminal
    # expansions of any lhs that also owns terminal expansions.
    merged = {lhs: dict(expansions) for lhs, expansions in non_terminal_hash.items()}
    for lhs, expansions in terminal_hash.items():
        merged.setdefault(lhs, {}).update(expansions)
    return terminal_hash, non_terminal_hash, merged

#### Create a combined grammar hash

In [None]:
# Regular expressions used to classify grammar symbols and right-hand sides.
# Matches a string made only of uppercase ASCII letters, e.g. "ROOT" or "NP".
non_terminal_pattern = r"^[A-Z]+$"
# Matches two or more ASCII letters in any case, e.g. "Noun"
# (all-uppercase strings of length >= 2 match this pattern as well).
pre_terminal_pattern = r"^[A-Za-z]{2,}$"
# Matches a string made only of lowercase ASCII letters and whitespace,
# e.g. "chief of staff".
terminal_pattern = r"(^[a-z\s]+)$"

def _spit_probable_choice(items, weights):
    return random.choices(items, weights, k=1)[0]

# Read the raw grammar through a context manager so the file handle is closed
# as soon as the text has been loaded (the bare open().read() left it open).
with open("grammar.gr", "rb") as grammar_fh:
    grammar_text = grammar_fh.read().decode("utf-8")


def _is_non_terminal(text):
    """Return True when ``text`` matches ``non_terminal_pattern``, else False."""
    found = re.search(non_terminal_pattern, text)
    return found is not None

def _is_pre_terminal(text):
    """Return True when ``text`` matches ``pre_terminal_pattern``, else False."""
    found = re.search(pre_terminal_pattern, text)
    return found is not None

def _is_terminal(text):
    """Return True when ``text`` matches ``terminal_pattern``, else False."""
    found = re.search(terminal_pattern, text)
    return found is not None

def _create_grammar_hash(grammar_text):
    """
    Build a single combined rule table from raw grammar text.

    Every usable line is read as three tab-separated fields,
    ``weight<TAB>lhs<TAB>rhs``. Lines of length <= 1 and lines that start with
    "#", whitespace, "(" or ")" are skipped, and anything after a "#" on a
    rule line is discarded.

    Args:
        grammar_text (str): full contents of a grammar file

    Returns:
        dict: lhs -> {expansion: weight}. An expansion is the rhs string when
            ``_is_terminal`` accepts it, otherwise the tuple of rhs symbols.
    """
    comment_start = r"^[#\s()]"

    grammar_hash = {}
    for raw_line in grammar_text.split("\n"):
        if len(raw_line) <= 1 or re.search(comment_start, raw_line):
            continue
        fields = raw_line.split("#")[0].strip().split("\t")
        rhs = fields[2]
        if _is_terminal(rhs):
            expansion = str(rhs)
        else:
            expansion = tuple(map(str, rhs.split()))
        # A repeated (lhs, expansion) pair keeps the weight seen last.
        grammar_hash.setdefault(fields[1], {})[expansion] = float(fields[0])
    return grammar_hash

In [5]:
# Parse the loaded grammar text into one combined lhs -> {expansion: weight} table.
grammar_hash = _create_grammar_hash(grammar_text)

#### Grammar Code

In [16]:
class Grammar:
    """
    Weighted context-free grammar loaded from a tab-separated ``.gr`` file,
    with random sampling of sentences or derivation trees.
    """

    # Kept on the class so the loader does not depend on a notebook-level
    # ``terminal_pattern`` global having been defined by an earlier cell.
    # A right-hand side made only of lowercase letters and whitespace is
    # stored as one terminal string.
    _TERMINAL_PATTERN = re.compile(r"(^[a-z\s]+)$")
    # Lines starting with "#", whitespace, "(" or ")" are not rules.
    _COMMENT_PATTERN = re.compile(r"^[#\s()]")

    def __init__(self, grammar_file):
        """
        Context-Free Grammar (CFG) Sentence Generator

        Args:
            grammar_file (str): Path to a .gr grammar file

        Returns:
            self
        """
        # Parse the input grammar file
        self.rules = None
        self._load_rules_from_file(grammar_file)

    @staticmethod
    def _select_probable_choice(items, weights):
        """Draw one element of ``items`` at random, biased by ``weights``."""
        return random.choices(items, weights, k=1)[0]

    @staticmethod
    def _convert_tokens_to_sentence(tokens):
        """
        Join tokens with single spaces and tidy the spacing around punctuation.

        Whitespace before ".", "!" or "?" is removed unless the character
        before that whitespace is itself a "."; whitespace before an
        apostrophe is removed as well.

        Args:
            tokens (list[str]): tokens in sentence order

        Returns:
            str: the assembled sentence
        """
        sentence = " ".join(tokens)
        sentence = re.sub(r"(?<!\.)\s+([.!?])", r"\1", sentence)
        sentence = re.sub(r"\s+'", "'", sentence)
        return sentence

    @staticmethod
    def _is_terminal(text):
        """Return True when ``text`` has only lowercase letters and whitespace."""
        return bool(Grammar._TERMINAL_PATTERN.search(text))

    def _load_rules_from_file(self, grammar_file):
        """
        Read grammar file and store its rules in self.rules

        Every usable line is read as ``weight<TAB>lhs<TAB>rhs``. Lines of
        length <= 1 and comment lines are skipped, and anything after a "#"
        on a rule line is discarded. ``self.rules`` maps each lhs to
        ``{expansion: weight}``, where an expansion is the rhs string for a
        terminal right-hand side and a tuple of symbols otherwise.

        Args:
            grammar_file (str): Path to the raw grammar file

        Raises:
            IndexError: if a rule line has fewer than three tab-separated fields
            ValueError: if the weight field cannot be converted to float
        """
        # The context manager closes the handle even if decoding fails.
        with open(grammar_file, "rb") as grammar_fh:
            grammar_text = grammar_fh.read().decode("utf-8")

        grammar_hash = {}
        for line in grammar_text.split("\n"):
            if len(line) <= 1 or self._COMMENT_PATTERN.search(line):
                continue
            fields = line.split("#")[0].strip().split("\t")
            rhs = fields[2]
            expansion = rhs if self._is_terminal(rhs) else tuple(rhs.split())
            # A repeated (lhs, expansion) pair keeps the weight seen last.
            grammar_hash.setdefault(fields[1], {})[expansion] = float(fields[0])
        self.rules = grammar_hash

    def sample(self, derivation_tree, max_expansions, start_symbol):
        """
        Sample a random sentence from this grammar

        Args:
            derivation_tree (bool): if true, the returned string will represent
                the tree (using bracket notation) that records how the sentence
                was derived
            max_expansions (int): max number of nonterminal expansions we allow

            start_symbol (str): start symbol to generate from

        Returns:
            str: the random sentence or its derivation tree
        """
        if start_symbol not in self.rules:
            return start_symbol.strip()

        def _is_nonterminal(sym):
            # Only all-uppercase symbols count against the expansion budget;
            # mixed-case symbols such as "Noun" are expanded without counting.
            return sym.isupper()

        n_expansions = 0

        def _tree_expand(symbol):
            """Return (tokens, bracketed subtree string) for ``symbol``."""
            nonlocal n_expansions

            # A symbol without rules is emitted as-is.
            if symbol not in self.rules:
                tokens = symbol.split()
                return tokens, symbol

            if _is_nonterminal(symbol):
                n_expansions += 1
                if n_expansions > max_expansions:
                    return ["..."], f"({symbol} ...)"

            items = list(self.rules[symbol].keys())
            weights = list(self.rules[symbol].values())
            right_split = self._select_probable_choice(items=items, weights=weights)

            if isinstance(right_split, (tuple, list)):
                tokens_list, subtrees = [], []
                for daughter in right_split:
                    # Once the budget is used up, uppercase daughters are
                    # elided instead of being expanded further.
                    if (
                        daughter in self.rules
                        and _is_nonterminal(daughter)
                        and n_expansions >= max_expansions
                    ):
                        tokens_list.append("...")
                        subtrees.append("...")
                        continue
                    tokens, subtree = _tree_expand(daughter)
                    tokens_list.extend(tokens)
                    subtrees.append(subtree)
                return tokens_list, f"({symbol} {' '.join(subtrees)})"
            else:
                return right_split.split(), f"({symbol} {right_split})"

        tokens, tree_str = _tree_expand(start_symbol)
        return tree_str if derivation_tree else self._convert_tokens_to_sentence(tokens)


In [17]:
# Load the grammar, then sample one derivation tree with the expansion budget
# set to 3 so that deeper nonterminals are elided as "...".
grammar = Grammar(grammar_file="grammar.gr")
grammar.sample(derivation_tree=True, max_expansions=3, start_symbol="ROOT")

'(ROOT (S (NP ... ...) ...) .)'

In [24]:
# Print three numbered plain-text samples using a generous expansion budget.
for i in range(3):
    sampled_sentence = grammar.sample(derivation_tree=False, max_expansions=450, start_symbol='ROOT')
    print(f"{i+1}. {sampled_sentence}")

1. every pickle in every delicious pickle in the pickled chief of staff ate every sandwich on every sandwich under every floor on a pickle in a floor under the chief of staff on the floor on a president on a pickle!
2. is it true that a pickle on the pickle with every pickle kissed every floor?
3. every chief of staff with a pickle in a pickle in a floor in every president under the sandwich on a floor under every president in the pickle under the delicious sandwich in the sandwich under the president on the chief of staff in every president on every fine perplexed pickle under a pickle with the chief of staff understood a floor on the delicious sandwich with the pickle with the pickled floor!


In [31]:
import os

def pprinter(sentence):
    """
    Pipe ``sentence`` through the ``prettyprint`` Perl script.

    The script is looked up in the current working directory and its output
    goes to this process's standard output.

    The text is passed on stdin rather than spliced into an
    ``echo '...' | perl`` shell string, so an apostrophe or other shell
    metacharacter in ``sentence`` can no longer break or alter the command.

    Args:
        sentence (str): text (e.g. a bracketed tree) to feed to the script

    Raises:
        FileNotFoundError: if the ``perl`` executable cannot be found
    """
    # Local import: this cell only imports ``os``.
    import subprocess

    prettyprint_path = os.path.join(os.getcwd(), "prettyprint")
    # ``echo`` appended a newline to the sentence; keep doing so.
    subprocess.run(["perl", prettyprint_path], input=f"{sentence}\n", text=True)

In [38]:
# Pretty-print one hand-written bracketed derivation tree.
pprinter(sentence = "(ROOT (S (NP (PropNoun it)) (VP (VP (Verb understood) (NP (NP (Det the) (Noun proposal)) (PP (Prep under) (NP (NP (PropNoun Sally)) (PP (Prep under) (NP (PropNoun Sally))))))) (Conj and) (VP (Verb kissed) (PropNoun Sally)))) !)")

(ROOT (S (NP (PropNoun it))
         (VP (VP (Verb understood)
                 (NP (NP (Det the)
                         (Noun proposal))
                     (PP (Prep under)
                         (NP (NP (PropNoun Sally))
                             (PP (Prep under)
                                 (NP (PropNoun Sally)))))))
             (Conj and)
             (VP (Verb kissed)
                 (PropNoun Sally))))
      !)


## 3.3.5

In [1]:
import numpy as np

In [2]:
# Print a hard-coded, already-indented derivation tree for
# "the president ate the sandwich ."
print('''(ROOT (S (NP (Det the)
             (Noun president))
         (VP (Verb ate)
             (NP (Det the)
                 (Noun sandwich))))
      .)''')

(ROOT (S (NP (Det the)
             (Noun president))
         (VP (Verb ate)
             (NP (Det the)
                 (Noun sandwich))))
      .)


In [3]:
from itertools import chain
import subprocess

In [4]:
def calculate_corpus_size(corpus):
    """
    Count the whitespace-separated tokens across all sentences of a corpus.

    Args:
        corpus (iterable[str]): sentences to measure

    Returns:
        int: total number of tokens (0 for an empty corpus)
    """
    token_total = 0
    for sentence in corpus:
        token_total += len(sentence.split())
    return token_total

In [5]:
def get_corpus(grammar, n_sentences: int):
    """
    Generate sentences by running ``randsent.py`` and collecting its output.

    The command is passed as an argument list instead of a shell string, so a
    grammar path containing spaces or shell metacharacters reaches the script
    unchanged.

    Args:
        grammar (str): path of the grammar file, forwarded as ``-g``
        n_sentences (int): number of sentences to request, forwarded as ``-n``

    Returns:
        list[str]: the non-empty lines the script wrote to stdout
    """
    completed = subprocess.run(
        ["python3", "randsent.py", "-g", str(grammar), "-n", str(n_sentences)],
        text=True,
        capture_output=True,
    )
    return [line for line in completed.stdout.split("\n") if len(line) > 0]


def get_probabilities(sentence, grammar_file):
    """
    Run ``./parse -g <grammar_file> -P`` on one sentence and return its output.

    Args:
        sentence (str): sentence sent to the parser on stdin
        grammar_file (str): path of the grammar file, forwarded as ``-g``

    Returns:
        str: everything the parser wrote to stdout
    """
    # Local import: only needed to quote the path for the shell.
    import shlex

    # ``./parse`` still runs through the shell as before; quoting the path
    # keeps spaces or metacharacters in it from changing the command.
    command = f"./parse -g {shlex.quote(str(grammar_file))} -P"
    completed = subprocess.run(
        command, shell=True, capture_output=True, text=True, input=sentence
    )
    return completed.stdout


def get_prob_sentence(sentence: str, grammar="extra-grammars/grammar2.gr"):
    """
    Parse one sentence and split the parser report into named fields.

    The report returned by ``get_probabilities`` is split on "#". The first
    piece is kept as the tree, the next three are read as ``name = number``
    and the fifth is kept as raw text.

    Args:
        sentence (str): sentence to parse
        grammar (str): path of the grammar file handed to the parser

    Returns:
        dict: keys "tree", "P(best_parse)", "P(sentence)",
            "P(best_parse | sentence)" (floats for the three probabilities)
            and "cross_entropy" (unparsed string)
    """
    raw_report = get_probabilities(sentence=sentence, grammar_file=grammar)
    pieces = [piece.strip() for piece in raw_report.split("#")]

    def _number_after_equals(piece):
        # "P(sentence) = 3.660e-05" -> 3.66e-05
        return float(piece.split("=")[1].strip())

    return {
        "tree": pieces[0],
        "P(best_parse)": _number_after_equals(pieces[1]),
        "P(sentence)": _number_after_equals(pieces[2]),
        "P(best_parse | sentence)": _number_after_equals(pieces[3]),
        "cross_entropy": pieces[4],
    }
    

def get_cross_entropy(corpus=None, grammar="extra-grammars/grammar2.gr", n_sentences=100):
    """
    Estimate the per-word cross-entropy (in bits) of a corpus under a grammar.

    Args:
        corpus (list[str] | None): sentences to score; when omitted or empty,
            ``n_sentences`` sentences are generated from ``grammar`` itself
        grammar (str): path of the grammar file used for scoring (and for
            generation when no corpus is given)
        n_sentences (int): number of sentences to generate when no corpus is
            given

    Returns:
        float: ``-(sum of log2 P(sentence)) / (number of words in the corpus)``

    Raises:
        ZeroDivisionError: if the corpus contains no words at all
    """
    # Local import: no visible cell imports ``math``, and the former bare
    # ``except`` would have hidden the resulting NameError for every sentence.
    import math

    if corpus is None or len(corpus) < 1:
        corpus = get_corpus(grammar=grammar, n_sentences=n_sentences)
    grammar_corpus_size = calculate_corpus_size(corpus)

    corp_probabilities = []
    for sent in corpus:
        try:
            sentence_probability = get_prob_sentence(sentence=sent, grammar=grammar)["P(sentence)"]
            corp_probabilities.append(math.log2(sentence_probability))
        except (ValueError, IndexError):
            # ValueError: P(sentence) is 0 (log2 domain error) or a field is not
            # numeric. IndexError: the parser report has fewer fields than
            # expected. Such sentences are reported and left out of the sum.
            print(sent)
    sum_log_probabilities = sum(corp_probabilities)

    # NOTE(review): words of skipped sentences still count in the denominator,
    # which lowers the estimate - confirm this is intended.
    cross_entropy = -(sum_log_probabilities / grammar_corpus_size)
    return cross_entropy

In [37]:
# Hand-written sentences to run through the parser. Several of them appear
# twice in the list.
corpus = ["Sally ate a sandwich .",
"Sally and the president wanted and ate a sandwich .",
"the president sighed .",
"the president thought that a sandwich sighed .",
"it perplexed the president that a sandwich ate Sally .",
"that a sandwich ate Sally perplexed the president .",
"the very very very perplexed president ate a sandwich .",
"the president worked on every proposal on the desk .",
"did Sally eat a sandwich ?",
"will Sally eat a sandwich ?",
"the pickle kissed the president that ate the sandwich .",
"the pickle kissed the sandwich that the president ate .",
"the pickle kissed the sandwich that the president thought that Sally ate .",
"what did the president think ?",
"what did the president think that Sally ate ?",
"what did Sally eat the sandwich with ?",
"who ate the sandwich ?",
"where did Sally eat the sandwich ?",
"Sally ate a sandwich .",
"the president sighed .",
"the president thought that a sandwich sighed .",
"it perplexed the president that a sandwich ate Sally .",
"that a sandwich ate Sally perplexed the president .",
"the very very very perplexed president ate a sandwich .",
"the president worked on every proposal on the desk ."
]

# Print each sentence with its 1-based position, followed by the raw parser
# report obtained with grammar_ec.gr. The commented-out calls below run the
# same check against other grammar files.
for ix, sent in enumerate(corpus):
    print(f"{ix+1}. {sent}")
    print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar_ec.gr"))
    # print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar3.gr"))
    # print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar5_ec.gr")) # YN Questions + G4
    # print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar4.gr"))
    # print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar_5_alt.gr")) # YN Questions + G4 + IF THEN

1. Sally ate a sandwich .
 (ROOT (S (NP (PropNoun Sally)) (VP (Verb ate) (NP (Det a) (Noun sandwich)))) .)
# P(best_parse) = 3.660e-05
# P(sentence) = 3.660e-05
# P(best_parse | sentence) = 1.000
# cross-entropy = 2.948 bits = -(-14.738 log-prob. / 5 words)

2. Sally and the president wanted and ate a sandwich .
 (ROOT (S (NP (NP (PropNoun Sally)) (Conj and) (NP (Det the) (Noun president))) (VP (Verb wanted) (Conj and) (VP (Verb ate) (NP (Det a) (Noun sandwich))))) .)
# P(best_parse) = 2.376e-10
# P(sentence) = 2.376e-10
# P(best_parse | sentence) = 1.000
# cross-entropy = 3.197 bits = -(-31.971 log-prob. / 10 words)

3. the president sighed .
 (ROOT (S (NP (Det the) (Noun president)) (VP (Vintrans sighed))) .)
# P(best_parse) = 9.892e-05
# P(sentence) = 9.892e-05
# P(best_parse | sentence) = 1.000
# cross-entropy = 3.326 bits = -(-13.303 log-prob. / 4 words)

4. the president thought that a sandwich sighed .
 (ROOT (S (NP (Det the) (Noun president)) (VP (Vthat thought that) (S (NP (De

In [None]:
# Sentences collected for a rejection check.
# NOTE(review): presumably the grammar is expected to fail on these - confirm.
rejection_corpus = [
    "the president thought that a sandwich sighed a pickle .",
    "the president sighed a pickle .",
    "the president ate a pickle sighed .",
    "the president ate that a pickle sighed .",
    "the president worked that a pickle sighed ."
]

# NOTE(review): the parser call is commented out, so as written this loop only
# prints the numbered sentences.
for ix, sent in enumerate(rejection_corpus):
    print(f"{ix+1}. {sent}")
    # print(get_probabilities(sentence=sent, grammar_file="extra-grammars/grammar5_ec.gr")) # YN Questions + Grammar 4

1. the president thought that a sandwich sighed a pickle .
failure
# P(best_parse) = NaN
# P(sentence) = 0.000e+00
# P(best_parse | sentence) = NaN
# cross-entropy = Inf bits = -(-Inf log-prob. / 10 words)

2. the president sighed a pickle .
failure
# P(best_parse) = NaN
# P(sentence) = 0.000e+00
# P(best_parse | sentence) = NaN
# cross-entropy = Inf bits = -(-Inf log-prob. / 6 words)

3. the president ate a pickle sighed .
failure
# P(best_parse) = NaN
# P(sentence) = 0.000e+00
# P(best_parse | sentence) = NaN
# cross-entropy = Inf bits = -(-Inf log-prob. / 7 words)

4. the president ate that a pickle sighed .
 (ROOT (S (NP (Det the) (Noun president)) (VP (Verb ate) (NP that (S (NP (Det a) (Noun pickle)) (VP (Vintrans sighed)))))) .)
# P(best_parse) = 7.587e-10
# P(sentence) = 7.587e-10
# P(best_parse | sentence) = 1.000
# cross-entropy = 3.787 bits = -(-30.296 log-prob. / 8 words)

5. the president worked that a pickle sighed .
failure
# P(best_parse) = NaN
# P(sentence) = 0.000e+00


In [19]:
# Cross-entropy of grammar2 measured on samples of 2, 10, 100 and 1000
# sentences generated from grammar2 itself.
tuple(
    get_cross_entropy(grammar="extra-grammars/grammar2.gr", n_sentences=sample_size)
    for sample_size in (2, 10, 100, 1000)
)

(1.7775615434661347,
 2.2321521388043197,
 2.0982670024467636,
 2.1805701413073253)

In [187]:
# Cross-entropy of grammar3 measured on samples of 2, 10, 100 and 1000
# sentences generated from grammar3 itself.
tuple(
    get_cross_entropy(grammar="extra-grammars/grammar3.gr", n_sentences=sample_size)
    for sample_size in (2, 10, 100, 1000)
)

(3.3556441629018336, 2.7169802318465712, 2.752090398455231, 2.748644130792881)

In [13]:
# Load the saved corpus, drop empty lines, replace apostrophes with spaces,
# and score the result under grammar.gr.
with open("corpus.txt", "r") as tfile:
    corpus = [line.replace("'", " ") for line in tfile.read().split("\n") if len(line) > 0]
get_cross_entropy(grammar="grammar.gr", n_sentences=10, corpus=corpus)

a president understood the pickle on every chief of staff in the pickle in a pickle with every chief of staff under every president on a president on a sandwich under a perplexed floor on every pickle under the sandwich in the president in a perplexed pickle in every pickle with the chief of staff on every perplexed sandwich under every president in a floor with the sandwich under every president in a pickled pickled chief of staff with a sandwich under the floor with a floor with every floor with a floor in a pickled floor under the delicious president with a chief of staff with a pickle with the chief of staff on every chief of staff under the perplexed fine president with every chief of staff under the perplexed pickle on the pickle on the chief of staff with every pickle on every chief of staff in the pickle on every sandwich in the floor in every pickle on every floor on the floor on the delicious floor in the fine floor with the president with a pickle under a perplexed pickle un

1.2083352739249638

In [16]:
# One very long sentence kept for inspection; note that it ends in "in'!".
# NOTE(review): presumably the sentence reported by the corpus.txt
# cross-entropy run - confirm.
sent = "a president understood the pickle on every chief of staff in the pickle in a pickle with every chief of staff under every president on a president on a sandwich under a perplexed floor on every pickle under the sandwich in the president in a perplexed pickle in every pickle with the chief of staff on every perplexed sandwich under every president in a floor with the sandwich under every president in a pickled pickled chief of staff with a sandwich under the floor with a floor with every floor with a floor in a pickled floor under the delicious president with a chief of staff with a pickle with the chief of staff on every chief of staff under the perplexed fine president with every chief of staff under the perplexed pickle on the pickle on the chief of staff with every pickle on every chief of staff in the pickle on every sandwich in the floor in every pickle on every floor on the floor on the delicious floor in the fine floor with the president with a pickle under a perplexed pickle under every floor with a floor on a chief of staff with every president with the pickled floor with every floor under a floor in a chief of staff on every floor under every chief of staff with every pickle on the floor with the chief of staff under a floor on every sandwich with every president with every chief of staff in'!"

'a president understood the pickle on every chief of staff in the pickle in a pickle with every chief of staff under every president on a president on a sandwich under a perplexed floor on every pickle under the sandwich in the president in a perplexed pickle in every pickle with the chief of staff on every perplexed sandwich under every president in a floor with the sandwich under every president in a pickled pickled chief of staff with a sandwich under the floor with a floor with every floor with a floor in a pickled floor under the delicious president with a chief of staff with a pickle with the chief of staff on every chief of staff under the perplexed fine president with every chief of staff under the perplexed pickle on the pickle on the chief of staff with every pickle on every chief of staff in the pickle on every sandwich in the floor in every pickle on every floor on the floor on the delicious floor in the fine floor with the president with a pickle under a perplexed pickle u