# IPAI Homework 2. English 2 Lark translator

#### Work done by Pavel Tishkin, p.tishkin@innpolis.university

## Installing the necessary libraries

In [None]:
!pip install lark-parser
!pip install nltk

## References

In [20]:
# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://www.nltk.org/book/ch03.html
# https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer
# Information Retrieval course, lab with the stemmer.

## Importing necessary libraries

In [21]:
from lark import Lark
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

## Defining EBNF structure

In [22]:
# Grammar for the three supported sentence kinds: template definitions,
# assertions (facts) and rules. `start='value'` makes `value` the entry rule.
#
# The keyword literals are written in *stemmed* form ("there exist",
# "templat properti") because the driver cell runs every sentence through
# the Porter stemmer before handing it to this parser.
#
# The leading `?` on `value` tells Lark to inline the rule when it has a
# single child, so the root of the parse tree is the `deftemplate`,
# `assert` or `defrule` node itself. Spaces between words are ignored.
eng2lark_parser = Lark(r"""
    ?value: deftemplate
          | assert
          | defrule
    
    assert: "there exist" template (property)+
    deftemplate: template "templat properti" (propertyname)+
    defrule: "if" assert "then" assert
    property: propertyname propertyvalue
    template: WORD
    propertyname: WORD
    propertyvalue: WORD
    
    %import common.WORD
    %ignore " "
""", start='value')

## Parsers for the BNF structures

In [32]:
def get_original(word, original_tokens):
    """Return the original token that a (stemmed, lowercased) word came from.

    A token matches when `word` occurs as a substring of the lowercased
    token. If several tokens match, the last one in `original_tokens` wins.

    Raises:
        IndexError: if no token contains `word`.
    """
    candidates = [token for token in original_tokens if word in token.lower()]
    # Indexing an empty list raises IndexError, the same failure as no match.
    return candidates[-1]

def slot_to_str(slot):
    """Render a slot name as the text `(slot, <name>)`.

    NOTE(review): nothing in this notebook calls this helper, and its output
    contains a comma, whereas parse_deftemplate emits `(slot <name>)`.
    Confirm which format is intended before reusing it.
    """
    return '(slot, {})'.format(slot)

def parse_deftemplate(tree, original_tokens):
    """Translate a `deftemplate` parse tree into a deftemplate expression.

    Args:
        tree: parse-tree node whose first child holds the (stemmed) template
            name and whose remaining children each hold one (stemmed) slot name.
        original_tokens: tokens of the sentence before stemming; used to
            recover the unstemmed spelling of every name.

    Returns:
        A pair `(text, model)`: `text` is the deftemplate expression and
        `model` is a one-entry dict mapping the template name to the list of
        its slot names (all lowercased).
    """
    nodes = tree.children
    template_name = get_original(nodes[0].children[0].lower(), original_tokens).lower()

    slot_names = []
    for slot_node in nodes[1:]:
        stemmed_slot = slot_node.children[0].lower()
        slot_names.append(get_original(stemmed_slot, original_tokens).lower())

    slots_text = ''.join(f' (slot {slot_name})' for slot_name in slot_names)
    return f'(deftemplate {template_name}\n  {slots_text})', {template_name: slot_names}

def _resolve_slot(stemmed_slot, known_slots):
    """Pick the declared slot name that a stemmed slot name refers to.

    Preference order: exact match, then a declared slot that starts with the
    stem, then a declared slot that merely contains it. Falls back to the
    stem itself when nothing matches or no slots are known.
    """
    if not known_slots:
        return stemmed_slot
    if stemmed_slot in known_slots:
        return stemmed_slot
    for candidate in known_slots:
        if candidate.startswith(stemmed_slot):
            return candidate
    for candidate in known_slots:
        if stemmed_slot in candidate:
            return candidate
    return stemmed_slot


def parse_assert(tree, original_tokens, models=None, add_assert=True):
    """Translate an `assert` parse tree into an assert/fact expression.

    Args:
        tree: parse-tree node whose first child holds the (stemmed) template
            name and whose remaining children are `property` nodes, each with
            a slot-name child followed by a value child.
        original_tokens: tokens of the sentence before stemming; used to
            recover the unstemmed template name and slot values.
        models: optional dict mapping template names to their declared slot
            names. When the template is known, stemmed slot names are mapped
            back to the declared ones.
        add_assert: when True the fact is wrapped in `(assert ...)`; when
            False only the bare fact pattern is returned (used for the
            condition side of a rule).

    Returns:
        The generated expression as a string. Slot values keep their original
        casing and are always emitted in double quotes.
    """
    # Recover the template name from the original (unstemmed) tokens.
    template_stemmed = tree.children[0].children[0].lower()
    template_name = get_original(template_stemmed, original_tokens).lower()

    known_slots = None
    if models is not None and template_name in models:
        known_slots = models[template_name]

    result = f'(assert ({template_name}' if add_assert else f'({template_name}'

    for prop in tree.children[1:]:
        slot_stem = prop.children[0].children[0].lower()
        value_stem = prop.children[1].children[0].lower()
        # The value is taken from the original sentence, so casing is kept.
        value = get_original(value_stem, original_tokens)
        # Resolve against the declared slots once. The previous loop kept
        # re-matching the already replaced name, so e.g. "name" could end up
        # as "surname" when both slots were declared.
        slot = _resolve_slot(slot_stem, known_slots)
        result += f' ({slot} "{value}")'

    # Extra closing parenthesis for the surrounding "(assert ".
    if add_assert:
        result += ')'
    return result + ')'

def parse_defrule(tree, original_tokens, rulename, models=None):
    """Translate a `defrule` parse tree into a defrule expression.

    The tree's first child is the condition and its second child is the
    consequence; both are `assert` sub-trees. The condition is rendered as a
    bare pattern, the consequence as a full `(assert ...)`.

    Args:
        tree: parse-tree node with the two `assert` sub-trees as children.
        original_tokens: tokens of the sentence before stemming.
        rulename: name to give the generated rule.
        models: optional template models forwarded to parse_assert.

    Returns:
        The generated defrule expression as a string.
    """
    condition = parse_assert(tree.children[0], original_tokens, models, add_assert=False)
    consequence = parse_assert(tree.children[1], original_tokens, models)
    return f'(defrule\n   {rulename} {condition} => {consequence})'

## Reading input 

Input is read from `input.txt`; each sentence must start on a new line. The result is written to `output.txt`.

In [41]:
# Tokenizer that keeps only runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')
stemmer = PorterStemmer()

# "if", "there" and "then" are English stopwords but are keywords of the
# grammar, so they must survive the stopword filter. Built once, as a set,
# instead of being rebuilt for every input line.
# NOTE: stopwords.words('english') needs the NLTK "stopwords" corpus to be
# available locally (e.g. via nltk.download('stopwords')).
used_stopwords = set(stopwords.words('english')) - {'if', 'there', 'then'}

with open('input.txt', 'r') as inp, open('output.txt', 'w') as out:
    text = inp.readlines()
    # Templates declared so far: template name -> list of slot names.
    models = {}
    for i, sentence in enumerate(text):
        # Tokenizing and getting rid of punctuation
        tokenized_sentence = tokenizer.tokenize(sentence)
        # Lowercasing
        lowercase = [w.lower() for w in tokenized_sentence]
        # Getting rid of stopwords (except the reserved grammar keywords)
        filtered = [w for w in lowercase if w not in used_stopwords]
        # Stemming words. Useful when verbs are used instead of nouns for asserts.
        stemmed = [stemmer.stem(w) for w in filtered]
        filtered_sentence = " ".join(stemmed)
        try:
            sent_tree = eng2lark_parser.parse(filtered_sentence)
            if sent_tree.data == 'deftemplate':
                res, model = parse_deftemplate(sent_tree, tokenized_sentence)
                out.write(res + '\n')
                # Remember the template only when one was actually parsed.
                # Doing this unconditionally raised NameError for any assert
                # or rule that appeared before the first template definition.
                models.update(model)
            elif sent_tree.data == 'assert':
                out.write(parse_assert(sent_tree, tokenized_sentence, models) + '\n')
            elif sent_tree.data == 'defrule':
                out.write(parse_defrule(sent_tree, tokenized_sentence, f'rule{i}', models) + '\n')
        except Exception:
            # Best effort: report the offending line (1-based) and keep
            # translating the remaining sentences.
            out.write(f'Error in line {i+1}. Failed to parse sentences!\n')

## Advantages

1. Somewhat models the behaviour of the language
2. If the model was defined previously, the program will get the slot names from the already defined model
3. Can handle cases where the slot name is given in the form of a verb (named -> name)

## Disadvantages

1. Cannot handle cases where (slotname, value) is swapped to (value, slotname)
2. Generally limited to the syntax of the system
3. If the model was not defined, some of the slot names will be incorrect due to stemming (figure -> figur, for instance)