In [1]:
import spacy
nlp_en = spacy.load('en')
nlp_de = spacy.load('de')
from spacy.en import English
parser = English()
import xml.etree.ElementTree as ET

## Load Reference Grammar

In [2]:
def create_prompt_response_map(gram):
    prompt_response_map = {}
    for prompt_unit in gram:
        key = None
        responses = []
        for prompt in prompt_unit:
            if prompt.tag == None:
                print("None")
            else:
                if(prompt.tag == 'prompt'):
                    key = prompt.text
                elif(prompt.tag == 'response'):
                    responses.append(prompt.text)
        prompt_response_map[key] = responses
    return prompt_response_map

def read_grammar_and_create_map(file):
    tree = ET.parse(file)
    grammar = tree.getroot()
    return create_prompt_response_map(grammar)

def get_sag_prompts(prompt_response_map):
    sag_prompts = {}
    for key in prompt_response_map:
        if "Sag" in key:
            sag_prompts[key] = prompt_response_map[key]
    return sag_prompts

    
prompt_response_map = read_grammar_and_create_map('referenceGrammar.xml')
sag_prompts = get_sag_prompts(prompt_response_map)

# Load Test Data

In [3]:
test_file_csv = "textProcessing_testKaldi.csv"
test_data = None
with open(test_file_csv, 'r') as reader:
    test_data = reader.readlines()  
    
def prompt_map_test_data():
    prompt_map = {}
    for item in test_data[1:]:
        split = item.replace("\n", "").split("\t")
        prompt = split[1]
        
        if prompt in prompt_map:
            prompt_map[prompt].append({'id': split[0], "transcript": split[3]})
        else:
            arr = []
            arr.append({'id':split[0], "transcript":split[3]})
            prompt_map[prompt] = arr
            
    return prompt_map
prompt_test_map = prompt_map_test_data()

In [42]:
def diff_of_two_maps(map_1, map_2):
    diff_map = {}
    for key in map_1:
        if key not in map_2:
            diff_map[key] = map_1[key]
    
    return diff_map

# 1. Apply Reference Grammar

In [38]:
counter = 0

false_prompts_rg_base = {}
for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue

        if sentence not in prompt_response_map[prompt_unit]:
            counter +=1
            false_prompts_rg_base[dict_prompt['id']] = sentence
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
        
len(false_prompts_rg_base)

461

# 2. Cut Whitespace

In [45]:
false_prompt_white_space_base = {}
counter =0
for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue
            
        sentence = ' '.join(sentence.split())
        if sentence not in prompt_response_map[prompt_unit]:
            counter +=1
            false_prompt_white_space_base[dict_prompt['id']] = sentence
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
counter

448

# 3. Special Case to Normal Case

In [46]:
special_case_map = {"don't": "do not",
 "haven't" : "have not",
 "i'd" : "i would",
 "i'm" : "i am",
 "isn't": "is not",
 "it's": "it is",
 "o'clock": "o'clock",
 "that's" :"that is",
 "there's": "there is"}

def sentence_to_normal_case(sentence):
    trans_sentence = ""
    words = sentence.split(" ")
    for word in words:
        if word in special_case_map:
            word = special_case_map[word]
        trans_sentence += word + " "
    
    return trans_sentence[:-1]

false_prompt_sc_to_nc = {}
counter =0
for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue
        if "'" in sentence:
            sentence = sentence_to_normal_case(sentence)
            #print(dict_prompt['transcript'])
            
        sentence = ' '.join(sentence.split())
        if sentence not in prompt_response_map[prompt_unit]:
            counter +=1
            false_prompt_sc_to_nc[dict_prompt['id']] = sentence
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
counter

437

## 3.1 Diff between White Space Maps (2.) and Special Case to Normal Case (3.)

In [49]:
diff_of_two_maps(false_prompt_white_space_base, false_prompt_sc_to_nc)

{'3705': "i'm from america",
 '3801': "i'd like to pay with post card",
 '3909': "i'm fine",
 '3931': "i'm fine",
 '3940': "i'm fine",
 '3945': "i'm looking for trousers",
 '4273': "i'm fine",
 '4344': "i'm good",
 '4438': "i'm good",
 '4450': "it's too expensive",
 '4659': "i'm fine"}

# 4. Remove Yes/No 

In [83]:
yes_no_filtered_prompt = {}
counter =0

def remove_yes_no(sentence):
    if "yes" in sentence or \
    "no" in sentence or \
    "thanks" in sentence or \
    "thank you" in sentence or \
    "please" in sentence:
        sentence = sentence.replace("yes ", "")\
        .replace("no ", "")\
        .replace("thanks ", "")\
        .replace("thank you ", "")\
        .replace("please ", "")\
        .replace("also", "")
        print(sentence)
    return sentence

for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue
        if "'" in sentence:
            sentence = sentence_to_normal_case(sentence)
            #print(dict_prompt['transcript'])
            
        sentence = ' '.join(sentence.split())
        if sentence not in prompt_response_map[prompt_unit]:
            if remove_yes_no(sentence) not in prompt_response_map[prompt_unit]:
                counter +=1
                yes_no_filtered_prompt[dict_prompt['id']] = sentence
                #if sentence == "Sag: Ich möchte mit Kreditkarte bezahlen":
                    #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
counter

can i have not a room for one room
i would like to nothing hill
i would like to notting hill
where is not a a hairdryer
i would like tickets for the mastercard please
i do not note
i do not not
can i will have a single room please
this is my pass card
two like tickets please
i ticket please
i do not want the
this is not what i have a water
the not what i water
that is not what i bar
this is not what i bar
this is not what white i card
i have only big note
i want please
i want please
i could not find a hairdryer
i pay is there a reservation
i have reservation
i have not got a reservation
i have not a reservation
where i have not have i have no
i do not have a reservation any
i do not i do not have a reservation
i have reservation
i do not i do not have a reservation
i am good
i have reservation
i have not a reservation
i do not have a reservation is bern
i do not where is bern
i have not a reservation
the i do not have a reservation i pay by is my name is
i have not a reservation
i have

437

# 5. Unqiue Words

In [180]:
unique_prompts_res = {}
counter =0

def get_unique_sentence(sentence):
    trans_sentence = ""
    dict_words = {}
    words = sentence.split(" ")
    for word in words:
        if word in dict_words:
            continue
        dict_words[word] = 1
        trans_sentence += word + " "
    return trans_sentence[:-1]

for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue
        if "'" in sentence:
            sentence = sentence_to_normal_case(sentence)
            #print(dict_prompt['transcript'])
            
        sentence = ' '.join(sentence.split())
        if sentence not in prompt_response_map[prompt_unit]:
            yes_no_sentence = remove_yes_no(sentence)
            if yes_no_sentence not in prompt_response_map[prompt_unit]:
                
                unique_sentence = get_unique_sentence(sentence)
                unique_yes_no_sentence = get_unique_sentence(yes_no_sentence)
                if unique_sentence not in prompt_response_map[prompt_unit] or unique_yes_no_sentence not in prompt_response_map[prompt_unit]:
                    counter +=1
                    unique_prompts_res[dict_prompt['id']] = sentence
                #if sentence == "Sag: Ich möchte mit Kreditkarte bezahlen":
                    #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
counter

can i have not a room for one room
i would like to nothing hill
i would like to notting hill
where is not a a hairdryer
i would like tickets for the mastercard please
i do not note
i do not not
can i will have a single room please
this is my pass card
two like tickets please
i ticket please
i do not want the
this is not what i have a water
the not what i water
that is not what i bar
this is not what i bar
this is not what white i card
i have only big note
i want please
i want please
i could not find a hairdryer
i pay is there a reservation
i have reservation
i have not got a reservation
i have not a reservation
where i have not have i have no
i do not have a reservation any
i do not i do not have a reservation
i have reservation
i do not i do not have a reservation
i am good
i have reservation
i have not a reservation
i do not have a reservation is bern
i do not where is bern
i have not a reservation
the i do not have a reservation i pay by is my name is
i have not a reservation
i have

432

## 5.1 Unique Words (5.) compare Special Case to No Case (3.)

In [82]:
diff_of_two_maps(false_prompt_sc_to_nc, unique_prompts_res)

{'3676': 'i want a a double room',
 '3986': 'no i do not no i do not have a reservation',
 '4015': 'no i do not no i do not have a reservation',
 '4020': 'is there a a lift',
 '4533': 'can i pay with visa visa'}

# 6. Pos Map of RG

In [84]:
def nlp_sentence(sentence):
    parsed = parser(sentence)
    lemmas = []
    words = []
    tags = []
    poss = []
    for i, token in enumerate(parsed):
        lemma = token.lemma_
        words.append(token)
        lemmas.append(lemma)
        tag = token.tag_
        pos = token.tag_
        tags.append(tag)
        poss.append(pos)
    return words, lemmas, tags, poss

In [130]:
def insert(tree, key, value):
    #print(key)
    if key:
        first, rest = key[0], key[1:]
        if first not in tree:
            tree[first] = {}
        insert(tree[first], rest, value)
    else:
        tree['key'] = True

In [131]:
tree = {}
tag_array = nlp_sentence("i want a a double room")[2]
insert(tree, tag_array, "true")

In [132]:
tree = {}
for item in prompt_response_map:
    
    for response in prompt_response_map[item]:
        tags = nlp_sentence(response)[2]
        insert(tree, tags, "true")

In [133]:
tree

{'CD': {'JJ': {'NN': {'UH': {'key': True}, 'key': True},
   'NNS': {'VBP': {'key': True}, 'key': True}},
  'NN': {'IN': {'DT': {'NN': {'IN': {'DT': {'NN': {'UH': {'key': True},
         'key': True}}},
      'NN': {'UH': {'key': True}, 'key': True},
      'UH': {'key': True},
      'key': True}},
    'JJ': {'NN': {'key': True}, 'UH': {'key': True}, 'key': True},
    'NN': {'NN': {'UH': {'key': True}, 'key': True},
     'UH': {'key': True},
     'key': True},
    'RB': {'NN': {'UH': {'key': True}, 'key': True}},
    'VB': {'key': True}},
   'UH': {'key': True},
   'key': True},
  'NNS': {'IN': {'DT': {'NN': {'IN': {'DT': {'NN': {'UH': {'key': True},
         'key': True}}},
      'NN': {'UH': {'key': True}, 'key': True},
      'UH': {'key': True},
      'key': True}},
    'JJ': {'UH': {'key': True}, 'key': True},
    'NN': {'NN': {'UH': {'key': True}, 'key': True},
     'UH': {'key': True},
     'key': True},
    'RB': {'NN': {'key': True}},
    'VB': {'key': True}},
   'UH': {'key': Tr

In [213]:
def existintree(tree, array):
    if len(array) == 0:
        return False
    if array[0] not in tree:
        return False
    if array[0] in tree:
        if 'key' in tree[array[0]] and len(array) == 1:
            return True
    #print(tree[array[0]])
    return existintree(tree, array[1:])

#'CD': {'JJ': {'NN': {'UH'
#existintree(tree, ["CD", "asdf", "JJ", "NN"])

In [220]:
tag_array = nlp_sentence("i want a double room")[2]
#for tag in tag_array:
print(tag_array)

existintree(tree, tag_array)

['PRP', 'VBP', 'DT', 'JJ', 'NN']


True

In [221]:
exist_in_tree_map = {}
counter =0
for prompt_unit in prompt_test_map:
    for dict_prompt in prompt_test_map[prompt_unit]:
        
        sentence = dict_prompt['transcript']
        if "***" in sentence:
            continue
        if "'" in sentence:
            sentence = sentence_to_normal_case(sentence)
            #print(dict_prompt['transcript'])
            
        sentence = ' '.join(sentence.split())
        if sentence not in prompt_response_map[prompt_unit]:
            yes_no_sentence = remove_yes_no(sentence)
            if yes_no_sentence not in prompt_response_map[prompt_unit]:
                
                unique_sentence = get_unique_sentence(sentence)
                unique_yes_no_sentence = get_unique_sentence(yes_no_sentence)
                if unique_sentence not in prompt_response_map[prompt_unit] or unique_yes_no_sentence not in prompt_response_map[prompt_unit]:
                    if existintree(tree, nlp_sentence(unique_yes_no_sentence)[2]) == False:
                        counter +=1
                        exist_in_tree_map[dict_prompt['id']] = sentence
                #if sentence == "Sag: Ich möchte mit Kreditkarte bezahlen":
                    #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
counter
#exist_in_tree_map

can i have not a room for one room
i would like to nothing hill
i would like to notting hill
where is not a a hairdryer
i would like tickets for the mastercard please
i do not note
i do not not
can i will have a single room please
this is my pass card
two like tickets please
i ticket please
i do not want the
this is not what i have a water
the not what i water
that is not what i bar
this is not what i bar
this is not what white i card
i have only big note
i want please
i want please
i could not find a hairdryer
i pay is there a reservation
i have reservation
i have not got a reservation
i have not a reservation
where i have not have i have no
i do not have a reservation any
i do not i do not have a reservation
i have reservation
i do not i do not have a reservation
i am good
i have reservation
i have not a reservation
i do not have a reservation is bern
i do not where is bern
i have not a reservation
the i do not have a reservation i pay by is my name is
i have not a reservation
i have

254

In [222]:
diff_of_two_maps(unique_prompts_res, exist_in_tree_map)

{'3574': 'have a fitness studio',
 '3595': 'pay a i would like pommes',
 '3602': 'i would like a hotel',
 '3608': 'i would like please',
 '3610': 'no i have you the gym',
 '3621': 'i am a',
 '3634': 'i ticket please',
 '3637': 'have a room',
 '3642': 'this is there the hotel bar',
 '3647': 'i have a brothers',
 '3648': 'i have a brown boots',
 '3651': 'this is my passport',
 '3671': 'no i pay is there a reservation',
 '3679': 'have you have a pool',
 '3712': 'where is my passport',
 '3713': 'that is like post card',
 '3716': 'i want pay',
 '3726': 'single room',
 '3727': 'single room',
 '3729': 'i do not note',
 '3768': 'is there a room',
 '3770': 'i have no reservation',
 '3773': 'i have not got a reservation',
 '3774': 'i have not a reservation',
 '3779': 'where no i have not have i have no',
 '3787': 'is there a bar',
 '3789': 'where is there a bar',
 '3794': 'a twin room',
 '3804': 'i would like visa',
 '3807': 'no i do not have a reservation any',
 '3817': 'the is my passport',
 '

In [194]:
print(nlp_sentence("can i have a hotel room for one nights")[2])
len(tree['MD']['PRP']['VB']['DT']['NN']['NN']['UH'])

['MD', 'PRP', 'VB', 'DT', 'NN', 'NN', 'IN', 'CD', 'NNS']


1

In [182]:
len(unique_prompts_res)

432

In [183]:

#diff_of_two_maps(unique_prompts_res, exist_in_tree_map)

342


In [None]:
exist_in_tree_map