In [260]:
import scripts.utils.nlp_utils as nlp
import scripts.utils.grammar as gra
import scripts.utils.string_handling as string_hand
import scripts.utils.data_handler as data_hand

# Load Test Data

In [261]:
test_data = data_hand.read_test_data("textProcessing_testKaldi.csv")
test_data

{'Frag: 1 Musical-Ticket': [{'id': '3835',
   'transcript': 'i need one musical tickets'},
  {'id': '3898', 'transcript': 'can i have one musical tickets'},
  {'id': '3899', 'transcript': 'a ticket for wednesday evening'},
  {'id': '4415', 'transcript': 'can i have one musical tickets'},
  {'id': '4548', 'transcript': 'can i have one musical tickets'}],
 'Frag: 1 Ticket': [{'id': '3834', 'transcript': 'i need one ticket'},
  {'id': '3850', 'transcript': 'i need one ticket'},
  {'id': '3997', 'transcript': 'i need want tickets'}],
 'Frag: 2 Musical-Tickets': [{'id': '3623',
   'transcript': 'two like tickets please'},
  {'id': '3719', 'transcript': 'i want two musical tickets'},
  {'id': '3845', 'transcript': 'i need two musical tickets'},
  {'id': '3992', 'transcript': 'i need two musical tickets'},
  {'id': '3998', 'transcript': 'i need two musical ticket'}],
 'Frag: 2 Tickets': [{'id': '3670', 'transcript': 'i want two tickets please'},
  {'id': '3844', 'transcript': 'i need two tick

In [262]:
test_data = data_hand.read_test_data("kaldi_test_data.tsv")
test_data

{'Frag: 1 Musical-Ticket': [{'id': '3835',
   'transcript': 'i need one musical tickets'},
  {'id': '3898', 'transcript': 'can i have one musical tickets'},
  {'id': '3899', 'transcript': 'a ticket for wednesday evening'},
  {'id': '4415', 'transcript': 'can i have one musical tickets'},
  {'id': '4548', 'transcript': 'can i have one musical tickets'}],
 'Frag: 1 Ticket': [{'id': '3834', 'transcript': 'i need one ticket'},
  {'id': '3850', 'transcript': 'i need one ticket'},
  {'id': '3997', 'transcript': 'i need want tickets'}],
 'Frag: 2 Musical-Tickets': [{'id': '3623',
   'transcript': 'two like tickets please'},
  {'id': '3719', 'transcript': 'i want two musical tickets'},
  {'id': '3845', 'transcript': 'i need two musical tickets'},
  {'id': '3992', 'transcript': 'i need two musical tickets'},
  {'id': '3998', 'transcript': 'i need two musical ticket'}],
 'Frag: 2 Tickets': [{'id': '3670', 'transcript': 'i want two tickets please'},
  {'id': '3844', 'transcript': 'i need two tick

# Read Reference Grammar and Diff Grammar

In [263]:
reference_grammar = gra.read_grammar_and_create_map('referenceGrammar.xml')
diff_grammar = gra.read_grammar_and_create_map('diff_rg_1.xml')
grammar = gra.merge_grammars(reference_grammar, diff_grammar)

In [264]:
string_hand.clear_sentence("no i will still have a ticket for billy elliot please")

'i will still have a ticket for billy elliot'

# Apply Reference Grammar And Preprocessing And Unqiue

In [265]:
false_counter = 0
correct_counter = 0

false_prompts = {}
safe_prompts = []
for prompt_unit in test_data:
    
    for dict_prompt in test_data[prompt_unit]:     
        try:
            transcript = dict_prompt['transcript']
            sentence = string_hand.clear_sentence(transcript)
        except:
            print(dict_prompt)
        if "***" in sentence:
            continue

        unique_sentence = string_hand.get_unique_sentence(sentence)

        if sentence not in grammar[prompt_unit] and unique_sentence not in grammar[prompt_unit]:
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit),"transcript": transcript, "processed": sentence}
            false_counter += 1
            if prompt_unit in false_prompts:
                false_prompts[prompt_unit].append(item)
            else:
                arr = []
                arr.append(item)
                false_prompts[prompt_unit] = arr
            #false_prompts[dict_prompt['id']] = 
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
        else:
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit), "transcript": transcript, "processed": sentence, "method": "RG", "language": "correct", "meaning": "correct"}
            safe_prompts.append(item)
print("Correct: %s" % str(len(safe_prompts)))
print("False: %s" % str(false_counter))

{'transcript': 'no', 'id': '3892'}
{'transcript': 'no', 'id': '4251'}
Correct: 434
False: 562


In [266]:
safe_prompts

[{'id': '3700',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '3743',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '4398',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want one ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want one ticket to london'},
 {'id': '3995',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i would like to sit in the fifth row',
  'prompt': 'Sag: Ich möchte in der fünften Reihe sitzen',
  'transcript': 'i would like to sit in the fifth row'},
 {'id': '4420',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'proce

# Cluster Approach

In [267]:
def count_false_items():
    counter = 0
    for key in false_prompts:
        counter += len(false_prompts[key])
    return counter

In [268]:
count_false_items()

562

In [269]:
print(len(safe_prompts))

434


In [270]:
false_prompts['Sag: Ich möchte mit Dollars bezahlen'][0]

{'id': '3599',
 'processed': 'can i buy by post',
 'prompt': 'Sag: Ich möchte mit Dollars bezahlen',
 'transcript': 'can i buy by post'}

## Credit Card Cluster

In [271]:
import scripts.credit_card as credit_card

In [272]:
test_prompts = ["Sag: Ich möchte mit Dollars bezahlen",
"Sag: Ich möchte mit Euros bezahlen",
"Sag: Ich möchte mit Kreditkarte bezahlen",
"Sag: Ich möchte mit Mastercard bezahlen",
"Sag: Ich möchte mit Postkarte bezahlen",
"Sag: Ich möchte mit Visa bezahlen",
"Sag: Ich möchte mit Pfund bezahlen",
"Sag: Ich möchte mit Schweizer Franken bezahlen"]

false_credit_card_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [273]:
def remove_from_false_prompts(false_prompts, key, id_, method="credit card cluster", meaning="incorrect", language="incorrect"):
    if key in false_prompts:
        items = false_prompts[key]
        for index in range(0, len(items)):
            if items[index]['id'] == id_:
                item = {"id": id_, "prompt": str(key), "transcript": items[index]['transcript'], "processed": items[index]["processed"], "method": method, "language": language, "meaning": meaning}
                safe_prompts.append(item)
                del false_prompts[key][index]
                break 

In [275]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))
for key in test_prompts:
    for item in false_credit_card_prompts[key]:
        try:
            clear_item = string_hand.clear_sentence(item["processed"])
            transcript =string_hand.clear_sentence(item["transcript"])
        except:
            continue
        if credit_card.accept_credit_card(clear_item) == False and credit_card.accept_credit_card(transcript) == False:
            false_counter += 1 
            remove_from_false_prompts(false_prompts, key, item['id'])
        else:
            remove_from_false_prompts(false_prompts, key, item['id'], meaning="correct", language="correct")
            correct_counter += 1
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

197
Correct: 4
False: 57
495


In [276]:
safe_prompts

[{'id': '3700',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '3743',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '4398',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want one ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want one ticket to london'},
 {'id': '3995',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i would like to sit in the fifth row',
  'prompt': 'Sag: Ich möchte in der fünften Reihe sitzen',
  'transcript': 'i would like to sit in the fifth row'},
 {'id': '4420',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'proce

## Restarant Cluster

In [277]:
import scripts.restarant as resta

In [278]:
test_prompts = ["Frag: Ich möchte die Rechnung", "Frag: Ich möchte die Dessertkarte"]
false_restarant_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [279]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))
for key in test_prompts:
    for item in false_restarant_prompts[key]:
        try:
            clear_item = string_hand.clear_sentence(item["processed"])
            transcript =string_hand.clear_sentence(item["transcript"])
        except:
            continue
        if resta.accept_restarant(clear_item) == False and resta.accept_restarant(transcript) == False:
            false_counter += 1  
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster")
        else:
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster", meaning="correct", language="correct")
            correct_counter += 1
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

197
Correct: 0
False: 13
508


In [280]:
false_restarant_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [281]:
count_false_items()

488

In [282]:
safe_prompts

[{'id': '3700',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '3743',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '4398',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want one ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want one ticket to london'},
 {'id': '3995',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i would like to sit in the fifth row',
  'prompt': 'Sag: Ich möchte in der fünften Reihe sitzen',
  'transcript': 'i would like to sit in the fifth row'},
 {'id': '4420',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'proce

# Meaning Map

In [283]:
def is_slice_in_list(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    return any(s == l[i:len_s+i] for i in range(len(l) - len_s+1))

def is_slice_in_list2(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    for i in range(len(l) - len_s+1):
        if s==l[i:len_s+i]:
            return i
        
def extract_by_pattern(patterns, tags, words):
    extracted_words = []
    tags_string = " ".join(tags)
    for pattern in patterns:
        pattern_string = " ".join(pattern)
        if is_slice_in_list(pattern, tags):
            pattern_start_index = is_slice_in_list2(pattern, tags)
            if len(pattern) > 1:
                pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                        extracted_part = ""
                        for i in range(pattern_start_index, pattern_end_index+1):
                            extracted_part += words[i] + " "
                        extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
            if len(pattern) > 1:
                pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                if pattern_end_index - pattern_start_index +1 != len(pattern):
                    pattern_end_index = tags.index(pattern[-1], pattern_end_index+1)
                    
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                    extracted_part = ""
                    for i in range(pattern_start_index, pattern_end_index+1):
                        extracted_part += words[i] + " "
                        

                    extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
    return extracted_words
    
def extract_key_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['NN', 'NNS'], ['NN', 'NN'], ['RB', 'NN'], ['NN'], ['NNS']]
    
    # Only when no prio_one_pattern fits
    prio_sec_patterns = [['NN'], ['NNS']]
    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    if len(extracted_words) < 1:
        extracted_words = extract_by_pattern(prio_sec_patterns, tags, words)
    return extracted_words

extract_key_nouns(nlp_t_prompt)

['wednesday evening', 'ticket']

In [204]:
prompt_noun_map = {}
for prompt in grammar:
    try:
        extracted_nouns = []
        for response in grammar[prompt]:
            nlp_s = nlp.nlp_sentence(response)
            nouns = extract_key_nouns(nlp_s)
            if len(nouns) > 0:
                extracted_nouns.extend(nouns)
        prompt_noun_map[prompt] = list(set(extracted_nouns))
    except:
        print(set(extracted_nouns))

In [205]:
nlp_s = nlp.nlp_sentence(list(grammar["Frag trotzdem: ein Ticket für Mamma Mia"])[0])
print(nlp_s)
print(extract_key_nouns(nlp_s))

([can, i, have, a, ticket, for, mamma, mia], ['can', 'i', 'have', 'a', 'ticket', 'for', 'mamma', 'mia'], ['MD', 'PRP', 'VB', 'DT', 'NN', 'IN', 'NN', 'NN'], ['MD', 'PRP', 'VB', 'DT', 'NN', 'IN', 'NN', 'NN'])
['mamma mia', 'ticket']


In [124]:
prompt_noun_map

{'Auf Wiedersehen': ['goodbye'],
 'Danke': ['lot', 'thanks'],
 'Frag trotzdem: ein Ticket für Billy Elliot': ['ticket', 'billy elliot'],
 'Frag trotzdem: ein Ticket für König der Löwen': ['lion king', 'ticket'],
 'Frag trotzdem: ein Ticket für Mamma Mia': ['ticket', 'mamma mia'],
 'Frag trotzdem: ein Ticket für Oliver Twist': ['oliver twist', 'ticket'],
 'Frag trotzdem: ein Ticket für Starlight Express': ['starlight express',
  'ticket'],
 'Frag trotzdem: ein Ticket für Wicked': ['ticket'],
 'Frag trotzdem: ein Ticket für das Phantom der Oper': ['ticket'],
 'Frag: 1 Musical-Ticket': ['ticket'],
 'Frag: 1 Ticket': ['ticket', 'one', 'one ticket', 'ticket ticket'],
 'Frag: 1 Ticket für das Phantom der Oper': ['ticket', 'one', 'one ticket'],
 'Frag: 2 Musical-Tickets': ['tickets'],
 'Frag: 2 Tickets': ['tickets'],
 'Frag: 2 Tickets für Billy Elliot': ['tickets',
  'billy elliot',
  'billy',
  'elliot'],
 'Frag: 2 Tickets für Dienstagabend': ['tuesday',
  'tickets',
  'tuesday evening',
  '

# Magic

In [175]:
import copy

In [176]:
def insert(tree, key, value):
    #print(key)
    if key:
        first, rest = key[0], key[1:]
        if first not in tree:
            tree[first] = {}
        insert(tree[first], rest, value)
    else:
        tree['key'] = True

In [177]:
tree = {}
for prompt_unit in grammar:
    
    for response in grammar[prompt_unit]:
        tags = nlp.nlp_sentence(response)[2]
        insert(tree, tags, "true")

In [23]:
def existintree(tree, array, rest_tree):
    if len(array) == 0:
        return False
    if array[0] not in rest_tree:
        return False
    if array[0] in rest_tree:
        if 'key' in rest_tree[array[0]] and len(array) == 1:
            return True
    #print(tree[array[0]])
    
    return existintree(tree, array[1:], rest_tree[array[0]])

In [284]:
def sing_plu(nlp_sent):
    words = nlp_sent[0]
    tags = nlp_sent[2]
    if 'CD' in tags:
        index_cd = tags.index('CD')
        if str(words[index_cd]) == 'one':
            if 'NNS' not in tags:
                return True
            else:
                return False
        else:
            if len(words) > index_cd + 1 and str(tags[index_cd+1]) == 'NNS':
                return True
            
            elif 'NN' not in tags:
                return True
            else:
                return False 
    return True

def iter_items(prompts, prompt_unit):
    magic_accepted_prompts = []
    counter = 0
    for prompt in prompts:
        processed = prompt["processed"]
        transcript = prompt["transcript"]
        id_ = prompt["id"]
        nlp_processed = nlp.nlp_sentence(processed)
        nlp_processed_tags = nlp_processed[2]
        nlp_transcript = nlp.nlp_sentence(transcript)
        nlp_transcript_tags = nlp_transcript[2]
        if existintree(tree, nlp_processed_tags, copy.deepcopy(tree)) == True or \
        existintree(tree, nlp_transcript_tags, copy.deepcopy(tree)) == True:
            if sing_plu(nlp_processed) == True or sing_plu(nlp_transcript) == True:
                item = {"id": id_ , "transcript": transcript, "processed": processed}
                
                magic_accepted_prompts.append(item)
                extracted_nouns = extract_key_nouns(nlp_processed)
                for noun in extracted_nouns:
                    if noun in prompt_noun_map[prompt_unit]:
                        print(item)
                        counter += 1
                        remove_from_false_prompts(false_prompts, key, item['id'], method="magic", meaning="correct", language="correct")
                        
                
    return magic_accepted_prompts, counter

In [285]:
correct_counter = 0
magic_accepted_prompts_map = {}
for key in false_prompts:
    accepted_prompts, counter = iter_items(false_prompts[key], key)
    magic_accepted_prompts_map[key] = accepted_prompts
    correct_counter += counter
print(correct_counter)

{'transcript': 'i have a ticket to london', 'processed': 'i have a ticket to london', 'id': '4685'}
{'transcript': 'my name is card', 'processed': 'my name is card', 'id': '4478'}
{'transcript': 'i want a ticket for mamma mia', 'processed': 'i want a ticket for mamma mia', 'id': '3838'}
{'transcript': 'i want a ticket for mamma mia', 'processed': 'i want a ticket for mamma mia', 'id': '3838'}
{'transcript': 'i want tickets for tuesday evening', 'processed': 'i want tickets for tuesday evening', 'id': '3837'}
{'transcript': 'a red for four tickets', 'processed': 'a red for four tickets', 'id': '4261'}
{'transcript': 'i will pay four musical tickets', 'processed': 'i will pay four musical tickets', 'id': '4314'}
{'transcript': 'the capital of a market', 'processed': 'the capital of a market', 'id': '4493'}
{'transcript': 'i would like to sit in the sixth row', 'processed': 'i would like to sit in the sixth row', 'id': '3632'}
{'transcript': 'i want a rare steak', 'processed': 'i want a r

In [238]:
print(len(safe_prompts))
safe_prompts

515


[{'id': '3700',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '3743',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want a ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want a ticket to london'},
 {'id': '4398',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i want one ticket to london',
  'prompt': 'Frag: ein Ticket nach London',
  'transcript': 'i want one ticket to london'},
 {'id': '3995',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'processed': 'i would like to sit in the fifth row',
  'prompt': 'Sag: Ich möchte in der fünften Reihe sitzen',
  'transcript': 'i would like to sit in the fifth row'},
 {'id': '4420',
  'language': 'correct',
  'meaning': 'correct',
  'method': 'RG',
  'proce

In [286]:
with open("annotated_test_data.csv", "w") as writer:
    for item in safe_prompts:
        writer.write(item['id'] + "\t" + item['method'] + "\t" + item["prompt"] + "\t" +item['processed'] + "\t" +item['language']+ "\t" + item['meaning'] + "\n" )
    for key in false_prompts:
        for item in false_prompts[key]:
            writer.write(item['id'] + "\t" + "no method" + "\t"+ key + "\t" + item['processed'] + "\t" +'incorrect'+ "\t" + 'incorrect' + "\n" )

In [127]:
prompt_noun_map['Frag: 1 Musical-Ticket']

['ticket']

In [126]:
t_prompt = magic_accepted_prompts_map['Frag: 1 Musical-Ticket'][0]["transcript"]
print(t_prompt)
nlp_t_prompt = nlp.nlp_sentence(t_prompt)
print(nlp_t_prompt)
extract_key_nouns(nlp_t_prompt)

a ticket for wednesday evening
([a, ticket, for, wednesday, evening], ['a', 'ticket', 'for', 'wednesday', 'evening'], ['DT', 'NN', 'IN', 'NN', 'NN'], ['DT', 'NN', 'IN', 'NN', 'NN'])


['wednesday evening', 'ticket']

In [85]:
print(false_prompts['Frag: 1 Musical-Ticket'])
iter_items(false_prompts['Frag: 1 Musical-Ticket'])

[{'transcript': 'i need one musical tickets', 'processed': 'i need one musical tickets', 'id': '3835'}, {'transcript': 'can i have one musical tickets', 'processed': 'can i have one musical tickets', 'id': '3898'}, {'transcript': 'a ticket for wednesday evening', 'processed': 'a ticket for wednesday evening', 'id': '3899'}, {'transcript': 'can i have one musical tickets', 'processed': 'can i have one musical tickets', 'id': '4415'}, {'transcript': 'can i have one musical tickets', 'processed': 'can i have one musical tickets', 'id': '4548'}]
{'transcript': 'i need one musical tickets', 'processed': 'i need one musical tickets', 'id': '3835'}
i need one musical tickets
i need one musical tickets
{'transcript': 'can i have one musical tickets', 'processed': 'can i have one musical tickets', 'id': '3898'}
can i have one musical tickets
can i have one musical tickets
{'transcript': 'a ticket for wednesday evening', 'processed': 'a ticket for wednesday evening', 'id': '3899'}
a ticket for w

([{'id': '3899',
   'processed': 'a ticket for wednesday evening',
   'transcript': 'a ticket for wednesday evening'}],
 1)