In [None]:
import scripts.utils.nlp_utils as nlp
import scripts.utils.grammar as gra
import scripts.utils.string_handling as string_hand
import scripts.utils.data_handler as data_hand

# Load Test Data

In [None]:
#test_data = data_hand.read_test_data("textProcessing_testKaldi.csv")
test_data = data_hand.read_test_data("kaldi_test_data_real_v1.tsv")

# Read Reference Grammar and Diff Grammar

In [None]:
reference_grammar = gra.read_grammar_and_create_map('referenceGrammar.xml')
diff_grammar = gra.read_grammar_and_create_map('diff_rg_1.xml')
grammar = gra.merge_grammars(reference_grammar, diff_grammar)

In [None]:
string_hand.clear_sentence("no i will still have a ticket for billy elliot please")

# Meaning Map

In [None]:
def is_slice_in_list(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    return any(s == l[i:len_s+i] for i in range(len(l) - len_s+1))

def is_slice_in_list2(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    for i in range(len(l) - len_s+1):
        if s==l[i:len_s+i]:
            return i
        
def extract_by_pattern(patterns, tags, words):
    extracted_words = []
    tags_string = " ".join(tags)
    for pattern in patterns:
        pattern_string = " ".join(pattern)
        if is_slice_in_list(pattern, tags):
            pattern_start_index = is_slice_in_list2(pattern, tags)
            if len(pattern) > 1:
                #pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                pattern_end_index = pattern_start_index + len(pattern) -1
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                        extracted_part = ""
                        for i in range(pattern_start_index, pattern_end_index+1):
                            extracted_part += words[i] + " "
                        extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
            if len(pattern) > 1:
                pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                if pattern_end_index - pattern_start_index +1 != len(pattern):
                    pattern_end_index = tags.index(pattern[-1], pattern_end_index+1)
                    
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                    extracted_part = ""
                    for i in range(pattern_start_index, pattern_end_index+1):
                        extracted_part += words[i] + " "
                        

                    extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
    return extracted_words
    

def extract_key_dt_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['DT', 'NN', 'NN'], ['DT', 'NN', 'NNS'],
                         ['DT', 'NN']]

    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    return extracted_words
    
def extract_key_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['PRP$', 'NN', 'NN'], ['PRP$', 'NN', 'NNS'], ['PRP$', 'NNS', 'NNS'],\
                         ['PRP$', 'NN'], ['PRP$', 'NNS'], ['NN', 'NNS'], ['NN', 'NN'], ['RB', 'NN'], ['JJ', 'NNS'],\
                         ['JJ', 'NN'], ['NN'], ['NNS']]
    
    # Only when no prio_one_pattern fits
    prio_sec_patterns = [['NN'], ['NNS']]
    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    if len(extracted_words) < 1:
        extracted_words = extract_by_pattern(prio_sec_patterns, tags, words)
    return extracted_words


def generate_dt_nouns_by_key_nouns(nlp_nouns):
    dts = ['a', 'the']
    patterns = [['NN', 'NN'], ['NN', 'NNS'], ['NN']]
    tags = nlp_nouns[2]
    words = nlp.spacy_words_to_string_array(nlp_nouns[0])
    
    generated_words = []
    for pattern in patterns:
        if pattern == tags:
            for dt in dts:
                sentence = dt + " " +" ".join(words)
                generated_words.append(sentence)
    return generated_words

def generalise_aux_verb(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    patterns = [['PRP', 'MD', 'VB', 'TO'], ['PRP', 'MD', 'VB', 'DT'], ['PRP', 'VBP', 'TO']]

    extracted_words = extract_by_pattern(patterns, tags, words)
    return extracted_words 
    
def create_meaning_map(grammar):
    prompt_noun_map = {}
    for prompt in grammar:
        try:
            extracted_nouns = []
            for response in grammar[prompt]:
                nlp_s = nlp.nlp_sentence(response)
                nouns = extract_key_nouns(nlp_s)
                if len(nouns) > 0:
                    extracted_nouns.extend(nouns)
            prompt_noun_map[prompt] = list(set(extracted_nouns))
        except:
            print(prompt)
    return prompt_noun_map

## Extract Aux verbs from total reference grammar

In [None]:
found_aux_verb = []
for key in grammar:
    for item in grammar[key]:
        nlp_s = nlp.nlp_sentence(item)
        aux_verb = generalise_aux_verb(nlp_s)
        if len(aux_verb) > 0:
            found_aux_verb.extend(aux_verb)
print(len(set(found_aux_verb)))

In [None]:
list(set(found_aux_verb))

In [None]:
generalise_aux_verb(nlp.nlp_sentence("I wish to pay by card"))

In [None]:
prompt_noun_map = create_meaning_map(grammar)

# Apply Reference Grammar And Preprocessing And Unqiue

In [None]:
false_counter = 0
correct_counter = 0

false_prompts = {}
safe_prompts = []
for prompt_unit in test_data:
    
    for dict_prompt in test_data[prompt_unit]: 
        transcript = dict_prompt['transcript']
        sentence = transcript
        if "***" in sentence:
            sentence = sentence.replace("***", "")
        
        processed = sentence
        
        
        try:
            processed = string_hand.clear_sentence(transcript)
        except:
            print(dict_prompt)


        unique_sentence = string_hand.get_unique_sentence(sentence)
        if dict_prompt['id'] == '3796':
            print(transcript)
            print(unique_sentence)

        if sentence not in grammar[prompt_unit] and  \
            processed not in grammar[prompt_unit] and \
            unique_sentence not in grammar[prompt_unit]:
                
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit),"transcript": transcript, "processed": processed, "unique": unique_sentence}
            false_counter += 1
            if prompt_unit in false_prompts:
                false_prompts[prompt_unit].append(item)
            else:
                arr = []
                arr.append(item)
                false_prompts[prompt_unit] = arr
            #false_prompts[dict_prompt['id']] = 
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
        else:
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit), "transcript": transcript, "processed": processed, "unique": unique_sentence, "method": "RG", "language": True, "meaning": True}
            safe_prompts.append(item)
print("Correct: %s" % str(len(safe_prompts)))
print("False: %s" % str(false_counter))

In [None]:
safe_prompts

# Cluster Approach

In [None]:
def count_false_items():
    counter = 0
    for key in false_prompts:
        counter += len(false_prompts[key])
    return counter

In [None]:
count_false_items()

In [None]:
print(len(safe_prompts))

In [None]:
false_prompts['Sag: Ich möchte mit Dollars bezahlen'][0]

## Credit Card Cluster

In [133]:
import scripts.credit_card as credit_card

In [134]:
def meaning_is_correct(prompt_unit, transcript, clear_transcript, unique_sentence):
    nlp_transcript = nlp.nlp_sentence(transcript)
    nlp_clear = nlp.nlp_sentence(transcript)
    nlp_unqiue = nlp.nlp_sentence(unique_sentence)
    extracted_nouns_t = extract_key_nouns(nlp_transcript)
    extracted_nouns_c = extract_key_nouns(nlp_clear)
    extracted_nouns_u = extract_key_nouns(nlp_unqiue)
    
    for noun in extracted_nouns_t:
        try:
            if noun in prompt_noun_map[prompt_unit]:
                return True
        except:
            print(prompt_unit)
            print(prompt_noun_map[prompt_unit])
            print(noun)
    for noun in extracted_nouns_c:
        if noun in prompt_noun_map[prompt_unit]:
            return True
    for noun in extracted_nouns_u:
        if noun in prompt_noun_map[prompt_unit]:
            return True
    return False
        #remove_from_false_prompts(false_prompts, key, item['id'], method="magic", meaning="correct", language="correct")
    

meaning_is_correct("Sag: Ich möchte mit Mastercard bezahlen", "i want pay with the master card", "i want pay with the master card", "i want pay with the master card")
#prompt_noun_map["Sag: Ich möchte mit Kreditkarte bezahlen"]

True

In [135]:
test_prompts = ["Sag: Ich möchte mit Dollars bezahlen",
"Sag: Ich möchte mit Euros bezahlen",
"Sag: Ich möchte mit Kreditkarte bezahlen",
"Sag: Ich möchte mit Mastercard bezahlen",
"Sag: Ich möchte mit Postkarte bezahlen",
"Sag: Ich möchte mit Visa bezahlen",
"Sag: Ich möchte mit Pfund bezahlen",
"Sag: Ich möchte mit Schweizer Franken bezahlen"]

false_credit_card_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [136]:
def remove_from_false_prompts(false_prompts, key, id_, method="credit card cluster", meaning=False, language=False):
    if key in false_prompts:
        items = false_prompts[key]
        for index in range(0, len(items)):
            if items[index]['id'] == id_:
                item = {"id": id_, 
                        "prompt": str(key), 
                        "transcript": items[index]['transcript'], 
                        "processed": items[index]["processed"], 
                        "method": method, "language": str(language), 
                        "meaning": str(meaning), 
                        "unique": items[index]["unique"]}
                safe_prompts.append(item)
                del false_prompts[key][index]
                return item 

In [137]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))

credit_card_items = []
for prompt_unit in test_prompts:
    for item in false_credit_card_prompts[prompt_unit]:
        meaning = False
        language = False
        try:
            processed =item["processed"]
            unique_sentence = item["unique"]
            transcript = item["transcript"]
        except:
            print(unique_sentence)
            continue
        if credit_card.accept_credit_card(transcript) == False and \
            credit_card.accept_credit_card(processed) == False and \
            credit_card.accept_credit_card(unique_sentence) == False:
            
            false_counter += 1
            #remove_from_false_prompts(false_prompts, key, item['id'])
        else:
            #
            
            language = True
        
        if meaning_is_correct(prompt_unit, transcript, clear_item, unique_sentence):
            meaning = True
            
        
        item = remove_from_false_prompts(false_prompts, prompt_unit, item['id'], meaning=meaning, language=language)
        if item:
            #print(item)
            if item['meaning'] == 'True' and item['language'] == 'True':
                correct_counter += 1
            credit_card_items.append(item)
            
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

162
Correct: 0
False: 0
645


In [None]:
for item in credit_card_items:
    if item['meaning'] == str(True) and item['language']== str(True):
        print(item['id'] + ";accepted;"  + item['language'] + ";" + item['meaning'])
    else:
        print(item['id'] + ";rejected;"  + item['language'] + ";" + item['meaning'])

In [None]:
nouns = []
dt_nouns = []
for prompt in test_prompts:
    for response in grammar[prompt]:
        string_it =" ".join(extract_key_nouns(nlp.nlp_sentence(response)))
        if "ma " in string_it:
            print(response)
            print(prompt)
        else:
            nouns.extend(extract_key_nouns(nlp.nlp_sentence(response)))   
            dt_nouns.extend(extract_key_dt_nouns(nlp.nlp_sentence(response)))   

generated_nouns = []
for noun in list(set(nouns)):
    gen = generate_dt_nouns_by_key_nouns(nlp.nlp_sentence(noun))
    if len(gen) > 0:
        generated_nouns.extend(gen)

In [None]:
list(set(generated_nouns))

In [None]:
for index in range(0, len(correct_items)):
    if correct_items[index]['id'] == '3796':
        print(correct_items[index])

In [128]:
nlp.nlp_sentence("i wish to pay with visa")

([i, wish, to, pay, with, visa],
 ['i', 'wish', 'to', 'pay', 'with', 'visa'],
 ['PRP', 'VBP', 'TO', 'VB', 'IN', 'NN'],
 ['PRP', 'VBP', 'TO', 'VB', 'IN', 'NN'])

In [None]:
safe_prompts[-1]

## Restarant Cluster

In [138]:
import scripts.restarant as resta

In [139]:
test_prompts = ["Frag: Ich möchte die Rechnung", "Frag: Ich möchte die Dessertkarte"]
false_restarant_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [140]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))
for key in test_prompts:
    for item in false_restarant_prompts[key]:
        try:
            processed =item["processed"]
            unique_sentence = item["unique"]
            transcript = item["transcript"]
        except:
            continue
        if resta.accept_restarant(transcript) == False and \
            resta.accept_restarant(unique_sentence) == False and \
            resta.accept_restarant(processed) == False:
            false_counter += 1  
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster")
        else:
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster", meaning="correct", language="correct")
            correct_counter += 1
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

162
Correct: 0
False: 0
645


In [None]:
false_restarant_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [None]:
count_false_items()

In [None]:
safe_prompts

## Tickets Cluster

In [156]:
def extract_by_pattern(patterns, tags, words):
    extracted_words = []
    tags_string = " ".join(tags)
    for pattern in patterns:
        pattern_string = " ".join(pattern)
        if is_slice_in_list(pattern, tags):
            pattern_start_index = is_slice_in_list2(pattern, tags)
            if len(pattern) > 1:
                pattern_end_index = pattern_start_index + len(pattern) -1
                print(pattern_start_index)
                print(pattern_end_index)
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                        extracted_part = ""
                        print("here")
                        for i in range(pattern_start_index, pattern_end_index+1):
                            extracted_part += words[i] + " "
                        extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
            if len(pattern) > 1:
                pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                if pattern_end_index - pattern_start_index +1 != len(pattern):
                    pattern_end_index = tags.index(pattern[-1], pattern_end_index+1)
                    
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                    extracted_part = ""
                    for i in range(pattern_start_index, pattern_end_index+1):
                        extracted_part += words[i] + " "
                        

                    extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
    return extracted_words
def extract_key_dt_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['DT', 'NN', 'NN'], ['DT', 'NN', 'NNS']]

    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    return extracted_words

extract_key_dt_nouns(nlp_s)

6
8
here


['the lion king']

In [150]:
nlp_s = nlp.nlp_sentence(grammar['Frag: 3 Tickets für König der Löwen'][0])
print(nlp_s)
extract_key_dt_nouns(nlp_s)

([can, i, buy, three, tickets, for, the, lion, king], ['can', 'i', 'buy', 'three', 'ticket', 'for', 'the', 'lion', 'king'], ['MD', 'PRP', 'VB', 'CD', 'NNS', 'IN', 'DT', 'NN', 'NN'], ['MD', 'PRP', 'VB', 'CD', 'NNS', 'IN', 'DT', 'NN', 'NN'])


[]

# Magic

In [None]:
import copy

In [None]:
def insert(tree, key, value):
    #print(key)
    if key:
        first, rest = key[0], key[1:]
        if first not in tree:
            tree[first] = {}
        insert(tree[first], rest, value)
    else:
        tree['key'] = True

In [None]:
tree = {}
for prompt_unit in grammar:
    
    for response in grammar[prompt_unit]:
        tags = nlp.nlp_sentence(response)[2]
        insert(tree, tags, "true")

In [None]:
def existintree(tree, array, rest_tree):
    if len(array) == 0:
        return False
    if array[0] not in rest_tree:
        return False
    if array[0] in rest_tree:
        if 'key' in rest_tree[array[0]] and len(array) == 1:
            return True
    #print(tree[array[0]])
    
    return existintree(tree, array[1:], rest_tree[array[0]])

In [None]:
def sing_plu(nlp_sent):
    words = nlp_sent[0]
    tags = nlp_sent[2]
    if 'CD' in tags:
        index_cd = tags.index('CD')
        if str(words[index_cd]) == 'one':
            if 'NNS' not in tags:
                return True
            else:
                return False
        else:
            if len(words) > index_cd + 1 and str(tags[index_cd+1]) == 'NNS':
                return True
            
            elif 'NN' not in tags:
                return True
            else:
                return False 
    return True

def iter_items(prompts, prompt_unit):
    magic_accepted_prompts = []
    counter = 0
    for prompt in prompts:
        processed = prompt["processed"]
        transcript = prompt["transcript"]
        id_ = prompt["id"]
        nlp_processed = nlp.nlp_sentence(processed)
        nlp_processed_tags = nlp_processed[2]
        nlp_transcript = nlp.nlp_sentence(transcript)
        nlp_transcript_tags = nlp_transcript[2]
        if existintree(tree, nlp_processed_tags, copy.deepcopy(tree)) == True or \
        existintree(tree, nlp_transcript_tags, copy.deepcopy(tree)) == True:
            if sing_plu(nlp_processed) == True or sing_plu(nlp_transcript) == True:
                item = {"id": id_ , "transcript": transcript, "processed": processed}
                
                magic_accepted_prompts.append(item)
                extracted_nouns = extract_key_nouns(nlp_processed)
                for noun in extracted_nouns:
                    if noun in prompt_noun_map[prompt_unit]:
                        print(item)
                        counter += 1
                        remove_from_false_prompts(false_prompts, key, item['id'], method="magic", meaning="correct", language="correct")
                        
                
    return magic_accepted_prompts, counter

In [None]:
correct_counter = 0
magic_accepted_prompts_map = {}
for key in false_prompts:
    accepted_prompts, counter = iter_items(false_prompts[key], key)
    magic_accepted_prompts_map[key] = accepted_prompts
    correct_counter += counter
print(correct_counter)

In [None]:
print(len(safe_prompts))
safe_prompts

In [None]:
with open("annotated_kaldi_data_v1.csv", "w") as writer:
    for item in safe_prompts:
        writer.write(item['id'] + "\t" + item['method'] + "\t" + item["prompt"] + "\t" +item['processed'] + "\t" +item['language']+ "\t" + item['meaning'] + "\n" )
    for key in false_prompts:
        for item in false_prompts[key]:
            writer.write(item['id'] + "\t" + "no method" + "\t"+ key + "\t" + item['processed'] + "\t" +'incorrect'+ "\t" + 'incorrect' + "\n" )

In [None]:
prompt_noun_map['Frag: 1 Musical-Ticket']

In [None]:
t_prompt = magic_accepted_prompts_map['Frag: 1 Musical-Ticket'][0]["transcript"]
print(t_prompt)
nlp_t_prompt = nlp.nlp_sentence(t_prompt)
print(nlp_t_prompt)
extract_key_nouns(nlp_t_prompt)

In [None]:
print(false_prompts['Frag: 1 Musical-Ticket'])
iter_items(false_prompts['Frag: 1 Musical-Ticket'])