In [1]:
import scripts.utils.nlp_utils as nlp
import scripts.utils.grammar as gra
import scripts.utils.string_handling as string_hand
import scripts.utils.data_handler as data_hand

Loading spacy ...
Finished


# Load Test Data

In [2]:
#test_data = data_hand.read_test_data("textProcessing_testKaldi.csv")
test_data = data_hand.read_test_data("kaldi_test_data_real_v1.tsv")

# Read Reference Grammar and Diff Grammar

In [3]:
reference_grammar = gra.read_grammar_and_create_map('referenceGrammar.xml')
diff_grammar = gra.read_grammar_and_create_map('diff_rg_1.xml')
grammar = gra.merge_grammars(reference_grammar, diff_grammar)

In [4]:
string_hand.clear_sentence("no i will still have a ticket for billy elliot please")

'i will still have a ticket for billy elliot'

# Meaning Map

In [5]:
def is_slice_in_list(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    return any(s == l[i:len_s+i] for i in range(len(l) - len_s+1))

def is_slice_in_list2(s,l):
    len_s = len(s) #so we don't recompute length of s on every iteration
    for i in range(len(l) - len_s+1):
        if s==l[i:len_s+i]:
            return i
        
def extract_by_pattern(patterns, tags, words):
    extracted_words = []
    tags_string = " ".join(tags)
    for pattern in patterns:
        pattern_string = " ".join(pattern)
        if is_slice_in_list(pattern, tags):
            pattern_start_index = is_slice_in_list2(pattern, tags)
            if len(pattern) > 1:
                #pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                pattern_end_index = pattern_start_index + len(pattern) -1
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                        extracted_part = ""
                        for i in range(pattern_start_index, pattern_end_index+1):
                            extracted_part += words[i] + " "
                        extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
            if len(pattern) > 1:
                pattern_end_index = tags.index(pattern[-1], pattern_start_index+1)
                if pattern_end_index - pattern_start_index +1 != len(pattern):
                    pattern_end_index = tags.index(pattern[-1], pattern_end_index+1)
                    
                if pattern_end_index - pattern_start_index + 1 == len(pattern):
                    extracted_part = ""
                    for i in range(pattern_start_index, pattern_end_index+1):
                        extracted_part += words[i] + " "
                        

                    extracted_words.append(extracted_part[:-1])
            else:
                extracted_words.append(words[pattern_start_index])
            """
    return extracted_words
    

def extract_key_dt_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['DT', 'NN', 'NN'], ['DT', 'NN', 'NNS'],
                         ['DT', 'NN']]

    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    return extracted_words
    
def extract_key_nouns(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    #words = " ".join(nlp_sent[0])
    #[['DT', 'NN', 'NN']]
    prio_one_patterns = [['PRP$', 'NN', 'NN'], ['PRP$', 'NN', 'NNS'], ['PRP$', 'NNS', 'NNS'],\
                         ['PRP$', 'NN'], ['PRP$', 'NNS'], ['NN', 'NNS'], ['NN', 'NN'], ['RB', 'NN'], ['JJ', 'NNS'],\
                         ['JJ', 'NN'], ['NN'], ['NNS']]
    
    # Only when no prio_one_pattern fits
    prio_sec_patterns = [['NN'], ['NNS']]
    extracted_words = extract_by_pattern(prio_one_patterns, tags, words)
    if len(extracted_words) < 1:
        extracted_words = extract_by_pattern(prio_sec_patterns, tags, words)
    return extracted_words


def generate_dt_nouns_by_key_nouns(nlp_nouns):
    #FIX ME a credit cards does work
    dts = ['a', 'the']
    patterns = [['NN', 'NN'], ['NN', 'NNS'], ['NN']]
    tags = nlp_nouns[2]
    words = nlp.spacy_words_to_string_array(nlp_nouns[0])
    
    generated_words = []
    for pattern in patterns:
        if pattern == tags:
            for dt in dts:
                sentence = dt + " " +" ".join(words)
                generated_words.append(sentence)
    return generated_words

def generalise_aux_verb(nlp_sent):
    tags = nlp_sent[2]
    words = nlp.spacy_words_to_string_array(nlp_sent[0])
    sentence = " ".join(words)
    patterns = [['PRP', 'MD', 'VB', 'TO'], ['PRP', 'MD', 'VB', 'DT'], ['PRP', 'VBP', 'TO']]

    extracted_words = extract_by_pattern(patterns, tags, words)
    return extracted_words 
    
def create_meaning_map(grammar):
    prompt_noun_map = {}
    for prompt in grammar:
        try:
            extracted_nouns = []
            for response in grammar[prompt]:
                nlp_s = nlp.nlp_sentence(response)
                nouns = extract_key_nouns(nlp_s)
                if len(nouns) > 0:
                    extracted_nouns.extend(nouns)
            prompt_noun_map[prompt] = list(set(extracted_nouns))
        except:
            print("error")
            print(prompt)
    return prompt_noun_map

prompt_noun_map = create_meaning_map(grammar)

## Extract Aux verbs from total reference grammar

In [36]:
def aux_extractor():
    found_aux_verb = []
    for key in grammar:
        for item in grammar[key]:
            nlp_s = nlp.nlp_sentence(item)
            aux_verb = generalise_aux_verb(nlp_s)
            if len(aux_verb) > 0:
                found_aux_verb.extend(aux_verb)
    print(len(set(found_aux_verb)))
    list(set(found_aux_verb))
generalise_aux_verb(nlp.nlp_sentence("I wish to pay by card"))

['I wish to']

In [37]:
def extract_nouns_total(prompts, grammar):
    nouns = []
    dt_nouns = []
    for prompt in prompts:
        for response in grammar[prompt]:
            string_it =" ".join(extract_key_nouns(nlp.nlp_sentence(response)))
            if "ma " in string_it:
                print(response)
                print(prompt)
            else:
                nouns.extend(extract_key_nouns(nlp.nlp_sentence(response)))   
                dt_nouns.extend(extract_key_dt_nouns(nlp.nlp_sentence(response)))   

    generated_nouns = []
    for noun in list(set(nouns)):
        gen = generate_dt_nouns_by_key_nouns(nlp.nlp_sentence(noun))
        if len(gen) > 0:
            generated_nouns.extend(gen)

    return list(set(nouns)), list(set(dt_nouns)), list(set(generated_nouns))

## Helper Method

In [6]:
def get_transcript_processed_unique(item):
    transcript = item["transcript"]
    processed =item["processed"]
    unique_sentence = item["unique"]
            
    return transcript, processed, unique_sentence

# Apply Reference Grammar And Preprocessing And Unqiue

In [7]:
false_counter = 0
correct_counter = 0

false_prompts = {}
safe_prompts = []
for prompt_unit in test_data:
    
    for dict_prompt in test_data[prompt_unit]: 
        transcript = dict_prompt['transcript']
        sentence = transcript
        if "***" in sentence:
            sentence = sentence.replace("***", "")
        
        processed = sentence
        
        
        try:
            processed = string_hand.clear_sentence(transcript)
        except:
            print(dict_prompt)


        unique_sentence = string_hand.get_unique_sentence(sentence)
        if sentence not in grammar[prompt_unit] and  \
            processed not in grammar[prompt_unit] and \
            unique_sentence not in grammar[prompt_unit]:
                
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit),"transcript": transcript, "processed": processed, "unique": unique_sentence}
            false_counter += 1
            if prompt_unit in false_prompts:
                false_prompts[prompt_unit].append(item)
            else:
                arr = []
                arr.append(item)
                false_prompts[prompt_unit] = arr
            #false_prompts[dict_prompt['id']] = 
                #writer.write(prompt_unit + "\t" + sentence['transcript'] + "\n")
        else:
            item = {"id": dict_prompt['id'], "prompt": str(prompt_unit), "transcript": transcript, "processed": processed, "unique": unique_sentence, "method": "RG", "language": True, "meaning": True}
            safe_prompts.append(item)
print("Correct: %s" % str(len(safe_prompts)))
print("False: %s" % str(false_counter))

Correct: 578
False: 418


In [8]:
len(safe_prompts)

578

# Cluster Approach

In [9]:
def count_false_items():
    counter = 0
    for key in false_prompts:
        counter += len(false_prompts[key])
    return counter

In [10]:
count_false_items()

418

In [11]:
print(len(safe_prompts))

578


## Credit Card Cluster

In [12]:
import scripts.credit_card as credit_card

In [13]:
def meaning_is_correct(prompt_unit, transcript, clear_transcript, unique_sentence):
    nlp_transcript = nlp.nlp_sentence(transcript)
    nlp_clear = nlp.nlp_sentence(transcript)
    nlp_unqiue = nlp.nlp_sentence(unique_sentence)
    extracted_nouns_t = extract_key_nouns(nlp_transcript)
    extracted_nouns_c = extract_key_nouns(nlp_clear)
    extracted_nouns_u = extract_key_nouns(nlp_unqiue)
    
    for noun in extracted_nouns_t:
        try:
            if noun in prompt_noun_map[prompt_unit]:
                return True
        except:
            print(prompt_unit)
            print(prompt_noun_map[prompt_unit])
            print(noun)
    for noun in extracted_nouns_c:
        if noun in prompt_noun_map[prompt_unit]:
            return True
    for noun in extracted_nouns_u:
        if noun in prompt_noun_map[prompt_unit]:
            return True
    return False
        #remove_from_false_prompts(false_prompts, key, item['id'], method="magic", meaning="correct", language="correct")
    

meaning_is_correct("Sag: Ich möchte mit Mastercard bezahlen", "i want pay with the master card", "i want pay with the master card", "i want pay with the master card")
#prompt_noun_map["Sag: Ich möchte mit Kreditkarte bezahlen"]

True

In [14]:
test_prompts = ["Sag: Ich möchte mit Dollars bezahlen",
"Sag: Ich möchte mit Euros bezahlen",
"Sag: Ich möchte mit Kreditkarte bezahlen",
"Sag: Ich möchte mit Mastercard bezahlen",
"Sag: Ich möchte mit Postkarte bezahlen",
"Sag: Ich möchte mit Visa bezahlen",
"Sag: Ich möchte mit Pfund bezahlen",
"Sag: Ich möchte mit Schweizer Franken bezahlen"]

false_credit_card_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [15]:
def remove_from_false_prompts(false_prompts, key, id_, method="credit card cluster", meaning=False, language=False, debug=False):
    if key in false_prompts:
        items = false_prompts[key]
        for index in range(0, len(items)):
            if items[index]['id'] == id_:
                item = {"id": id_, 
                        "prompt": str(key), 
                        "transcript": items[index]['transcript'], 
                        "processed": items[index]["processed"], 
                        "method": method, "language": str(language), 
                        "meaning": str(meaning), 
                        "unique": items[index]["unique"]}

                if debug == False:
                    safe_prompts.append(item)
                    del false_prompts[key][index]
                return item 

In [16]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))

credit_card_items = []
for prompt_unit in test_prompts:
    for item in false_credit_card_prompts[prompt_unit]:
        meaning = False
        language = False
        try:
            processed =item["processed"]
            unique_sentence = item["unique"]
            transcript = item["transcript"]
        except:
            print(unique_sentence)
            continue
        if credit_card.accept_credit_card(transcript) == False and \
            credit_card.accept_credit_card(processed) == False and \
            credit_card.accept_credit_card(unique_sentence) == False:
            
            false_counter += 1
            #remove_from_false_prompts(false_prompts, key, item['id'])
        else:
            #
            
            language = True
        
        if meaning_is_correct(prompt_unit, transcript, processed, unique_sentence):
            meaning = True
            
        
        item = remove_from_false_prompts(false_prompts, prompt_unit, item['id'], meaning=meaning, language=language)
        if item:
            #print(item)
            if item['meaning'] == 'True' and item['language'] == 'True':
                correct_counter += 1
            credit_card_items.append(item)
            
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

162
Correct: 5
False: 44
633


In [17]:
def not_today():
    for item in credit_card_items:
        if item['meaning'] == str(True) and item['language']== str(True):
            print(item['id'] + ";accepted;"  + item['language'] + ";" + item['meaning'])
        else:
            print(item['id'] + ";rejected;"  + item['language'] + ";" + item['meaning'])

## Restarant Cluster

In [22]:
import scripts.restarant as resta

In [23]:
test_prompts = ["Frag: Ich möchte die Rechnung", "Frag: Ich möchte die Dessertkarte"]
false_restarant_prompts = data_hand.get_test_data_by_false_prompts(test_prompts, false_prompts, grammar)

In [24]:
false_counter = 0
correct_counter = 0

print(len(false_prompts))
for key in test_prompts:
    for item in false_restarant_prompts[key]:
        try:
            processed =item["processed"]
            unique_sentence = item["unique"]
            transcript = item["transcript"]
        except:
            continue
        if resta.accept_restarant(transcript) == False and \
            resta.accept_restarant(unique_sentence) == False and \
            resta.accept_restarant(processed) == False:
            false_counter += 1  
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster")
        else:
            remove_from_false_prompts(false_prompts, key, item['id'], method="Restarant Cluster", meaning="correct", language="correct")
            correct_counter += 1
print("Correct: %s" % str(correct_counter))
print("False: %s" % str(false_counter))
print(len(safe_prompts))

162
Correct: 0
False: 13
646


In [25]:
count_false_items()

350

In [26]:
len(safe_prompts)

646

## Tickets Cluster

In [28]:
ticket_key_nouns = ['ticket', 'tickets']

def accept_ticket_meaning(key_nouns_of_prompts, item):
    transcript, processed, unique = get_transcript_processed_unique(item)

    nlp_t = nlp.nlp_sentence(transcript)
    nlp_p = nlp.nlp_sentence(processed)
    nlp_u = nlp.nlp_sentence(unique)
    
    ticket_state = False
    key_noun_state = False
    for ticket_key_noun in ticket_key_nouns:
        if ticket_key_noun in transcript or ticket_key_noun in processed or ticket_key_noun in unique:
            ticket_state = True
            break
    for key_noun in key_nouns_of_prompts:
        if key_noun in transcript or key_noun in processed or key_noun in unique:
            key_noun_state = True
            break
    
    return ticket_state & key_noun_state

In [29]:
def accept_ticket_prompts(false_ticket_prompts, ticket_prompts, key_nouns, debug=False):
    for prompt in false_ticket_prompts:
        if prompt not in ticket_prompts:
            print("this prompt: %s : is not inside the ticket prompts" % prompt)
        for item in false_ticket_prompts[prompt]:
            if debug:
                print(str(accept_ticket_meaning(key_nouns, item)) + "\t" + item['processed'])
            else:
                print(accept_ticket_meaning(key_nouns, item))
    

### King Of Lions

In [30]:
king_of_lions_prompts = [
"Frag: 2 Tickets für König der Löwen",
"Frag trotzdem: ein Ticket für König der Löwen",
"Frag: 3 Tickets für König der Löwen",
"Frag: 4 Tickets für König der Löwen",
"Frag: ein Ticket für König der Löwen"]

king_of_lions_key_nouns = ['the lion king', 'lion king']
false_king_of_lions_prompts = data_hand.get_test_data_by_false_prompts(king_of_lions_prompts, false_prompts, grammar)

In [31]:
accept_ticket_prompts(false_king_of_lions_prompts, king_of_lions_prompts, king_of_lions_key_nouns)

False
False
False
False


### Mamma mia

In [32]:
mamma_mia_prompts = [
"Frag: 4 Tickets für Mamma Mia",
"Frag: ein Ticket für Mamma Mia",
"Frag trotzdem: ein Ticket für Mamma Mia",
"Frag: 2 Tickets für Mamma Mia",
"Frag: 3 Tickets für Mamma Mia"]

mamma_mia_key_nouns = ['mamma mia']
false_mamma_mia_prompts = data_hand.get_test_data_by_false_prompts(mamma_mia_prompts, false_prompts, grammar)

accept_ticket_prompts(false_mamma_mia_prompts, mamma_mia_prompts, mamma_mia_key_nouns)

True
True


### Notting Hill

In [33]:
notting_hill_prompts = ["Frag: ein Ticket nach Notting Hill"]
notting_hill_key_nouns = ["notting hill", "nothing hill"]
false_notting_hill_prompts = data_hand.get_test_data_by_false_prompts(notting_hill_prompts, false_prompts, grammar)

accept_ticket_prompts(false_notting_hill_prompts, notting_hill_prompts, notting_hill_key_nouns)

False
False
False
False
False
True


### National Gallery

In [34]:
national_gallery_prompts = ["Frag: Tickets für die National Gallery"]
notting_hill_key_nouns = ['national gallery', 'gallery']

In [38]:
extract_nouns_total(national_gallery_prompts, grammar)

(['ticket', 'tickets', 'national gallery', 'gallery', 'i'],
 ['a ticket'],
 ['a ticket', 'a gallery', 'the ticket', 'the gallery'])

### Places Green Park / Trafalgar / Picca

In [39]:
place_prompts = ["Frag: ein Ticket zum Green Park", 
                 "Frag: ein Ticket zum Piccadilly Circus", 
                 "Frag: ein Ticket zum Trafalgar Square"]
place_key_nouns = ['trafalgar square', 'green park', 'piccadilly circus']
false_place_prompts = data_hand.get_test_data_by_false_prompts(place_prompts, false_prompts, grammar)
accept_ticket_prompts(false_place_prompts, place_prompts, place_key_nouns, True)

True	can i have a ticket to the green park
True	are i would like a ticket to piccadilly circus
True	a ticket from piccadilly circus
True	like a ticket to piccadilly circus
False	tickets to the have bill three tickets
False	i want a ticket to pick is okay tickets
True	i want a ticket to piccadilly circus tickets
True	can i have a ticket to the trafalgar square


In [40]:
false_place_prompts['Frag: ein Ticket zum Piccadilly Circus'][3]

{'id': '4308',
 'processed': 'tickets to the have bill three tickets',
 'prompt': 'Frag: ein Ticket zum Piccadilly Circus',
 'transcript': 'tickets to the please have bill three tickets',
 'unique': 'tickets to the please have bill three'}

In [41]:
nlp.nlp_sentence("can i have a ticket to trafalgar square")

([can, i, have, a, ticket, to, trafalgar, square],
 ['can', 'i', 'have', 'a', 'ticket', 'to', 'trafalgar', 'square'],
 ['MD', 'PRP', 'VB', 'DT', 'NN', 'IN', 'VB', 'JJ'],
 ['MD', 'PRP', 'VB', 'DT', 'NN', 'IN', 'VB', 'JJ'])

In [42]:
extract_nouns_total(notting_hill_prompts, grammar)

(['ticket'], ['a ticket'], ['a ticket', 'the ticket'])

### Abend Cluster

In [43]:
montag_prompts = [
    "Frag: 2 Tickets für Montagabend",
    "Frag: 3 Tickets für Montagabend",
    "Frag: 4 Tickets für Montagabend",
    "Frag: ein Ticket für Montagabend",
    "Frag: Tickets für Montagabend"]
montag_key_nouns = ['monday', 'monday night', 'monday evening']

dienstag_prompts = [
    "Frag: ein Ticket für Dienstagabend", 
    "Frag: Tickets für Dienstagabend",
    "Frag: 2 Tickets für Dienstagabend", 
    "Frag: 3 Tickets für Dienstagabend", 
    "Frag: 4 Tickets für Dienstagabend"]
dienstag_key_nouns = ['tuesday', 'tuesday evening', 'tuesday night']

mittwoch_prompts = [
    "Frag: 2 Tickets für Mittwochabend",
    "Frag: 3 Tickets für Mittwochabend",
    "Frag: 4 Tickets für Mittwochabend",
    "Frag: ein Ticket für Mittwochabend",
    "Frag: Tickets für Mittwochabend"]
mittwoch_key_nouns = ['wednesday evening', 'wednesday night', 'wednesday']

donnerstag_prompts = [
    "Frag: 2 Tickets für Donnerstagabend",
    "Frag: 3 Tickets für Donnerstagabend",
    "Frag: 4 Tickets für Donnerstagabend",
    "Frag: ein Ticket für Donnerstagabend",
    "Frag: Tickets für Donnerstagabend"]
donnerstag_key_nouns = ['thursday', 'thursday evening', 'thursday night']

freitag_prompts =[
    "Frag: 2 Tickets für Freitagabend",
    "Frag: 3 Tickets für Freitagabend",
    "Frag: 4 Tickets für Freitagabend",
    "Frag: ein Ticket für Freitagabend",
    "Frag: Tickets für Freitagabend"]
freitag_key_nouns = ['friday', 'friday night', 'friday evening']

samstag_prompts = [
    "Frag: 2 Tickets für Samstagabend",
    "Frag: 3 Tickets für Samstagabend",
    "Frag: 4 Tickets für Samstagabend",
    "Frag: ein Ticket für Samstagabend",
    "Frag: Tickets für Samstagabend"]
samstag_key_nouns = ['saturday', 'saturday night', 'saturday evening']

sonntag_prompts = [
    "Frag: 2 Tickets für Sonntagabend",
    "Frag: 3 Tickets für Sonntagabend",
    "Frag: 4 Tickets für Sonntagabend",
    "Frag: ein Ticket für Sonntagabend",
    "Frag: Tickets für Sonntagabend"]
sonntag_key_nouns = ['sunday', 'sunday night', 'sunday evening']

heuteabend_prompts = [
    "Frag: 2 Tickets für heute Abend",
    "Frag: 3 Tickets für heute Abend",
    "Frag: 4 Tickets für heute Abend",
    "Frag: ein Ticket für heute Abend",
    "Frag: Tickets für heute Abend"]
heuteabend_key_nouns = ['tonight', 'evening']

morgenabend_prompts = [
    "Frag: 2 Tickets für morgen Abend",
    "Frag: 3 Tickets für morgen Abend",
    "Frag: 4 Tickets für morgen Abend",
    "Frag: ein Ticket für morgen Abend",
    "Frag: Tickets für morgen Abend"]
morgenabend_key_nouns = ['tomorrow night', 'tomorrow evening', 'tomorrow']

In [44]:
extract_nouns_total(morgenabend_prompts, grammar)

(['tomorrow evening',
  'tomorrow',
  'one ticket',
  'tickets',
  'tomorrow night',
  'one',
  'ticket'],
 ['a ticket', 'the evening'],
 ['a ticket',
  'the tomorrow',
  'a tomorrow night',
  'the tomorrow evening',
  'a tomorrow',
  'a tomorrow evening',
  'the tomorrow night',
  'the ticket'])

### Ticket Cluster Pipeline / Meaning Pipeline

In [45]:
def prompt_in_ticket_cluster(prompt):
    if prompt in mamma_mia_prompts:
        return True
    elif prompt in king_of_lions_prompts:
        return True
    elif prompt in notting_hill_prompts:
        return True
    elif prompt in place_prompts:
        return True
    elif prompt in montag_prompts:
        return True
    elif prompt in dienstag_prompts:
        return True
    elif prompt in mittwoch_prompts:
        return True
    elif prompt in donnerstag_prompts:
        return True
    elif prompt in freitag_prompts:
        return True
    elif prompt in samstag_prompts:
        return True
    elif prompt in sonntag_prompts:
        return True
    elif prompt in heuteabend_prompts:
        return True
    elif prompt in morgenabend_prompts:
        return True
    elif prompt in national_gallery_prompts:
        return True
    return False
        
def ticket_cluster_pipeline(prompt, item):
    if prompt in mamma_mia_prompts:
        return accept_ticket_meaning(mamma_mia_key_nouns, item)
    elif prompt in king_of_lions_prompts:
        return accept_ticket_meaning(king_of_lions_key_nouns, item)
    elif prompt in notting_hill_prompts:
        return accept_ticket_meaning(notting_hill_key_nouns, item)
    elif prompt in place_prompts:
        return accept_ticket_meaning(place_key_nouns, item)
    elif prompt in montag_prompts:
        return accept_ticket_meaning(montag_key_nouns, item)
    elif prompt in dienstag_prompts:
        return accept_ticket_meaning(dienstag_key_nouns, item)
    elif prompt in mittwoch_prompts:
        return accept_ticket_meaning(mittwoch_key_nouns, item)
    elif prompt in donnerstag_prompts:
        return accept_ticket_meaning(donnerstag_key_nouns, item)
    elif prompt in freitag_prompts:
        return accept_ticket_meaning(freitag_key_nouns, item)
    elif prompt in samstag_prompts:
        return accept_ticket_meaning(samstag_key_nouns, item)
    elif prompt in sonntag_prompts:
        return accept_ticket_meaning(sonntag_key_nouns, item)
    elif prompt in heuteabend_prompts:
        return accept_ticket_meaning(heuteabend_key_nouns, item)
    elif prompt in morgenabend_prompts:
        return accept_ticket_meaning(morgenabend_key_nouns, item)
    elif prompt in national_gallery_prompts:
        return accept_ticket_meaning(notting_hill_key_nouns, item)
    return False

In [47]:
test_prompt = 'Frag: ein Ticket für Mamma Mia'
test_item = false_mamma_mia_prompts[test_prompt][0]
print(test_item)
if prompt_in_ticket_cluster(test_prompt) == True:
    meaning = ticket_cluster_pipeline(test_prompt, test_item)
    print(meaning)

{'prompt': 'Frag: ein Ticket für Mamma Mia', 'id': '3838', 'unique': 'i want a ticket for mamma mia', 'processed': 'i want a ticket for mamma mia', 'transcript': 'i want a ticket for mamma mia'}
True


# Magic

In [49]:
import copy

In [50]:
def insert(tree, key, value):
    #print(key)
    if key:
        first, rest = key[0], key[1:]
        if first not in tree:
            tree[first] = {}
        insert(tree[first], rest, value)
    else:
        tree['key'] = True

In [51]:
tree = {}
for prompt_unit in grammar:
    
    for response in grammar[prompt_unit]:
        tags = nlp.nlp_sentence(response)[2]
        insert(tree, tags, "true")

In [52]:
def existintree(tree, array, rest_tree):
    if len(array) == 0:
        return False
    if array[0] not in rest_tree:
        return False
    if array[0] in rest_tree:
        if 'key' in rest_tree[array[0]] and len(array) == 1:
            return True
    #print(tree[array[0]])
    
    return existintree(tree, array[1:], rest_tree[array[0]])

In [82]:
def sing_plu(nlp_sent):
    words = nlp_sent[0]
    tags = nlp_sent[2]
    if 'CD' in tags:
        index_cd = tags.index('CD')
        if str(words[index_cd]) == 'one':
            if 'NNS' not in tags:
                return True
            else:
                return False
        else:
            if len(words) > index_cd + 1 and str(tags[index_cd+1]) == 'NNS':
                return True
            
            elif 'NN' not in tags:
                return True
            else:
                return False 
    return True

def iter_items(prompts, prompt_unit, debug=False):
    magic_accepted_prompts = []
    counter = 0
    
    for prompt in prompts:
        processed = prompt["processed"]
        transcript = prompt["transcript"]
        id_ = prompt["id"]
        nlp_processed = nlp.nlp_sentence(processed)
        nlp_processed_tags = nlp_processed[2]
        nlp_transcript = nlp.nlp_sentence(transcript)
        nlp_transcript_tags = nlp_transcript[2]
        
        meaning = False
        meaning_reject = False
        if prompt_in_ticket_cluster(prompt_unit) == True:
            meaning = ticket_cluster_pipeline(prompt_unit, prompt)
            if meaning == False:
                meaning_reject = True
            
        if existintree(tree, nlp_processed_tags, copy.deepcopy(tree)) == True or \
        existintree(tree, nlp_transcript_tags, copy.deepcopy(tree)) == True:
            if sing_plu(nlp_processed) == True or sing_plu(nlp_transcript) == True:
                item = {"id": id_ , "transcript": transcript, "processed": processed, "language":True, "meaning":meaning}  

                if meaning_reject == True:
                    item = remove_from_false_prompts(false_prompts, prompt_unit, item['id'], method="meaning_reject", meaning=meaning, language=True, debug=debug)
                    print("meaning reject \t id: %s" % str(item['id']))
                    continue
                else:
                    if meaning == True:
                        item = remove_from_false_prompts(false_prompts, prompt_unit, item['id'], method="magic", meaning=meaning, language=True, debug=debug)

                        counter += 1
                    else:
                        extracted_nouns = extract_key_nouns(nlp_processed)
                        for noun in extracted_nouns:
                            if noun in prompt_noun_map[prompt_unit]:
                                #print(item)
                                counter += 1
                                item = remove_from_false_prompts(false_prompts, prompt_unit, item['id'], method="magic", meaning=True, language=True, debug=debug)
                                break
                magic_accepted_prompts.append(item)
                        
    return magic_accepted_prompts, counter

In [83]:
correct_counter = 0
magic_accepted_prompts_map = {}
debug = False
for key in false_prompts:
    
    if debug == False:
        accepted_prompts, counter = iter_items(false_prompts[key], key)
        magic_accepted_prompts_map[key] = accepted_prompts
        correct_counter += counter
    else:
        accepted_prompts, counter = iter_items(false_prompts[key], key, debug)
        correct_counter += counter
print(correct_counter)

meaning reject 	 id: 3628
meaning reject 	 id: 4542
meaning reject 	 id: 4294
13


In [55]:
with open("annotated_kaldi_data_v8.csv", "w") as writer:
    for item in safe_prompts:
        accepted = False
        if str(item['language']) == str(True) and str(item['meaning']) == str(True): 
            accepted = True
        writer.write(item['id'] + "\t" + item['method'] + "\t" + item["prompt"] + "\t" +item['processed'] + "\t" + str(item['language'])+ "\t" + str(item['meaning']) + "\t" + str(accepted)+ "\n" )
    for key in false_prompts:
        for item in false_prompts[key]:
            writer.write(item['id'] + "\t" + "no method" + "\t"+ key + "\t" + item['processed'] + "\t" +'FALSE'+ "\t" + 'FALSE' + "\t" + "FALSE" + "\n" )

In [80]:
def read_attributes(item):
    split = item.replace("\n", "").split("\t")
    id_ = split[0]
    method_ = split[1]
    prompt_ = split[2]
    transcript_ = split[3]
    language_ = split[4]
    meaning_ = split[5]
    judgement_ = split[6]
    return id_, {
        "method":method_, 
        "prompt":prompt_, 
        "transcript":transcript_, 
        "language": language_, 
        "meaning":meaning_, 
        "judgement":judgement_}

def create_id_map_from_generated_data(file):
    data = None
    data_map = {}
    with open(file, "r") as reader1:
        data = reader1.readlines()
        for dat in data:
            id_, item = read_attributes(dat)
            data_map[id_] = item
    return data_map
            
def compare_results(this_results, last_results):
    for id_ in this_results:
        if id_ not in last_results:
            print("ERROR: Why is this id %s not inside the old results?" % id_)
        difference_string = "%s \n" % str(id_)
        difference= False
        if str(this_results[id_]['judgement']) != str(last_results[id_]['judgement']):
            difference_string += "Different judgement: NOW: %s, OLD: %s \n" % (this_results[id_]['judgement'],  str(last_results[id_]['judgement']))
            difference = True
        if str(this_results[id_]['transcript']) != str(last_results[id_]['transcript']):
            difference_string += "Different Transcripts: \nNOW: %s \nOLD: %s \n" % (this_results[id_]['transcript'],  str(last_results[id_]['transcript']))
            difference = True
        
        if str(this_results[id_]['meaning']) != str(last_results[id_]['meaning']):
            difference_string += "Different Meaning: NOW: %s, OLD: %s \n" % (this_results[id_]['meaning'],  str(last_results[id_]['meaning']))
            difference = True
            
        if str(this_results[id_]['language']) != str(last_results[id_]['language']):
            difference_string += "Different Language: NOW: %s, OLD: %s \n" % (this_results[id_]['language'],  str(last_results[id_]['language']))
            difference = True
            
        if difference:
            print(difference_string)
            
this_results = create_id_map_from_generated_data("results/annotated_kaldi_data_v8.csv")
old_results = create_id_map_from_generated_data("results/annotated_kaldi_data_v7.csv")
compare_results(this_results, old_results)

3605 
Different judgement: NOW: True, OLD: FALSE 
Different Meaning: NOW: True, OLD: FALSE 
Different Language: NOW: True, OLD: FALSE 

4532 
Different judgement: NOW: False, OLD: True 
Different Meaning: NOW: False, OLD: True 

3583 
Different judgement: NOW: False, OLD: True 
Different Meaning: NOW: False, OLD: True 

