In [170]:
from datasets import load_dataset

# Load the "multi_woz_v22" dataset and bind one name per split.
dataset = load_dataset("multi_woz_v22")
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [171]:
def filterDomains(data):
    """
    Select the dialogues that involve exactly one allowed service.

    An entry is kept when its "services" sequence has a single element and
    every element belongs to {"restaurant", "hotel", "booking"}.

    Args:
        data: iterable of dialogue entries, each exposing a "services" sequence.

    Returns:
        A new list with the matching entries, in their original order.
    """
    allowed_services = {"restaurant", "hotel", "booking"}

    kept = []
    for entry in data:
        services = entry["services"]
        if set(services) <= allowed_services and len(services) == 1:
            kept.append(entry)
    return kept
# Only keep single-service dialogues whose service is restaurant, hotel or booking.

# Apply the same domain filter to every split.
train_data_filtered, val_data_filtered, test_data_filtered = (
    filterDomains(split) for split in (train_data, val_data, test_data)
)

In [172]:
def add_data_to_be_retrieved(dataset, print_dialogue=False):
    """
    Annotate every dialogue, in place, with the slots the system had to look up.

    For each dialogue, ``dialogue["turns"]["to_be_retrieved_ground_truth"]`` is
    set to a dict mapping every turn index to a sorted list of unique
    "<domain>-<slot>" names. Only turns with a truthy speaker flag (treated as
    SYSTEM turns) are filled in; all other turns keep an empty list.

    Heavily inspired by the code from the evaluation script.

    Args:
        dataset: iterable of dialogue dicts; each ``dialogue["turns"]`` must
            provide "turn_id", "speaker", "dialogue_acts" and "frames"
            (and "utterance" when ``print_dialogue`` is True).
        print_dialogue: when True, print each processed utterance together
            with its computed list, and a separator line after every dialogue.

    Returns:
        None. The dialogues are modified in place.
    """
    for dialogue in dataset:
        turns = dialogue["turns"]
        turn_count = len(turns["turn_id"])
        retrieved_per_turn = {index: [] for index in range(turn_count)}
        turns["to_be_retrieved_ground_truth"] = retrieved_per_turn

        for index in range(turn_count):
            # Only SYSTEM turns (truthy speaker flag) are annotated.
            if not turns["speaker"][index]:
                continue

            dialog_act = turns['dialogue_acts'][index]['dialog_act']
            names_by_act = [slots['slot_name'] for slots in dialog_act['act_slots']]
            values_by_act = [slots['slot_value'] for slots in dialog_act['act_slots']]
            act_types = dialog_act['act_type']
            bookable = [service for service in turns['frames'][index]['service']
                        if service in ["hotel", "restaurant"]]

            collected = set()
            for act_i, names in enumerate(names_by_act):
                domain = act_types[act_i].split("-")[0].lower()
                # A generic "booking" act is attributed to the one bookable
                # service of the turn, when there is exactly one.
                if domain == "booking" and len(bookable) == 1:
                    domain = bookable[0]

                qualified = []
                for slot_i, name in enumerate(names):
                    # Skip requested slots ("?") and placeholder slot names.
                    if values_by_act[act_i][slot_i] == "?" or name == "none":
                        continue
                    qualified.append(domain + "-" + name)
                if not qualified:
                    continue

                # Any qualifying slot (even "choice") implies an availability
                # lookup; the "choice" slot itself is never kept.
                choice_key = domain + "-choice"
                collected.add("%s-availability" % (domain))
                collected.update(key for key in qualified if key != choice_key)

            retrieved = sorted(collected)

            # augment the dataset
            retrieved_per_turn[index].extend(retrieved)

            if print_dialogue:
                print(f"Utterance: {turns['utterance'][index]}")
                print(f"To be retrieved: {retrieved}")

        if print_dialogue:
            print("-"*50)
                       

In [173]:
def add_data_to_be_provided(dataset):
    """
    Annotate every dialogue, in place, with the information the system provided.

    For each dialogue, ``dialogue["turns"]["to_be_provided_overall"]`` is set to
    a dict mapping every turn index to a sorted list of unique
    "<domain>-<slot>:<value>" strings, plus a "<domain>-availability:yes" or
    "<domain>-availability:no" marker. Only turns with a truthy speaker flag
    (treated as SYSTEM turns) are filled in; other turns keep an empty list.

    Heavily inspired by the code from the evaluation script.

    Args:
        dataset: iterable of dialogue dicts; each ``dialogue["turns"]`` must
            provide "turn_id", "speaker", "dialogue_acts" and "frames".

    Returns:
        None. The dialogues are modified in place.
    """
    eligible_domains = ["hotel", "restaurant", "booking", "general"]

    for dialogue in dataset:
        turns = dialogue["turns"]
        turn_count = len(turns["turn_id"])
        provided_per_turn = {index: [] for index in range(turn_count)}
        turns["to_be_provided_overall"] = provided_per_turn

        for index in range(turn_count):
            # Only SYSTEM turns (truthy speaker flag) are annotated.
            if not turns["speaker"][index]:
                continue

            dialog_act = turns['dialogue_acts'][index]['dialog_act']
            names_by_act = [slots['slot_name'] for slots in dialog_act['act_slots']]
            values_by_act = [slots['slot_value'] for slots in dialog_act['act_slots']]
            act_types = dialog_act['act_type']
            bookable = [service for service in turns['frames'][index]['service']
                        if service in ["hotel", "restaurant"]]

            collected = set()
            for act_i, names in enumerate(names_by_act):
                act_type = act_types[act_i]
                domain = act_type.split("-")[0].lower()
                # A generic "booking" act is attributed to the one bookable
                # service of the turn, when there is exactly one.
                if domain == "booking" and len(bookable) == 1:
                    domain = bookable[0]
                if domain not in eligible_domains:
                    continue

                pairs = []
                for slot_i, name in enumerate(names):
                    value = values_by_act[act_i][slot_i]
                    # Skip requested slots ("?") and placeholder slot names.
                    if value == "?" or name == "none":
                        continue
                    pairs.append(domain + "-" + name + ":" + value)

                if "-No" in act_type:
                    # Negative acts are recorded even when they carry no slots.
                    collected.add("%s-availability:no" % (domain))
                    collected.update(pairs)
                elif pairs and any(pair.split(":")[0] != domain + "-none" for pair in pairs):
                    collected.add("%s-availability:yes" % (domain))
                    collected.update(pairs)

            provided = sorted(collected)

            # Within one turn, "availability:no" overrides "availability:yes"
            # for the same prefix.
            overridden = {entry[:-2] + "yes" for entry in provided
                          if entry.endswith("availability:no")}
            provided = [entry for entry in provided if entry not in overridden]

            provided_per_turn[index].extend(provided)

In [175]:
# Both helpers annotate the dialogues in place: they add the
# "to_be_provided_overall" and "to_be_retrieved_ground_truth" entries to each
# dialogue's "turns" dict and return nothing.
add_data_to_be_provided(train_data_filtered)
add_data_to_be_retrieved(train_data_filtered)

In [180]:

# Print the utterance and both annotations of the turn at index 1 for the
# first ten filtered training dialogues.
# NOTE: despite its name, turn_id indexes dialogues here, not turns.
for turn_id in range(10):
    sample_turns = train_data_filtered[turn_id]["turns"]
    print(sample_turns["utterance"][1])
    print("To be provided: ", sample_turns["to_be_provided_overall"][1], sep="\n")
    print("To be retrieved: ", sample_turns["to_be_retrieved_ground_truth"][1], sep="\n")
    print("-"*50)

I have 4 different options for you. I have two cheaper guesthouses and two expensive hotels. Do you have a preference?
To be provided: 
['hotel-availability:yes', 'hotel-choice:4', 'hotel-pricerange:cheaper', 'hotel-pricerange:expensive', 'hotel-type:guesthouses', 'hotel-type:hotels']
To be retrieved: 
['hotel-availability', 'hotel-pricerange', 'hotel-type']
--------------------------------------------------
I've heard good things about the lucky star. Need a reservation?
To be provided: 
['restaurant-availability:yes', 'restaurant-name:the lucky star']
To be retrieved: 
['restaurant-availability', 'restaurant-name']
--------------------------------------------------
Eraina is a European restaurant in the centre area. Their address is St. Michael's Church Trinity Street City Centre and phone number 01223 355166.
To be provided: 
["restaurant-address:St. Michael's Church Trinity Street City Centre", 'restaurant-area:centre', 'restaurant-availability:yes', 'restaurant-food:European', 're

# 1. Predict what shall be retrieved

#### Idea: Predict what the agent should retrieve based on the user utterance, dialogue act and the previous agent utterance.
    

# 2. Predict dialogue acts of the agent

#### We can and should use the results of the retrieval, i.e. the slots we requested in the first step together with their retrieved values (including "yes"/"no" for the availability slot).

# 3. Predict what information shall be requested

#### Once again, this uses the results of the retrieval and, most likely, the user's utterance and dialogue act.