In [56]:
import os
import re
import ast
import json
import string
import openai
import numpy as np
import pandas as pd

from tqdm import tqdm
from langchain import PromptTemplate

from src.DST.evaluate_utils import remapping
from src.DST.dst import SLOTS_DESCRIPTIONS, SLOTS_REVERSE_REMAPPING
from src.config import CONFIG

from dataclasses import dataclass, field
from typing import Optional
from transformers import TrainingArguments
from src.DST.evaluate_utils import unpack_belief_states, fix_typos, nested_fix, remapping


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)



@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to utilize.
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the HuggingFace model."}
    )
    model_name_or_path_agent: Optional[str] = field(
        default=None,
        metadata={"help": "The path of the HuggingFace model for the agent"}
    )
    use_int8: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use int8 model or not."}
    )
    use_deepspeed: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use deepspeed model or not."}
    )
    
    

@dataclass
class DataArguments:
    """
    Arguments pertaining to the data loading and preprocessing pipeline.
    """
    dataset_name: Optional[str] = field(
        default=None,
        metadata={"help": "Train dataset path"}
    )
    dataset_names: Optional[str] = field(
        default=None,
        metadata={"help": "Train dataset paths"}
    )
    root_data_path: Optional[str] = field(
        default="./data", metadata={"help": "The path to the data directory."},
    )
    mwoz_path: Optional[str] = field(
        default="/home/willy/instructod/MultiWOZ_2.1/",
        metadata={"help": "MWOZ path"}
    )
    dialog_history_limit_dst: Optional[int] = field(
        default=0,
        metadata={"help": "Lenght of dialogue history for dst"}
    )
    dialog_history_limit_dst_recorrect: Optional[int] = field(
        default=0,
        metadata={"help": "Lenght of dialogue history for dst update"}
    )
    dialog_history_limit_rg: Optional[int] = field(
        default=20,
        metadata={"help": "Lenght of dialogue history for response generation"}
    )
    dialog_history_limit_e2e: Optional[int] = field(
        default=20,
        metadata={"help": "Lenght of dialogue history for e2e"}
    )
    single_domain_only: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to keep only the single domain sample or not"}
    )
    with_slot_description: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to use slot description or not for DST"}
    )
    with_slot_domain_diff: Optional[bool] = field(
        default=False,
        metadata={"help": "differentiation between slot and domain"}
    )
    with_all_slots: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to use all slots or not"}
    )
    debug_mode: Optional[bool] = field(
        default=False,
        metadata={"help": "debug mode to only try 20 samples"}
    )
    start_idx: Optional[int] = field(
        default=0,
        metadata={"help": "Starting index to restart the prediction if needed"}
    )
    save_path: Optional[str] = field(
        default="results/",
        metadata={"help": "save path"}
    )
    save_every: Optional[int] = field(
        default=5,
        metadata={"help": "every step to save in case api fail"}
    )
    db_format_type: Optional[str] = field(
        default="1",
        metadata={"help": "1 is more precise, 2 is more concise for db integration"},
    )
    load_path: Optional[str] = field(
        default="results/",
        metadata={"help": "load path"}
    )
    agent_max_iterations: Optional[int] = field(
        default=5,
        metadata={"help": "Max number of iterations for agents in e2e (higher=better but more expensive)"}
    )
    verbose: Optional[bool] = field(
        default=False,
        metadata={"help": "verbosity for agent call in database retrieval"}
    )
    do_inference: Optional[bool] = field(
        default=False,
        metadata={"help": "use to do inference with the e2e agent setting"}
    )
    accumulate_bs: Optional[bool] = field(
        default=False,
        metadata={"help": "evaluation setting to accumulate all turn-level bs"}
    )   
    with_slot_filtering: Optional[bool] = field(
        default=False,
        metadata={"help": "slot filtering during DST eval (filter non-existent slots)"}
    )   
    

@dataclass
class PromptingArguments(TrainingArguments):
    """
    Arguments pertraining to the prompting pipeline.
    """
    output_dir: Optional[str] = field(
        default="./out",
        metadata={"help": "Output directory"},
    )
    task: Optional[str] = field(
        default="dst",
        metadata={"help": "Task to perform"}
    )
    max_requests_per_minute: Optional[int] = field(
        default=20,
        metadata={"help": "Max number of requests for OpenAI API."}
    )
    openai_api_key_name: Optional[str] = field(
        default="OPENAI_API_KEY",
        metadata={"help": "OpenAI API key name."}
    )

class PromptConstructor():
    def __init__(self, 
                 config):
        self.config = config
        self.instructions = config["INSTRUCTIONS"]
        self.prompt_templates = config["PROMPT_TEMPLATES"]
        self.examples = config["EXAMPLES"]
        
    def _get_slots_from_domains(self, domains, ontology, with_slot_description, with_all_slots, with_slot_domain_diff):
        
        if with_all_slots:
            domains = ["restaurant", "train", "attraction", "hotel", "taxi"]
        
        slots = []
        for slot in list(ontology.keys()):
            splitted_slot = slot.split("-")
            if splitted_slot[0] in domains:
                if with_slot_domain_diff:
                    if splitted_slot[-1] not in slots:
                        slots.append(splitted_slot[-1])
                else:
                    slots.append(splitted_slot[0] + "-" + splitted_slot[-1])
        
        slots_info = []
        added_slots = []
        if with_slot_description:
            for slot in slots:
                splitted_slot = slot.split("-")
                if with_slot_domain_diff:
                    if slot in added_slots:
                        continue
                    slots_info.append(f"name: {slot}, description: {SLOTS_DESCRIPTIONS[slot.lower()]}")
                    added_slots.append(slot)
                else:
                    slots_info.append(f"name: {slot}, description: {SLOTS_DESCRIPTIONS[splitted_slot[1].lower()]}")

                    
            slots = slots_info
        
        slots_prompt = "\n".join(slots)
        if with_slot_domain_diff:
            return slots_prompt + f"\n\nDOMAINS: {', '.join(domains)}"
        else:
            return slots_prompt
            
                
                
#         if with_all_slots:
#             domains = "all"
        
#         if with_slot_description:
#             with_req_inf_differentiation = False #Slot description is the discriminator

#         if domains == "all":
#             if with_req_inf_differentiation:
#                 req_slots = ", ".join(self.config["multiwoz21"]["all_requestable_slots"])
#                 inf_slots = ", ".join(self.config["multiwoz21"]["all_informable_slots"])
#             else:
#                 slots = set(self.config["multiwoz21"]["all_requestable_slots"] + 
#                             self.config["multiwoz21"]["all_informable_slots"])
#                 slots = ", ".join(slots)
#         elif not isinstance(domains, list):
#             raise ValueError("""Provided domain should be either 'all' or list of valid domain names:
#                                 - for multiwoz2.1 and 2.4: taxi, restaurant, hotel, train, attraction""")
#         else:
#             req_slots = ""
#             inf_slots = ""
#             domain_req_slots = []
#             domain_inf_slots = []
#             for domain in domains:
#                 domain_req_slots += self.config["multiwoz21"]["requestable_slots"][domain]
#                 domain_inf_slots += self.config["multiwoz21"]["informable_slots"][domain]
#             if with_req_inf_differentiation:
#                 domain_req_slots = set(domain_req_slots)
#                 domain_inf_slots = set(domain_inf_slots)
#                 req_slots += ", ".join(domain_req_slots)
#                 inf_slots += ", ".join(domain_inf_slots)
#             else:
#                 slots = set(domain_req_slots + domain_inf_slots)
#                 slots = ", ".join(slots)

#         if with_req_inf_differentiation:
#             slots_info = f"Requestable slots: {req_slots}\nInformable slots: {inf_slots}"
#         else:
#             slots_info = f"{slots}"

#         if with_slot_description:
#             slots = slots.split(", ")
#             slots_info = ""
#             for slot in slots:
#                 if slot not in self.config["multiwoz21"]["all_informable_slots"]:
#                     continue
#                 slots_info += f"name: {slot}, description: {SLOTS_DESCRIPTIONS[slot]}\n"
#             slots_info = slots_info[:-2]
        
#         return slots_info
    
    
    def _build_prompt(self, mode="", example="", dialogue_context="", ontology="", slots="", dialogue_acts="", belief_states="", database=""):
        prompt = ""
        if mode == "dst":
            instruction = self.instructions["instruction_with_slots"]
            template_variables = self.prompt_templates["template_with_slots"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                     slots=slots,
                                     example=example,
                                     dialogue_context=dialogue_context)
            
        elif mode == "dst_recorrect":
            instruction = self.instructions["instruction_with_slots_recorrect"]
            template_variables = self.prompt_templates["template_with_slots_recorrect"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])            
            prompt = template.format(instruction=instruction,
                                    slots=slots,
                                    dialogue_context=dialogue_context,
                                    belief_states=belief_states)
            
        elif mode == "database_query":
            instruction = self.instructions["instruction_query_database"]
            template_variables = self.prompt_templates["template_query_database"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                     belief_states=belief_states)
            
        elif mode == "response_generation":
            example = self.config["EXAMPLES"]["response_generation"]
            
            instruction = self.instructions["instruction_response_generation"]
            template_variables = self.prompt_templates["template_response_generation"]
            template = PromptTemplate(input_variables = template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                     example=example,
                                     dialogue_context=dialogue_context)
        elif mode == "e2e":
            instruction = self.instructions["instruction_e2e"]
            template_variables = self.prompt_templates["template_e2e"]
            template = PromptTemplate(input_variables = template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                     database=database,
                                     dialogue_context=dialogue_context)

        else:
            raise ValueError("'mode' should be one of: [dst, dst_recorrect, database_query, response_generation, e2e]")
        
        return prompt


class MWOZ_Dataset(PromptConstructor):
    def __init__(self,
                 config,
                 data_args):
        PromptConstructor.__init__(self, config)
        self.dataset = {"id":[],
                        "dialogue_id":[],
                        "dialogue_context":[],
                        "turn":[],
                        "prompt_dst":[],
                        "prompt_dst_update":[],
                        "prompt_rg":[],
                        "prompt_e2e":[],
                        "domains":[],
                        "turn_domain":[],
                        "gold_turn_bs":[],
                        "gold_bs":[],
                        "gold_act":[],
                        "gold_response":[],
                        "gold_database_result":[],
                        }
        
        print("Loading data...")
        self.all_data, self.testfiles, self.system_acts, self.ontology = self._get_mwoz_data(data_args.mwoz_path)
        print("Loading databases...")
        self.dbs_lexicalized = self._get_dbs_lexicalized(data_args.mwoz_path, data_args.db_format_type)
        self.idx = 0
        self.dialog_history_limit_dst = data_args.dialog_history_limit_dst
        self.dialog_history_limit_rg = data_args.dialog_history_limit_rg
        self.dialog_history_limit_e2e = data_args.dialog_history_limit_e2e
        self.single_domain_only = data_args.single_domain_only
        self.with_slot_description = data_args.with_slot_description
        self.with_slot_domain_diff = data_args.with_slot_domain_diff
        self.with_all_slots = data_args.with_all_slots
        self.all_domains = ["restaurant", "taxi", "hotel", "train", "attraction"]

        print("Processing mwoz...")
        for sample in tqdm(self.all_data):
            if sample in self.testfiles:
                dialogue_log = self.all_data[sample]["log"]
                self._process_dialogue_log(sample=sample,
                                           dialogue_log=dialogue_log)

        self.dataset = pd.DataFrame(self.dataset)
        if self.single_domain_only:
            for index, row in tqdm(self.dataset.iterrows()):
                if "sng" not in row["dialogue_id"].lower():
                    self.dataset.drop(index, inplace=True)

        for index, row in self.dataset.iterrows():
            if row["turn_domain"] == "":
                self.dataset.loc[index, 'turn_domain'] = row["domains"][0]

                    
    def _get_mwoz_data(self, mwoz_path):
        data_path = os.path.join(mwoz_path, "data.json")
        testListFile_path = os.path.join(mwoz_path, "testListFile.txt")
        system_acts_path = os.path.join(mwoz_path, "system_acts.json")
        ontology_path = os.path.join(mwoz_path, "ontology.json")

        with open(data_path, "r") as f:
            all_data = json.load(f)
            
        with open(testListFile_path, "r") as f:
            testfiles = f.read()
        testfiles = testfiles.split("\n")
        
        with open(system_acts_path, "r") as f:
            system_acts = json.load(f)
            
        with open(ontology_path, "r") as f:
            ontology = json.load(f)
            
        return all_data, testfiles, system_acts, ontology
    
    def _get_dbs_lexicalized(self, mwoz_path, format_type):
        domains = ["restaurant", "hotel", "train", "attraction"]
        keep_data = {"restaurant":["address", "area", "food", "name", "pricerange", "phone", "postcode"],
                    "attraction":["name", "area", "address", "type", "postcode"],
                    "hotel":["name", "address", "area", "phone", "postcode", "pricerange", "stars"],
                    "train":["departure", "destination"]}
        dbs_lexicalized = {}
        for domain in domains:
            db_path = os.path.join(mwoz_path, f"{domain}_db.json")
            with open(db_path, "r") as f:
                db_data = json.load(f)

            db_lexicalized = []
            if format_type == "1":
                for row in db_data:
                    row_keep = []
                    for key in keep_data[domain]:
                        if key in row:
                            row_keep.append(f"{key}: {row[key]}")
                    db_lexicalized.append(", ".join(row_keep))
            
            elif format_type == "2":
                #more concise db to fit in context length limit
                db_lexicalized.append(", ".join(keep_data[domain]))
                for row in db_data:
                    row_keep = []
                    for key in keep_data[domain]:
                        if key in row:
                            row_keep.append(f"{row[key]}")
                    db_lexicalized.append(", ".join(row_keep))
                    # db_lexicalized.append(", ".join([f"{row[key]}" for key in keep[domain]]))
            dbs_lexicalized[domain] = "\n".join(set(db_lexicalized))

        return dbs_lexicalized
    
    def _process_dialogue_log(self, sample, dialogue_log):

        dialog_history_memory_dst = []
        dialog_history_memory_rg = []
        dialog_history_memory_e2e = []
        dialog_history_dst = ""
        dialog_history_rg = ""
        dialog_history_e2e = ""
        turn_domain = ""
        domains = self._get_domains_from_log(dialogue_log)
        slots = self._get_slots_from_domains(domains=domains, 
                                             ontology=self.ontology,
                                             with_slot_description=self.with_slot_description,
                                             with_slot_domain_diff=self.with_slot_domain_diff,
                                             with_all_slots=self.with_all_slots) # or all
        if self.dialog_history_limit_dst == 0:
            example = self.examples["dst_dh0"]
        else:
            example = self.examples["dst_dh-1"]

        for turn_nb, turn in enumerate(dialogue_log):

            if turn_nb % 2 == 0:
                speaker = "USER"
            else:
                speaker = "SYSTEM"
            
            utterance = f"""{speaker}: {turn["text"]}\n"""
            dialog_act = turn["dialog_act"]
            cur_system_act = self.system_acts[sample.split(".")[0]][str((turn_nb//2)+1)]
            
            dialogue_context_dst = dialog_history_dst + utterance
            prompt_dst = self._build_prompt(mode="dst",
                                            slots=slots,
                                            example=example,
                                            dialogue_context=dialogue_context_dst)
            
            lexicalized_act = self._lexicalize_act(cur_system_act)
            dialogue_context_rg = dialog_history_rg + utterance + f"ACT:{lexicalized_act}\nSYSTEM:"
            prompt_rg = self._build_prompt(mode="response_generation",
                                            dialogue_context=dialogue_context_rg)
            
            dialogue_context_e2e = dialog_history_e2e + utterance + "SYSTEM:"
    
            turn_domain = self._get_domain_from_turn(turn_domain, cur_system_act)
            if turn_domain and turn_domain != "taxi":
                database = self.dbs_lexicalized[turn_domain]
            else:
                database = ""
            prompt_e2e = self._build_prompt(mode="e2e",
                                            database=database,
                                            dialogue_context=dialogue_context_e2e).replace("\n\n\n", "\n")

            dialog_history_dst, dialog_history_memory_dst = self._update_dialogue_memory(utterance, 
                                                                                         dialogue_log, 
                                                                                         self.dialog_history_limit_dst, 
                                                                                         dialog_history_memory_dst)
            dialog_history_rg, dialog_history_memory_rg = self._update_dialogue_memory(utterance, 
                                                                                       dialogue_log, 
                                                                                       self.dialog_history_limit_rg,
                                                                                       dialog_history_memory_rg)
            dialog_history_e2e, dialog_history_memory_e2e = self._update_dialogue_memory(utterance, 
                                                                                         dialogue_log, 
                                                                                         self.dialog_history_limit_e2e, 
                                                                                         dialog_history_memory_e2e) 
                
            metadata = turn["metadata"]
            bspn = {}
            if metadata:
                for domain in domains:
                    for k, v in metadata[domain].items():
                        for slot, value in v.items():
                            if isinstance(value, str) and value not in ["", "not mentioned", "none"]:
                                bspn[domain+"-"+slot] = value
            self.idx += 1
            if turn_nb % 2 == 0:
                self.dataset["gold_turn_bs"].append(dialog_act)
                self.dataset["dialogue_context"].append(dialogue_context_dst)
                self.dataset["gold_database_result"].append(None) 
                self.dataset["turn"].append(turn_nb//2)
                self.dataset["domains"].append(domains)
                self.dataset["id"].append(self.idx//2)
                self.dataset["dialogue_id"].append(sample)
                self.dataset["prompt_dst"].append(prompt_dst)
                self.dataset["prompt_dst_update"].append(prompt_dst)
                self.dataset["prompt_rg"].append(prompt_rg)
                self.dataset["prompt_e2e"].append(prompt_e2e)
                self.dataset["turn_domain"].append(turn_domain)
            else:
                self.dataset["gold_response"].append(utterance)
                self.dataset["gold_bs"].append(bspn)
                self.dataset["gold_act"].append(dialog_act)

    def _update_dialogue_memory(self, utterance, dialogue_log, dialog_history_limit, dialog_history_memory):
        if dialog_history_limit != 0:
            if dialog_history_limit == -1:
                dialog_history_limit = len(dialogue_log)
            if len(dialog_history_memory) >= dialog_history_limit:
                dialog_history_memory.pop(0)
            dialog_history_memory.append(utterance)

        dialog_history = "".join(dialog_history_memory)
        return dialog_history, dialog_history_memory
    
    def _lexicalize_act(self, act):
        if act == "No Annotation":
            return "None"
        
        lexicalized_acts = []
        lexicalize_mapping = {"leave": "leave time",
                              "arrive":"arrival time",
                              "departure":"departure place",
                              "post":"postcode",
                              "addr":"address"}

        for act, slot_values in act.items():


            if "request" in act.lower():
                requests = []
                for (slot, value) in slot_values:
                    slot = slot.lower()
                    if slot in lexicalize_mapping:
                        slot = lexicalize_mapping[slot]
                    if slot == "none":
                        break
                    else:
                        requests.append(slot)
                if requests:
                    lexicalized_act = "Request the user about " + ", ".join(requests) + "."
                    lexicalized_acts.append(lexicalized_act)

            elif "recommend" in act.lower():
                recommends = []
                for (slot, value) in slot_values:
                    slot, value = slot.lower(), value.lower()
                    if slot in lexicalize_mapping:
                        slot = lexicalize_mapping[slot]
                    if slot == "none":
                        break
                    else:
                        recommends.append(value)
                if recommends:
                    lexicalized_act = "Recommend the user for " + ", ".join(recommends) + "."
                    lexicalized_acts.append(lexicalized_act)

            elif "inform" in act.lower():
                informs = []
                for (slot, value) in slot_values:
                    slot, value = slot.lower(), value.lower()
                    if slot in lexicalize_mapping:
                        slot = lexicalize_mapping[slot]
                    if slot == "none":
                        break
                    else:
                        informs.append(f"the {slot} is {value}")
                if informs:
                    lexicalized_act = "Inform the user that " + ", ".join(informs) + "."  
                    lexicalized_acts.append(lexicalized_act)

            else:
                pass
        if lexicalized_acts:
            return " ".join(lexicalized_acts)
        else:
            return "None"
        
    def _get_domain_from_turn(self, domain, act):
        for k in act:
            turn_domain = k.lower().split("-")[0]
            if turn_domain in self.all_domains:
                return turn_domain
        return domain
            

    def _get_domains_from_log(self, dialogue_log):
        domains = []
        for log in dialogue_log:
            for domain_act in log["dialog_act"]:
                domain = domain_act.split("-")[0].lower()
                if domain in self.all_domains and domain not in domains:
                    domains.append(domain)
        return domains
                
                
def evaluate_dst(results_df, vocal=True, save_path=None):
    global_turns = 0    
    global_jga = 0
    results_single_domain = {"taxi":{"turns":0, "correct_turns_jga":0, "correct_slots":0, "total_slots":0, "slot_f1":0},
                            "restaurant":{"turns":0, "correct_turns_jga":0, "correct_slots":0, "total_slots":0, "slot_f1":0},
                            "hotel":{"turns":0, "correct_turns_jga":0, "correct_slots":0, "total_slots":0, "slot_f1":0},
                            "train":{"turns":0, "correct_turns_jga":0, "correct_slots":0, "total_slots":0, "slot_f1":0},
                            "attraction":{"turns":0, "correct_turns_jga":0, "correct_slots":0, "total_slots":0, "slot_f1":0},
                            "all":{"global_turns":0, "global_f1":0}}
    
    for _, row in results_df.iterrows():
        unpacked_gold = unpack_belief_states(row["gold_bs"], "gold")
        unpacked_pred = unpack_belief_states(row["preds"], "pred")
        domains = row["domains"]
        if isinstance(domains, str):
            domains = ast.literal_eval(domains)

        if set(unpacked_gold)==set(unpacked_pred):
            global_jga += 1
            if len(domains) == 1:
                results_single_domain[domains[0]]["correct_turns_jga"] += 1

        gold_values = [gold.split("-")[1] for gold in unpacked_gold]
        pred_values = [pred.split("-")[1] for pred in unpacked_pred]
        F1, recall, precision = compute_prf(gold_values, pred_values)
        if len(domains) == 1:
            results_single_domain[domains[0]]["slot_f1"] += F1
            results_single_domain[domains[0]]["turns"] += 1
        results_single_domain["all"]["global_f1"] += F1
        results_single_domain["all"]["global_turns"] += 1
        global_turns += 1

    total_single_domain_jga = 0
    total_single_domain_turns = 0
    for domain in results_single_domain:
        if domain == "all":
            continue
        domain_slot_f1 = results_single_domain[domain]["slot_f1"]
        domain_jga = results_single_domain[domain]["correct_turns_jga"]
        domain_turns = results_single_domain[domain]["turns"]
        total_single_domain_jga += domain_jga
        total_single_domain_turns += domain_turns
        results_single_domain[domain]["JGA"] = domain_jga/domain_turns
        results_single_domain[domain]["SLOT-F1"] = domain_slot_f1/domain_turns

        if vocal:
            print(f"""For {domain}, JGA: {results_single_domain[domain]["JGA"]} - SLOT-F1: {results_single_domain[domain]["SLOT-F1"]}""")
    jga_single_domain_average = total_single_domain_jga/total_single_domain_turns
    jga_average = global_jga/global_turns    
    slot_f1_average = results_single_domain["all"]["global_f1"] / results_single_domain["all"]["global_turns"]
    if vocal:
        print(f"""Average JGA in single domain samples only: {jga_single_domain_average}""")
        print(f"""Average JGA overall: {jga_average}""")
        print(f"""Average Slot F1 Overall: {slot_f1_average}""")

    results = results_single_domain
    results["JGA_single_domain_average"] = jga_single_domain_average
    results["JGA_average"] = jga_average

    return results


def completion(model, prompt):            
    completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
    response = completion.choices[0].message.content.strip()
    return response    

In [57]:
model_args = ModelArguments()
data_args = DataArguments()
model_args.model_name_or_path_agent = "openai/gpt-3.5-turbo"
data_args.single_domain_only = True
data_args.dialog_history_limit_dst = 0
data_args.dialog_history_limit_e2e = -1
data_args.dialog_history_limit_rg = -1
data_args.with_slot_domain_diff = False
data_args.with_all_slots = False
data_args.with_slot_description = False

#load mwoz21
mwoz = MWOZ_Dataset(CONFIG, data_args)
dataset_single = mwoz.dataset
dataset_single['domain_length'] = dataset_single['domains'].apply(lambda x: len(x))

data_args.single_domain_only = False

def get_subset_multi(df):
    df['domain_length'] = df['domains'].apply(lambda x: len(x))
    filtered_df = df.loc[(df['domain_length'] == 2)].head(500).append(df.loc[(df['domain_length'] == 3)].head(500))
    return filtered_df

mwoz = MWOZ_Dataset(CONFIG, data_args)
dataset_multi = mwoz.dataset
dataset_multi = get_subset_multi(dataset_multi)

dataset = dataset_single.append(dataset_multi)

Loading data...


Loading databases...
Processing mwoz...


100%|██████████| 10438/10438 [00:02<00:00, 4007.22it/s]
7372it [00:08, 882.06it/s] 


Loading data...
Loading databases...
Processing mwoz...


100%|██████████| 10438/10438 [00:02<00:00, 4044.21it/s]
  filtered_df = df.loc[(df['domain_length'] == 2)].head(500).append(df.loc[(df['domain_length'] == 3)].head(500))
  dataset = dataset_single.append(dataset_multi)


In [2]:
galaxy = json.load(open("/home/willy/instructod/src/e2e/baselines/galaxy-e2e.json", "r"))

In [3]:
df = pd.read_csv("/home/willy/instructod/src/e2e/results/instructod_results.csv")

In [None]:
# galaxy["sng0073"]
# df_e2e = pd.merge(dataset, df[["id", "preds"]], on=["id"], how="right")


In [4]:
dialogue_ids = []
responses = []
turns = []
for k, v in galaxy.items():
    dialogue_id = k.upper() + ".json"
    for idx, resp in enumerate(v):
        response = resp["response"]
        turn = idx
        dialogue_ids.append(dialogue_id)
        responses.append(response)
        turns.append(turn)

In [None]:
galaxy_df = pd.DataFrame({"turn":turns,
                          "dialogue_id":dialogue_ids,
                          "galaxy_preds":responses,
                          "dialogue_context_full":L})

In [69]:
df = pd.read_csv("/home/willy/instructod/src/RG/results/gpt-4_rg_full_output.csv")

In [70]:
df = df[df["dialogue_id"].str.contains("SNG")]

In [15]:
df["dialogue_context_full"] = df["prompt_e2e"].apply(lambda x: x.split("\n\n")[-1][:-8])

In [22]:
df = df.rename(columns={"preds":"rg_preds"})

In [24]:
df.columns

Index(['Unnamed: 0', 'id', 'dialogue_id', 'dialogue_context', 'turn',
       'prompt_dst', 'prompt_dst_update', 'prompt_rg', 'prompt_e2e', 'domains',
       'turn_domain', 'gold_turn_bs', 'gold_bs', 'gold_act', 'gold_response',
       'gold_database_result', 'rg_preds', 'dialogue_context_full'],
      dtype='object')

In [25]:
df[["dialogue_id", "dialogue_context_full", "rg_preds"]].to_csv("gpt-3.5-turbo_rg.csv")

In [10]:
df.columns

Index(['Unnamed: 0', 'id', 'dialogue_id', 'dialogue_context', 'turn',
       'prompt_dst', 'prompt_dst_update', 'prompt_rg', 'prompt_e2e', 'domains',
       'turn_domain', 'gold_turn_bs', 'gold_bs', 'gold_act', 'gold_response',
       'gold_database_result', 'preds'],
      dtype='object')

In [None]:
# df_e2e = pd.merge(dataset, df[["id", "preds"]], on=["id"], how="right")


In [7]:
merged = pd.merge(galaxy_df, df, on=["turn", "dialogue_id"], how="right")

In [8]:
merged.columns

Index(['turn', 'dialogue_id', 'galaxy_preds', 'Unnamed: 0', 'id',
       'dialogue_context', 'prompt_dst', 'prompt_dst_update', 'prompt_rg',
       'prompt_e2e', 'domains', 'turn_domain', 'gold_turn_bs', 'gold_bs',
       'gold_act', 'gold_database_result', 'prompts_e2e_query_db',
       'preds_e2e_query_db', 'preds_e2e_dialog_acts', 'prompts_e2e_rg',
       'naive_preds', 'pptod_preds', 'rg_preds', 'instructod_preds',
       'gold_response'],
      dtype='object')

In [47]:
L = []
for sample in list(merged["prompt_e2e"]):
    if sample is not np.nan:
        L.append(sample.split("\n\n")[-1][:-9])
    else:
        L.append("Bug")


In [40]:
list(merged["prompt_e2e"])[5].split("\n\n")[-1][:-9]

'USER: Are there any Portuguese restaurants in Cambridge?\nSYSTEM: Yes there is a Portuguese restaurant in Cambridge  with two different locations, would you like the addresses?\nUSER: If one of them has a moderate price range please give me that address. If not tell me about Turkish restaurants instead'

In [17]:
merged["prompt_e2e"][0].split("\n\n")[1].split("\n")[0]

"USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton."

In [53]:
merged[["id", "dialogue_context", "naive_preds", "pptod_preds", "rg_preds", "instructod_preds", "gold_response", "galaxy_preds"]].to_csv("/home/willy/instructod/src/e2e/results/instructod_results_with_galaxy.csv")

In [51]:
merged["dialogue_context"] = L

In [52]:
merged.to_csv("/home/willy/instructod/src/e2e/results/instructod_results_with_galaxy.csv")

In [7]:
df.columns

Index(['Unnamed: 0', 'id', 'dialogue_id', 'dialogue_context', 'turn',
       'prompt_dst', 'prompt_dst_update', 'prompt_rg', 'prompt_e2e', 'domains',
       'turn_domain', 'gold_turn_bs', 'gold_bs', 'gold_act',
       'gold_database_result', 'prompts_e2e_query_db', 'preds_e2e_query_db',
       'preds_e2e_dialog_acts', 'prompts_e2e_rg', 'naive_preds', 'pptod_preds',
       'rg_preds', 'instructod_preds', 'gold_response'],
      dtype='object')

In [5]:
#Response Generation with oracle systen actions
df_rg = pd.read_csv("/home/willy/instructod/src/RG/results/gpt-4_rg_full_output.csv")
#Response Generation with e2e single turn
df_e2e = pd.read_csv("/home/willy/instructod/src/e2e/results/gpt-3.5-turbo_e2e_full_output.csv")
df_e2e = pd.merge(dataset, df_e2e[["id", "preds"]], on=["id"], how="right")
#Response Generation with e2e multi turn
df_e2e_agent = pd.read_csv("/home/willy/instructod/src/e2e/results/gpt-3.5+4-turbo_e2e_agents_full_output.csv")
df_e2e_agent = pd.merge(dataset, df_e2e_agent[["id", "preds", "preds_e2e_dialog_acts"]], on=["id"], how="right")
#Response generation from PPTOD baseline
pptod_e2e = json.load(open("/home/willy/instructod/pptod/E2E_TOD/inference_result/base/full_training/inference_result_e2e_evaluation_inform_89.2_success_79.4_bleu_18.62_combine_score_102.92.json", "r"))
pptod_e2e = pd.DataFrame([dict(item, **{"id":idx}) for idx, item in enumerate(pptod_e2e)])
pptod_e2e = pptod_e2e.rename(columns={"resp_gen":"preds"})

In [6]:
df_e2e_agent["prompt_e2e"][2].split("\n\n")[-1][:-9]

"USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.\nSYSTEM: What time do you want to leave and what time do you want to arrive by?\nUSER: I want to leave after 17:15.\nSYSTEM: \nBooking completed! your taxi will be blue honda Contact number is 07218068540\nUSER: Thank you for all the help! I appreciate it"

In [7]:
count = 0
dialogue_contexts = []
golds = []
pptod_resps = []
rg_resps = []
e2e_single_resps = []
e2e_multi_resps = []
for idx, row in df_e2e_agent.iterrows():
    sample_id = row["id"]
    row_e2e = df_e2e.loc[df_e2e["id"] == sample_id]
    row_rg = df_rg.loc[df_rg["id"] == sample_id]
    row_pptod_e2e = pptod_e2e.loc[pptod_e2e["id"] == sample_id]
    # if "none" in str(row["preds_e2e_dialog_acts"]):
    #     continue
    # print("context        :", row["dialogue_context"])
    dialogue_context = row["prompt_e2e"].split("\n\n")[-1][:-9]
    print("context        :", dialogue_context)
    print("dialogue id", row["dialogue_id"])
    print("gold           :", row["gold_response"][8:])
    print("pptod          :", row_pptod_e2e["preds"].item())
    print("rg             :", row_rg["preds"].item())
    print("e2e_single     :", row_e2e["preds"].item())
    print("e2e_multi      :", row["preds"])
    print("----------")
    dialogue_contexts.append(dialogue_context)
    golds.append(row["gold_response"][8:])
    pptod_resps.append(row_pptod_e2e["preds"].item())
    rg_resps.append(row_rg["preds"].item())
    e2e_single_resps.append(row_e2e["preds"].item())
    e2e_multi_resps.append(row["preds"])
    count += 1
# print(f"Printed {count} results")

context        : USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton
dialogue id SNG0073.json
gold           : What time do you want to leave and what time do you want to arrive by?

pptod          : i can help with that . what time would you like to leave ?
rg             : When would you like to leave from Saint John's College and what time do you want to arrive at Pizza Hut Fen Ditton?
e2e_single     : What time would you like the taxi?
e2e_multi      : Sure, I can help you with that. Would you like me to confirm the booking for you?
----------
context        : USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.
SYSTEM: What time do you want to leave and what time do you want to arrive by?
USER: I want to leave after 17:15
dialogue id SNG0073.json
gold           : 
Booking completed! your taxi will be blue honda Contact number is 07218068540

pptod          : i have booked you a [value_car] . the contact number is [value_phone] .
rg  

AttributeError: 'float' object has no attribute 'split'

In [8]:
#For HE
# comparison_df = pd.DataFrame({"dialogue_context":dialogue_contexts,
#                             "gold_responses":golds,
#                             "pptod_responses":pptod_resps,
#                             "rg_responses":rg_resps,
#                             "e2e_single_responses":e2e_single_resps,
#                             "e2e_multi_responses":e2e_multi_resps,
#                             })

# Inform Success BLEU Eval

In [18]:
domains = ["restaurant", "hotel", "train", "attraction"]
keep_data = {"restaurant":["address", "name", "food", "area", "pricerange", "phone", "postcode"],
            "attraction":["name", "area", "address", "type", "postcode", "entrance fee"],
            "hotel":["name", "address", "area", "phone", "postcode", "pricerange", "stars", "internet", "parking", "type"],
            "train":["departure", "destination", "arriveBy", "day", "leaveAt", "price", "trainID", "duration"]}
dbs = {}
for domain in domains:
    db_path = os.path.join(data_args.mwoz_path, f"{domain}_db.json")
    with open(db_path, "r") as f:
        db_data = json.load(f)
    db = {}
    for d in db_data: 
        for k, v in d.items():
            if k in keep_data[domain]:
                if k in db:
                    if v not in db[k]:
                        db[k].append(v.lower())
                else:
                    db[k] = [v.lower()]
    dbs[domain] = db
    
with open("/home/willy/instructod/MultiWOZ_2.1/ontology.json", "r") as f:
    db_data = json.load(f)
taxi_slots = ["departure", "destination", "arriveBy", "leaveAt"]
book_slots = {"restaurant":["time", "day", "people"],
              "hotel":["day", "people", "stay"],
              "train":["people"]}

dbs["taxi"] = {}
for slot in taxi_slots:
    dbs["taxi"][slot] = db_data[f"taxi-semi-{slot}"]
dbs["taxi"]["car"] = ['toyota','skoda','bmw','honda','ford','audi','lexus','volvo','volkswagen','tesla']
dbs["taxi"]["color"] = ['black','white','red','yellow','blue','grey']

    
for domain, slots in book_slots.items():
    for slot in slots:
        if slot == "people":
            dbs[domain][slot] = [value+" people" for value in db_data[f"{domain}-book-{slot}"]] + [value+" person" for value in db_data[f"{domain}-book-{slot}"]]
        else:
            if db_data[f"{domain}-book-{slot}"] not in ["any", ]:
                dbs[domain][slot] = db_data[f"{domain}-book-{slot}"]

for domain in domains:
    if domain == "train":
        continue
    reordered = {k:v for k, v in dbs[domain].items() if k == "name"}
    for k, v in dbs[domain].items():
        if k != "name":
            reordered[k] = v
    dbs[domain] = reordered

In [19]:
df = pd.read_csv("src/e2e/results/gpt-3.5-turbo_e2e_agents_full_output.csv")
df = pd.merge(df[["id", "preds"]], dataset, on=["id"])

In [20]:
idx = 1
print("context", df["prompt_e2e"][idx].split("\n\n")[-1][:-8])
print("gold", df["gold_response"][idx])
print("pred", df["preds"][idx])
print("-----")

context USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.
SYSTEM: What time do you want to leave and what time do you want to arrive by?
USER: I want to leave after 17:15.
gold SYSTEM: 
Booking completed! your taxi will be blue honda Contact number is 07218068540

pred Sure, I can arrange a taxi for you to leave after 17:15. What time would you like to arrive at Pizza Hut Fen Ditton?
-----


In [21]:
pred = df["preds"][idx].lower()
pred

'sure, i can arrange a taxi for you to leave after 17:15. what time would you like to arrive at pizza hut fen ditton?'

In [22]:
VALUES_FIX = {"fen ditton":"fenditton",
              "john's":"johns", "catherine's":"catherines",
              "the bridge guest":"bridge guest", "the rajmahal": "rajmahal", "the bedouin":"bedouin",
              "ian hong":"lan hong", "pizza express":"pizza hut",
              "express by holiday inn cambridge":"inn cambridge", "alpha-milton":"alpha-milton guest house", "el shaddai":"el shaddai guesthouse",
              "bringham new street":"birmingham new street",
              "king's lynn":"kings lynn",
              "nightclub":"night club", "concert hall":"concerthall", "guest house guest house":"guest house",
              "kettle's yard":"kettles yard",
              "3 00":"03:00", "9:30":"09:30", "2:30":"02:30", "1515hrs":"15:15", "9:15":"09:15", "109:30":"19:30", "9:45":"09:45", "7:15 p.m.":"07:15", "5:15":"05:15",
              "009:15":"09:15", "009:30":"09:30", "109:30":"19:30", "109:15":"19:15", "102:30":"12:30",
              "after ":"", " nights":"",
              "town centre":"centre",
             
              "free":"yes", "any":"dontcare"}

def delexicalize(df, dbs, delex_column="preds"):
    delex_preds = []
    phone_pattern = r"\d{11}"
    for idx, row in tqdm(df.iterrows()):
        pred = row[delex_column].lower()
        domain = row["turn_domain"]
        for value_fix in VALUES_FIX:
            pred = pred.replace(value_fix, VALUES_FIX[value_fix])
        pred = re.sub(phone_pattern, "[value_phone]", pred)
        for k, values in dbs[domain].items():
            for v in values:
                if v in pred:
                    pred = pred.replace(v, f"[value_{k.lower()}]")
        delex_preds.append(pred)
    return delex_preds
        

In [23]:
#delex on single domain

#Response Generation with oracle systen actions
df_rg = pd.read_csv("/home/willy/instructod/src/RG/results/gpt-4_rg_full_output.csv")
df_rg = df_rg[df_rg['dialogue_id'].str.lower().str.contains('sng')]
df_rg.dropna(subset=['turn_domain'], inplace=True)

#Response Generation with e2e single turn
df_e2e = pd.read_csv("/home/willy/instructod/src/e2e/results/gpt-3.5-turbo_e2e_full_output.csv")
df_e2e = pd.merge(dataset, df_e2e[["id", "preds"]], on=["id"], how="right")
df_e2e.dropna(subset=['turn_domain'], inplace=True)
df_e2e = df_e2e[df_e2e['dialogue_id'].str.lower().str.contains('sng')]
#Response Generation with e2e multi turn
df_e2e_agent = pd.read_csv("/home/willy/instructod/src/e2e/results/gpt-3.5+4-turbo_e2e_agents_full_output.csv")
df_e2e_agent = pd.merge(dataset, df_e2e_agent[["id", "preds", "preds_e2e_dialog_acts"]], on=["id"], how="right")
df_e2e_agent.dropna(subset=['turn_domain'], inplace=True)
df_e2e_agent = df_e2e_agent[df_e2e_agent['dialogue_id'].str.lower().str.contains('sng')]
#Response generation from PPTOD baseline
pptod_e2e = json.load(open("/home/willy/instructod/pptod/E2E_TOD/inference_result/base/full_training/inference_result_e2e_evaluation_inform_89.2_success_79.4_bleu_18.62_combine_score_102.92.json", "r"))
pptod_e2e = pd.DataFrame([dict(item, **{"id":idx}) for idx, item in enumerate(pptod_e2e)])
pptod_e2e = pptod_e2e.rename(columns={"resp_gen":"preds"})
pptod_e2e = pptod_e2e[pptod_e2e['dial_id'].str.lower().str.contains('sng')]

In [24]:
delex_gold = delexicalize(dataset, "gold_response")
delex_gold = [gold.replace("\n", "").replace("system: ", "") for gold in delex_gold]
delex_df_rg = delexicalize(df_rg)
delex_df_e2e = delexicalize(df_e2e)
delex_df_e2e_agent = delexicalize(df_e2e_agent)
delex_pptod = list(pptod_e2e["preds"])

delex_resps = {"gold":delex_gold,
                "rg":delex_df_rg,
                "e2e":delex_df_e2e,
                "e2e_multi":delex_df_e2e_agent,
                "pptod":delex_pptod}
# with open("/home/willy/instructod/src/e2e/results/delex_preds.json", "w") as f:
#     json.dump(delex_resps, f, indent=4)

1053it [00:00, 5738.01it/s]
1036it [00:00, 5477.91it/s]
1053it [00:00, 5166.37it/s]
1022it [00:00, 4626.04it/s]


In [25]:
delex_pptod = list(pptod_e2e["preds"])
for i in range(200):
    print("gold       :", delex_gold[i])
    print("rg         :", delex_df_rg[i])
    print("e2e        :", delex_df_e2e[i])
    print("e2e_multi  :", delex_df_e2e_agent[i])
    print("pptod      :", delex_pptod[i])
    print("----")

gold       : what time do you want to leave and what time do you want to arrive by?
rg         : when would you like to leave from [value_departure] and what time do you want to arrive at [value_departure]?
e2e        : what time would you like the taxi?
e2e_multi  : sure, i can help you with that. would you like me to confirm the booking for you?
pptod      : i can help with that . what time would you like to leave ?
----
gold       : booking completed! your taxi will be [value_color] [value_car] contact number is [value_phone]
rg         : a [value_color] [value_car] taxi will be available for you. you can contact the driver at [value_phone].
e2e        : a taxi will be available for you at [value_arriveby] to take you to [value_departure]. estimated time of arrival is [value_arriveby]. shall i book it for you?
e2e_multi  : sure, i can arrange a taxi for you to leave [value_arriveby]. what time would you like to arrive at [value_departure]?
pptod      : i have booked you a [value_car

In [10]:
delex_preds = delexicalize(df)
for delex_pred, pred in zip(delex_preds, df["preds"]):
    print(pred)
    print(delex_pred)
    print("------")

Sure, I can help you with that. Would you like me to confirm the booking for you?
sure, i can help you with that. would you like me to confirm the booking for you?
------
Sure, I can arrange a taxi for you to leave after 17:15. What time would you like to arrive at Pizza Hut Fen Ditton?
sure, i can arrange a taxi for you to leave [value_arriveby]. what time would you like to arrive at [value_departure]?
------
You're welcome! Have a great day!
you're welcome! have a great day!
------
Thank you for using our service. Have a great day!
thank you for using our service. have a great day!
------
I'm sorry, I couldn't find any Portuguese restaurants in Cambridge. Would you like me to search for other types of restaurants in the area?
i'm sorry, i couldn't find [value_time] [value_food] restaurants in cambridge. would you like me to search for other types of restaurants in the area?
------
I'm sorry, but there are no Portuguese or Turkish restaurants in Cambridge according to our records. Wou

In [11]:
def delexicalize_dbs(data_args, ontology_path):
    domains = ["restaurant", "hotel", "train", "attraction"]
    keep_data = {"restaurant":["address", "name", "food", "area", "pricerange", "phone", "postcode"],
                "attraction":["name", "area", "address", "type", "postcode", "entrance fee"],
                "hotel":["name", "address", "area", "phone", "postcode", "pricerange", "stars", "internet", "parking", "type"],
                "train":["departure", "destination", "arriveBy", "day", "leaveAt", "price", "trainID", "duration"]}
    dbs = {}
    for domain in domains:
        db_path = os.path.join(data_args.mwoz_path, f"{domain}_db.json")
        with open(db_path, "r") as f:
            db_data = json.load(f)
        db = {}
        for d in db_data: 
            for k, v in d.items():
                if k in keep_data[domain]:
                    if k in db:
                        if v not in db[k]:
                            db[k].append(v.lower())
                    else:
                        db[k] = [v.lower()]
        dbs[domain] = db

    with open(ontology_path, "r") as f:
        db_data = json.load(f)
    taxi_slots = ["departure", "destination", "arriveBy", "leaveAt"]
    book_slots = {"restaurant":["time", "day", "people"],
                  "hotel":["day", "people", "stay"],
                  "train":["people"]}

    dbs["taxi"] = {}
    for slot in taxi_slots:
        dbs["taxi"][slot] = db_data[f"taxi-semi-{slot}"]

    for domain, slots in book_slots.items():
        for slot in slots:
            if slot == "people":
                dbs[domain][slot] = [value+" people" for value in db_data[f"{domain}-book-{slot}"]] + [value+" person" for value in db_data[f"{domain}-book-{slot}"]]
            else:
                dbs[domain][slot] = db_data[f"{domain}-book-{slot}"]

    for domain in domains:
        if domain == "train":
            continue
        reordered = {k:v for k, v in dbs[domain].items() if k == "name"}
        for k, v in dbs[domain].items():
            if k != "name":
                reordered[k] = v
        dbs[domain] = reordered
    return dbs

def delexicalize(df, delex_debs):
    delex_preds = []
    for idx, row in df.iterrows():
        pred = row["preds"]
        domain = row["turn_domain"]
        for k, values in delex_debs[domain].items():
            for v in values:
                if v in pred.lower():
                    pred = pred.lower().replace(v, f"[value_{k.lower()}]")
        delex_preds.append(pred)
    df["delex_preds"] = delex_preds
    return df

In [140]:
ontology_path = data_args.mwoz_path + "ontology.json"
delex_dbs = delexicalize_dbs(data_args, ontology_path)
new_df = delexicalize(df, delex_dbs)

## Evaluate e2e baselines

In [404]:
# dataset = pd.read_csv("src/e2e/results/instructod_results.csv")
dataset = pd.read_csv("src/e2e/results/gpt-3.5-turbo_e2e_agents_multi_full_output.csv")

In [401]:
dataset["domain_count"] = dataset["domains"].apply(lambda x: len(x))
dataset_2 = dataset[dataset['domain_count'] == 2]
dataset_3 = dataset[dataset['domain_count'] == 3]

In [405]:
# df = pd.merge(dataset_2, dataset[["id", "preds"]], on=["id"])
# df = df.drop_duplicates("id")

df = pd.merge(dataset_3, dataset[["id", "preds"]], on=["id"])
df = df.drop_duplicates("id")

In [410]:
augpt = json.load(open("src/e2e/baselines/augpt.json", "r"))
galaxy = json.load(open("src/e2e/baselines/galaxy-e2e.json", "r"))
pptod = json.load(open("src/e2e/baselines/pptod.json", "r"))
soloist = json.load(open("src/e2e/baselines/soloist.json", "r"))
ubar = json.load(open("src/e2e/baselines/ubar.json", "r"))

# dataset = pd.read_csv("src/e2e/results/instructod_results.csv")

# dataset = dataset.dropna(subset=["gold_turn_bs"])

df["dial_id"] = df["dialogue_id"].apply(lambda x: x.lower().split(".")[0])
#keep only single
# dataset = dataset[dataset['dialogue_id'].str.contains('SNG')]

keep_id = list(df["dial_id"].unique())

augpt = {k: v for k, v in augpt.items() if k in keep_id}
galaxy = {k: v for k, v in galaxy.items() if k in keep_id}
pptod = {k: v for k, v in pptod.items() if k in keep_id}
soloist = {k: v for k, v in soloist.items() if k in keep_id}
ubar = {k: v for k, v in ubar.items() if k in keep_id}

baselines = {"augpt":augpt,
             "galaxy":galaxy,
             "pptod":pptod,
             "soloist":soloist,
             "ubar":ubar}

In [12]:
dataset = df.copy()

In [413]:
for baseline_name, baseline in baselines.items():
    resps = []
    for idx, row in tqdm(dataset.iterrows()):
        dial_id = row["dial_id"]
        # if "sng" not in dial_id:
        #     continue
        turn = row["turn"]
        turn = int(turn)
        resp = baseline[dial_id][turn]["response"]
        resps.append(resp)
    dataset[f"preds_{baseline_name}"] = resps
    
dataset["preds_augpt"] = dataset["preds_augpt"].apply(lambda x: x.lower().replace("[", "[value_"))
dataset["preds_soloist"] = dataset["preds_soloist"].apply(lambda x: x.replace("[restaurant_", "[value_").replace("[attraction_", "[value_").replace("[hotel_", "[value_").replace("[train_", "[value_").replace("[taxi_", "[value_"))

500it [00:00, 12562.31it/s]
500it [00:00, 14182.60it/s]
500it [00:00, 13970.77it/s]
500it [00:00, 15118.97it/s]
500it [00:00, 17690.32it/s]


In [104]:
#RG 

for baseline_name, baseline in rg_baselines.items():
    resps = []
    print("Processing", baseline_name)
    for idx, row in tqdm(dataset.iterrows()):
        dial_id = row["dial_id"]
        # if "sng" not in dial_id:
        #     continue
        turn = row["turn"]
        turn = int(turn)
        if dial_id not in baseline:
            # print("not found") #16 in labes
            resp = None
        else:
            resp = baseline[dial_id][turn]["response"]
        resps.append(resp)
    dataset[f"preds_{baseline_name}"] = resps

dataset = dataset[~dataset['gold_turn_bs'].isna()]

# dataset["preds_augpt"] = dataset["preds_augpt"].apply(lambda x: x.lower().replace("[", "[value_"))
# dataset["preds_soloist"] = dataset["preds_soloist"].apply(lambda x: x.replace("[restaurant_", "[value_").replace("[attraction_", "[value_").replace("[hotel_", "[value_").replace("[train_", "[value_").replace("[taxi_", "[value_"))

Processing bort


1036it [00:00, 13402.56it/s]


Processing damd


1036it [00:00, 9894.79it/s]


Processing hdno


1036it [00:00, 13001.74it/s]


Processing hdas


1036it [00:00, 8047.39it/s]


Processing labes


1036it [00:00, 13869.50it/s]


Processing lava


1036it [00:00, 13914.31it/s]


Processing marco


1036it [00:00, 13954.70it/s]


Processing mintl


1036it [00:00, 14396.75it/s]


Processing pptod


1036it [00:00, 15368.32it/s]


Processing rstod


1036it [00:00, 15064.78it/s]


Processing sfn


1036it [00:00, 15274.48it/s]


Processing uniconv


1036it [00:00, 15534.90it/s]


In [19]:
def delexicalize_dbs(data_args, ontology_path):
    domains = ["restaurant", "hotel", "train", "attraction"]
    # keep_data = {"restaurant":["address", "name", "food", "area", "pricerange", "phone", "postcode"],
    #             "attraction":["name", "area", "address", "type", "postcode", "entrance fee"],
    #             "hotel":["name", "address", "area", "phone", "postcode", "pricerange", "stars", "internet", "parking", "type"],
    #             "train":["departure", "destination", "arriveBy", "day", "leaveAt", "price", "trainID", "duration"]}
    keep_data = {"restaurant":["address", "name", "food", "area", "pricerange", "phone", "postcode"],
                "attraction":["name", "area", "address", "type", "postcode"],
                "hotel":["name", "address", "area", "phone", "postcode", "pricerange", "stars", "internet", "parking", "type"],
                "train":["departure", "destination", "day", "price", "duration"]}
    dbs = {}
    for domain in domains:
        db_path = os.path.join(data_args.mwoz_path, f"{domain}_db.json")
        with open(db_path, "r") as f:
            db_data = json.load(f)
        db = {}
        for d in db_data: 
            for k, v in d.items():
                if k in keep_data[domain]:
                    if k in db:
                        if v not in db[k]:
                            db[k].append(v.lower())
                    else:
                        db[k] = [v.lower()]
        dbs[domain] = db

    with open(ontology_path, "r") as f:
        db_data = json.load(f)
    taxi_slots = ["departure", "destination", "arriveBy", "leaveAt"]
    book_slots = {"restaurant":["time", "day", "people"],
                  "hotel":["day", "people", "stay"],
                  "train":["people"]}

    dbs["taxi"] = {}
    for slot in taxi_slots:
        dbs["taxi"][slot] = db_data[f"taxi-semi-{slot}"]

    for domain, slots in book_slots.items():
        for slot in slots:
            if slot == "people":
                dbs[domain][slot] = [value+" people" for value in db_data[f"{domain}-book-{slot}"]] + [value+" person" for value in db_data[f"{domain}-book-{slot}"]]
            else:
                dbs[domain][slot] = db_data[f"{domain}-book-{slot}"]

    for domain in domains:
        if domain == "train":
            continue
        reordered = {k:v for k, v in dbs[domain].items() if k == "name"}
        for k, v in dbs[domain].items():
            if k != "name":
                reordered[k] = v
        dbs[domain] = reordered
    return dbs

VALUES_FIX = {#"fen ditton":"fenditton",
              "john's":"johns", "catherine's":"catherines",
              "the bridge guest":"bridge guest", "the rajmahal": "rajmahal", "the bedouin":"bedouin",
              "ian hong":"lan hong", "pizza express":"pizza hut",
              "express by holiday inn cambridge":"inn cambridge", "alpha-milton":"alpha-milton guest house", "el shaddai":"el shaddai guesthouse",
              "bringham new street":"birmingham new street",
              "king's lynn":"kings lynn",
              "nightclub":"night club", "concert hall":"concerthall", "guest house guest house":"guest house",
              "kettle's yard":"kettles yard",
              "3 00":"03:00", "9:30":"09:30", "2:30":"02:30", "1515hrs":"15:15", "9:15":"09:15", "109:30":"19:30", "9:45":"09:45", "7:15 p.m.":"07:15", "5:15":"05:15",
              "009:15":"09:15", "009:30":"09:30", "109:30":"19:30", "109:15":"19:15", "102:30":"12:30",
              "after ":"", " nights":"",
              "town centre":"centre",
             
              "free":"yes"}

def delexicalize(df, dbs, delex_column="preds"):
    delex_preds = []
    phone_pattern = (r"\d{5} \d{6}|\d{11}")
    postcode_pattern = (r"[a-z]{2}\d{1} \d{1}[a-z]{2}")
    reference_pattern = (r'reference number is (.*?)\.|reference number is (.*?) | \d{6}.')
    trainid_pattern = (r'TR\d{4}|tr\d{4}')
    time_pattern = (r'\d{2}:\d{2}')
    for idx, row in tqdm(df.iterrows()):
        pred = row[delex_column].lower().replace(",", "")
        domain = row["turn_domain"]
        # domain = row["turn_domain"][0][1:-1]
        if domain == "general":
            delex_preds.append(pred)
            continue
            
        for value_fix in VALUES_FIX:
            pred = pred.replace(value_fix, VALUES_FIX[value_fix])
        pred = re.sub(phone_pattern, "[value_phone]", pred)
        if re.search(postcode_pattern, pred):
            postcode = re.search(postcode_pattern, pred).group(0)
            # print(domain, pred)
            # if postcode.replace(" ", "") in dbs[domain]["postcode"]:
            pred = pred.replace(postcode, "[value_postcode]")
        if re.search(trainid_pattern, pred):
            trainid = re.search(trainid_pattern, pred).group(0)
            pred = pred.replace(trainid, "[value_id]")
        if re.search(time_pattern, pred):
            time = re.search(time_pattern, pred).group(0)
            pred = pred.replace(time, "[value_time]")
        if re.search(reference_pattern, pred):
            reference = re.search(reference_pattern, pred).group(0)
            delex_ref = " ".join(reference.split(" ")[:-1]) + " [value_reference]."
            pred = pred.replace(reference, delex_ref)
        print(domain)
        for k, values in dbs[domain].items():
            for v in values:
                if v in pred:
                    pred = pred.replace(v, f"[value_{k.lower()}]")
        delex_preds.append(pred)
    df[f"delexicalized_{delex_column}"] = delex_preds
    return df

In [71]:
e2e_results = df.copy()
e2e_results.dropna(subset=["turn_domain", "preds"], inplace=True)

In [73]:
ontology_path = data_args.mwoz_path + "ontology.json"
delex_dbs = delexicalize_dbs(data_args, ontology_path)
e2e_results = delexicalize(e2e_results, delex_dbs, delex_column="preds")
# e2e_results["delexicalized_instructod_preds"] = e2e_results["delexicalized_instructod_preds"].apply(lambda x: x.replace("value_", ""))

1036it [00:00, 5884.77it/s]

taxi
taxi
taxi
taxi
restaurant
restaurant
restaurant
restaurant
restaurant
hotel
hotel
hotel
hotel
hotel
hotel
restaurant
restaurant
restaurant
restaurant
train
train
train
train
train
restaurant
restaurant
restaurant
hotel
hotel
hotel
hotel
restaurant
hotel
hotel
hotel
hotel
restaurant
restaurant
restaurant
restaurant
restaurant
restaurant
restaurant
restaurant
taxi
taxi
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
restaurant
restaurant
restaurant
taxi
taxi
taxi
taxi
taxi
taxi
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
taxi
taxi
taxi
taxi
taxi
restaurant
restaurant
restaurant
restaurant
restaurant
train
train
train
train
train
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
taxi
taxi
taxi
taxi
attraction
attraction
attraction
attraction
restaurant
restaurant
restaurant
restaurant
restaurant
restaurant
restaurant
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
hotel
train
train
train
train
train
taxi
taxi
taxi
taxi
hotel
hot




In [74]:
def get_success(df, pred_column="delexicalized_preds"):
    import ast

    list_fails_e2e = []

    total_result = {}
    cur_request_slots = {}
    cur_inform_slots = {}

    prev_dialogue_id = df["dialogue_id"][0]
    for idx, row in df.iterrows():
        cur_dialogue_id = row["dialogue_id"]
        if cur_dialogue_id not in total_result:
            total_result[cur_dialogue_id] = {}

        if cur_dialogue_id != prev_dialogue_id:
            ##compute everything and reset variables, we are switching samples
            success = True
            if cur_request_slots:
                # print(prev_dialogue_id)
                # print(cur_request_slots)
                pass
            for k, v in cur_request_slots.items():
                if cur_request_slots:
                    if v[1] == 0:
                        success = False
            if success:
                total_result[prev_dialogue_id]["success"] = 1
            else:
                total_result[prev_dialogue_id]["success"] = 0
                list_fails_e2e.append(prev_dialogue_id)
                # print(row["turn_domain"])

            if cur_inform_slots:
                inform_score = 0
                for k, v in cur_inform_slots.items():
                    inform_score += v[1]/v[0]
                total_result[prev_dialogue_id]["inform"] = inform_score/len(cur_inform_slots)
            else:
                 total_result[prev_dialogue_id]["inform"] = 1

            cur_request_slots = {}
            cur_inform_slots = {}

        delex_pred = row[pred_column]

        #success
        if isinstance(row["gold_turn_bs"], str):
            gold_turn_bs = ast.literal_eval(row["gold_turn_bs"])
        else:
            gold_turn_bs = row["gold_turn_bs"]
        for k, v in gold_turn_bs.items():
            if "request" in k.lower():
                for slot_values in v:
                    # print(slot_values[0].lower())
                    if slot_values[0].lower() == "ticket":
                        slot = "price"
                    elif slot_values[0].lower() == "time":
                        slot = "duration"
                    elif slot_values[0].lower() == "fee":
                        slot = "entrance fee"
                    elif slot_values[0].lower() == "ref":
                        slot = "reference"
                    elif slot_values[0].lower() == "pricerange":
                        slot = "price"
                    else:
                        slot = slot_values[0].lower()
                    # request_slot = slot_values[0].lower() + "_value"
                    request_slot = "value_" + slot
                    if request_slot in cur_request_slots:
                        cur_request_slots[request_slot][0] += 1
                    else:
                        cur_request_slots[request_slot] = [1, 0]

        #match
        if isinstance(row["gold_act"], str):
            gold_act = ast.literal_eval(row["gold_act"])
        else:
            gold_act = row["gold_act"]
        for k, v in gold_act.items():
            if "inform" in k.lower():
                for slot_values in v:
                    inform_slot = slot_values[0].lower() + "_value"
                    if inform_slot in cur_inform_slots:
                        cur_inform_slots[inform_slot][0] += 1
                    else:
                        cur_inform_slots[inform_slot] = [1, 0]

                    if inform_slot in delex_pred:
                        cur_inform_slots[inform_slot][1] += 1


        for slot in cur_request_slots:
            if slot in delex_pred:
                cur_request_slots[slot][1] += 1

        prev_dialogue_id = cur_dialogue_id
    return total_result

In [59]:
def compute_success(total_result, dataset):
    total_count = 0
    total_multi = 0
    total_single = 0
    correct_multi_success = 0
    correct_single_success = 0
    correct_multi_match = 0
    correct_single_match = 0
    results = {}
    L = len(total_result)
    
    for k, v in total_result.items():
        if not v:
            continue
        if "MUL" in k:
            total_multi += 1
            correct_multi_success += v["success"]
            correct_multi_match += v["inform"]
        else:
            total_single += 1
            correct_single_success += v["success"]
            correct_single_match += v["inform"]
    results["success_total"] = (correct_single_success+correct_multi_success) / L
    results["success_single"] = correct_single_success / total_single if total_single != 0 else 0
    results["success_multi"] = correct_multi_success / total_multi if total_multi != 0 else 0
    results["match_total"] = (correct_single_match+correct_multi_match) / L
    results["match_single"] = correct_single_match / total_single if total_single != 0 else 0
    results["match_multi"] = correct_multi_match / total_multi if total_multi != 0 else 0
    
    
    per_domain_result = {"taxi":{"success_count":0,"nb_samples":0},
                         "train":{"success_count":0,"nb_samples":0},
                         "hotel":{"success_count":0,"nb_samples":0},
                         "restaurant":{"success_count":0,"nb_samples":0},
                         "attraction":{"success_count":0,"nb_samples":0}}
    for k, v in total_result.items():
        if not v:
            continue
        domain = list(dataset[dataset["dialogue_id"] == k]["turn_domain"])[0]
        per_domain_result[domain]["success_count"] += v["success"]
        per_domain_result[domain]["nb_samples"] += 1
    
    for k, v in per_domain_result.items():
        per_domain_result[k]["success"] = v['success_count']/v['nb_samples'] if v['nb_samples'] != 0 else 0
    
    results.update(per_domain_result)
    return results

In [75]:
total_result = get_success(e2e_results, pred_column="delexicalized_preds")
total_result = {k:v for k, v in total_result.items() if v}
results = compute_success(total_result, dataset)
results

{'success_total': 0.8165137614678899,
 'success_single': 0.8165137614678899,
 'success_multi': 0,
 'match_total': 0.01834862385321101,
 'match_single': 0.01834862385321101,
 'match_multi': 0,
 'taxi': {'success_count': 45,
  'nb_samples': 51,
  'success': 0.8823529411764706},
 'train': {'success_count': 21,
  'nb_samples': 33,
  'success': 0.6363636363636364},
 'hotel': {'success_count': 54,
  'nb_samples': 63,
  'success': 0.8571428571428571},
 'restaurant': {'success_count': 52,
  'nb_samples': 60,
  'success': 0.8666666666666667},
 'attraction': {'success_count': 6,
  'nb_samples': 11,
  'success': 0.5454545454545454}}

In [414]:
total_result = get_success(e2e_results, pred_column="delexicalized_instructod_preds")
results = compute_success(total_result)
results

IndexError: list index out of range

In [None]:
e2e_results

In [359]:
#multi domain - only 2 
total_result = get_success(e2e_results, pred_column="delexicalized_instructod_preds")
results = compute_success(total_result)
results

{'success_total': 0.16923076923076924,
 'success_single': 1.0,
 'success_multi': 0.15873015873015872,
 'match_total': 0.0,
 'match_single': 0.0,
 'match_multi': 0.0,
 'taxi': {'success_count': 0, 'nb_samples': 0, 'success': 0},
 'train': {'success_count': 3,
  'nb_samples': 19,
  'success': 0.15789473684210525},
 'hotel': {'success_count': 2, 'nb_samples': 8, 'success': 0.25},
 'restaurant': {'success_count': 6,
  'nb_samples': 19,
  'success': 0.3157894736842105},
 'attraction': {'success_count': 0, 'nb_samples': 18, 'success': 0.0}}

In [39]:
#multi domain - only 3
total_result = get_success(e2e_results, pred_column="delexicalized_instructod_preds")
results = compute_success(total_result)
results

{'success_total': 0.09259259259259259,
 'success_single': 0,
 'success_multi': 0.09433962264150944,
 'match_total': 0.0,
 'match_single': 0,
 'match_multi': 0.0}

In [210]:
for baseline in baselines:
    total_result = get_success(dataset, pred_column=f"preds_{baseline}")
    results = compute_success(total_result)
    print(baseline)
    # print("total", results["success_total"])
    # print("multi", results["success_multi"])
    print(results)
    print("----")

augpt
total 0.8181818181818182
multi 0.6324786324786325
----
galaxy
total 0.8152492668621701
multi 0.6068376068376068
----
pptod
total 0.8005865102639296
multi 0.5811965811965812
----
soloist
total 0.7419354838709677
multi 0.5042735042735043
----
ubar
total 0.8035190615835777
multi 0.6068376068376068
----


In [236]:
domains = ["taxi", "restaurant", "train", "attraction", "hotel"]
for baseline in baselines:
    total_result = get_success(dataset, pred_column=f"preds_{baseline}")
    results = compute_success(total_result)
    print(baseline)
    print("total", results["success_total"])
    # print("multi", results["success_multi"])
    for domain in domains:
        print(domain, results[domain]["success"])
    print("----")

augpt
total 0.9158878504672897
taxi 1.0
restaurant 0.9
train 0.9090909090909091
attraction 0.7272727272727273
hotel 0.9166666666666666
----
galaxy
total 0.9252336448598131
taxi 1.0
restaurant 0.9666666666666667
train 0.8181818181818182
attraction 0.7272727272727273
hotel 0.9333333333333333
----
pptod
total 0.9158878504672897
taxi 1.0
restaurant 0.9666666666666667
train 0.7575757575757576
attraction 0.6363636363636364
hotel 0.95
----
soloist
total 0.8691588785046729
taxi 0.8979591836734694
restaurant 0.9166666666666666
train 0.696969696969697
attraction 0.6363636363636364
hotel 0.95
----
ubar
total 0.9112149532710281
taxi 0.9795918367346939
restaurant 0.9666666666666667
train 0.7272727272727273
attraction 0.7272727272727273
hotel 0.95
----


In [108]:
for baseline in rg_baselines:
    total_result = get_success(dataset, pred_column=f"preds_{baseline}")
    results = compute_success(total_result)
    print(baseline)
    print("total", results["success_total"])
    print("multi", results["success_multi"])
    print("----")

bort
total 0.9299065420560748
multi 0
----
damd
total 0.897196261682243
multi 0
----
hdno
total 0.5934579439252337
multi 0
----
hdas
total 0.5887850467289719
multi 0
----


TypeError: argument of type 'NoneType' is not iterable

In [40]:
#single
for baseline in baselines:
    total_result = get_success(dataset, pred_column=f"preds_{baseline}")
    results = compute_success(total_result)
    print(baseline)
    print("total", results["success_total"])
    print("multi", results["success_multi"])
    print("----")

augpt
total 0.9192825112107623
multi 0
----
galaxy
total 0.9282511210762332
multi 0
----
pptod
total 0.9192825112107623
multi 0
----
soloist
total 0.8699551569506726
multi 0
----
ubar
total 0.9103139013452914
multi 0
----


## With evaluator

In [439]:
augpt = json.load(open("src/e2e/baselines/augpt.json", "r"))
galaxy = json.load(open("src/e2e/baselines/galaxy-e2e.json", "r"))
pptod = json.load(open("src/e2e/baselines/pptod.json", "r"))
soloist = json.load(open("src/e2e/baselines/soloist.json", "r"))
ubar = json.load(open("src/e2e/baselines/ubar.json", "r"))

#RG baselines
bort = json.load(open("src/RG/baselines/bort.json", "r"))
damd = json.load(open("src/RG/baselines/damd.json", "r"))
hdno = json.load(open("src/RG/baselines/hdno.json", "r"))
hdsa = json.load(open("src/RG/baselines/hdsa.json", "r"))
labes = json.load(open("src/RG/baselines/labes.json", "r"))
lava = json.load(open("src/RG/baselines/lava.json", "r"))
marco = json.load(open("src/RG/baselines/marco.json", "r"))
mintl = json.load(open("src/RG/baselines/mintl.json", "r"))
pptod = json.load(open("src/RG/baselines/pptod.json", "r"))
rstod = json.load(open("src/RG/baselines/rstod.json", "r"))
sfn = json.load(open("src/RG/baselines/sfn.json", "r"))
uniconv = json.load(open("src/RG/baselines/uniconv.json", "r"))


keep_id = list(dataset["dial_id"].unique())


augpt = {k: v for k, v in augpt.items() if k in keep_id}
galaxy = {k: v for k, v in galaxy.items() if k in keep_id}
pptod = {k: v for k, v in pptod.items() if k in keep_id}
soloist = {k: v for k, v in soloist.items() if k in keep_id}
ubar = {k: v for k, v in ubar.items() if k in keep_id}

#RG baselines
bort = {k: v for k, v in bort.items() if k in keep_id}
damd = {k: v for k, v in damd.items() if k in keep_id}
hdno = {k: v for k, v in hdno.items() if k in keep_id}
hdsa = {k: v for k, v in hdsa.items() if k in keep_id}
labes = {k: v for k, v in labes.items() if k in keep_id}
lava = {k: v for k, v in lava.items() if k in keep_id}
marco = {k: v for k, v in marco.items() if k in keep_id}
mintl = {k: v for k, v in mintl.items() if k in keep_id}
pptod = {k: v for k, v in pptod.items() if k in keep_id}
rstod = {k: v for k, v in rstod.items() if k in keep_id}
sfn = {k: v for k, v in sfn.items() if k in keep_id}
uniconv = {k: v for k, v in uniconv.items() if k in keep_id}

rg_baselines = {"bort":bort,
               "damd":damd,
               "hdno":hdno,
               "hdas":hdsa,
               "labes":labes,
               "lava":lava,
               "marco":marco,
               "mintl":mintl,
               "pptod":pptod,
               "rstod":rstod,
               "sfn":sfn,
               "uniconv":uniconv,}

In [238]:
from mwzeval.metrics import Evaluator

e = Evaluator(bleu=True, success=True, richness=False)

In [76]:
eval_results = {}
for baseline_name, baseline in rg_baselines.items():
    results = e.evaluate(baseline)
    print(baseline_name)
    print("BLEU", results["bleu"]["mwz22"])
    print("Inform", results["success"]["inform"])
    # print("Success", results["success"]["success"]["total"])
    eval_results[baseline_name] = results

NameError: name 'rg_baselines' is not defined

In [53]:
# with open("src/RG/baselines/rg_baseline_results.json", "w") as f:
#     json.dump(eval_results, f, indent=4)

In [289]:
augpt = list(dataset["preds_augpt"])

In [415]:
df = dataset.copy()

In [438]:
# 1
def get_baseline(df, baseline_name):
    baseline = {}
    for idx, row in df.iterrows():
        dial_id = row["dial_id"]
        resp = row[f"preds_{baseline_name}"]
        if dial_id in baseline:
            baseline[dial_id].append({"response":resp})
        else:
            baseline[dial_id] = []
    return baseline

augpt = get_baseline(df, "augpt")
galaxy = get_baseline(df, "galaxy")
pptod = get_baseline(df, "pptod")
soloist = get_baseline(df, "soloist")
ubar = get_baseline(df, "ubar")

augpt_ids = list(augpt.keys())

In [440]:
# 3
augpt = {k:v for k, v in augpt.items() if k in augpt_ids}
galaxy = {k:v for k, v in galaxy.items() if k in augpt_ids}
pptod = {k:v for k, v in pptod.items() if k in augpt_ids}
soloist = {k:v for k, v in soloist.items() if k in augpt_ids}
ubar = {k:v for k, v in ubar.items() if k in augpt_ids}

In [442]:
results = e.evaluate(augpt)
print(f"augpt: {results}")
print("-------")
results = e.evaluate(galaxy)
print(f"galaxy: {results}")
print("-------")
results = e.evaluate(pptod)
print(f"pptod: {results}")
print("-------")
results = e.evaluate(soloist)
print(f"soloist: {results}")
print("-------")
results = e.evaluate(ubar)
print(f"ubar: {results}")

augpt: {'bleu': {'mwz22': 14.783935143891455}, 'success': {'inform': {'attraction': 82.1, 'restaurant': 86.5, 'taxi': 100.0, 'total': 64.8, 'hotel': 72.4, 'train': 66.7}, 'success': {'attraction': 56.4, 'restaurant': 54.1, 'taxi': 60.4, 'total': 50.0, 'hotel': 62.1, 'train': 66.7}}, 'richness': {'entropy': 6.491089500508718, 'cond_entropy': 1.9977003700476377, 'avg_lengths': 13.40711462450593, 'msttr': 0.6924444444444443, 'num_unigrams': 320, 'num_bigrams': 999, 'num_trigrams': 1472}, 'dst': None}
-------
galaxy: {'bleu': {'mwz22': 17.307565082813177}, 'success': {'inform': {'attraction': 84.6, 'restaurant': 89.2, 'taxi': 100.0, 'total': 68.5, 'hotel': 69.0, 'train': 100.0}, 'success': {'attraction': 66.7, 'restaurant': 64.9, 'taxi': 62.5, 'total': 57.4, 'hotel': 58.6, 'train': 100.0}}, 'richness': {'entropy': 6.018695779886865, 'cond_entropy': 1.669661896128644, 'avg_lengths': 13.616600790513834, 'msttr': 0.6468613138686132, 'num_unigrams': 179, 'num_bigrams': 564, 'num_trigrams': 830

In [None]:
df.head(20)

In [144]:
augpt["sng0073"]

[{'response': 'Sure! when would you like to arrive?',
  'state': {'taxi': {'departure': "saint john's college",
    'destination': 'pizza hut fenditton'}},
  'active_domains': ['taxi']},
 {'response': 'Your booking is complete. A [car] will pick you up.',
  'state': {'taxi': {'departure': "saint john's college",
    'destination': 'pizza hut fenditton',
    'leave at': '17:15'}},
  'active_domains': ['taxi']},
 {'response': "You're welcome. Is there anything else I can help you with?",
  'state': {'taxi': {'departure': "saint john's college",
    'destination': 'pizza hut fenditton',
    'leave at': '17:15'}},
  'active_domains': ['taxi']},
 {'response': 'Thank you for using our services.',
  'state': {'taxi': {'departure': "saint john's college",
    'destination': 'pizza hut fenditton',
    'leave at': '17:15'}},
  'active_domains': ['taxi']}]

In [152]:
# filtered_dict = {k: [{'response': nested_dict['response']} for nested_dict in v] for k, v in augpt.items()}
filtered_dict = {k: [{'response': nested_dict['response']} for nested_dict in v] for k, v in galaxy.items()}

In [153]:
filtered_dict["sng0073"]

[{'response': 'what time would you like to leave and arrive by ?'},
 {'response': 'booking completed ! booked car type : [value_car] contact number : [value_phone]'},
 {'response': 'is there anything else i can help you with ?'},
 {'response': 'thank you for using our services .'}]

In [154]:
results = e.evaluate(filtered_dict)
print(f"BLEU: {results}")

Unknown slot name: hotel. Please use another slot names or customize the slot mapping!
Unknown slot name: hotel. Please use another slot names or customize the slot mapping!
Unknown slot name: hotel. Please use another slot names or customize the slot mapping!
BLEU: {'bleu': {'mwz22': 19.635149778483182}, 'success': {'inform': {'total': 92.0, 'taxi': 100.0, 'restaurant': 95.0, 'attraction': 98.2, 'train': 94.9, 'hotel': 93.1}, 'success': {'total': 82.1, 'taxi': 87.2, 'restaurant': 86.0, 'attraction': 80.6, 'train': 90.9, 'hotel': 86.0}}, 'richness': None, 'dst': None}


Much lower in multi because we give all the knowledge base and let the model decide which one it needs to interact with, but the only information the model has access to are the knowledge base attributes.

Fine-tuned settings know which domains are available (from training) and interact with the correct knowledge base
In InstrucTOD, (since it's zero-shot) we don't assume domain knowledge and simply let the model navigate through the knowledge bases to find the correct one. But due to attribute overlaps, this introduces additional errors.

## Compute for delexicalized e2e

In [181]:
e2e_results = pd.read_csv("src/e2e/results/instructod_results.csv")
e2e_results = e2e_results.dropna(subset=["instructod_preds"])

# e2e_results = pd.read_csv("src/e2e/results/instructod_results.csv")
# e2e_results = e2e_results.dropna(subset=["rg_preds", "turn_domain"])

# e2e_results = pd.read_csv("src/e2e/results/gpt-3.5-turbo_e2e_agents_multi_full_output.csv")
# e2e_results = e2e_results.rename(columns={"preds":"instructod_preds"})
# e2e_results = pd.merge(dataset, e2e_results[["id", "instructod_preds"]], on=["id"])
# e2e_results = e2e_results.drop_duplicates(subset=["id"])

In [64]:
# e2e_results = e2e_results_single.append(e2e_results)

  e2e_results = e2e_results_single.append(e2e_results)


In [443]:
df = df.rename(columns={"preds":"instructod_preds"})
e2e_results = df.copy()

In [444]:
delex_column = "instructod_preds"

ontology_path = data_args.mwoz_path + "ontology.json"
delex_dbs = delexicalize_dbs(data_args, ontology_path)
e2e_results = delexicalize(e2e_results, delex_dbs, delex_column=delex_column)
e2e_results[f"delexicalized_{delex_column}"] = e2e_results[f"delexicalized_{delex_column}"].apply(lambda x: x.replace("value_", ""))

500it [00:00, 4412.19it/s]


In [154]:
# delex_rg = []
# #delexicalize gold
# for idx, row in e2e_results.iterrows():
#     pred = row["rg_preds"]
#     gold_act = row["gold_act"]
#     gold_act = ast.literal_eval(gold_act)
#     mapping = {}
#     for k, v in gold_act.items():
#         if "inform" in k.lower():
#             for slot_value in v:
#                 slot = slot_value[0]
#                 value = slot_value[1]
#                 mapping[value.lower()] = slot.lower()
#     for value in mapping:
#         pred = pred.lower().replace(value, f"[{mapping[value]}]")
#     pred = pred.lower()
#     delex_rg.append(pred)

In [155]:
domain_count = [domain for domains in list(e2e_results["domains"]) for domain in domains]
for domain in ["hotel", "attraction", "taxi", "restaurant", "train"]:
    print(domain, domain_count.count(domain))

hotel 0
attraction 0
taxi 0
restaurant 0
train 0


In [70]:
for i in range(10):
    print(list(e2e_results["gold_response"])[i])
    print(list(e2e_results["instructod_preds"])[i])
    print("----")

SYSTEM: There are 21 guesthouses that have parking available. Is there a certain price range or part of the city you are interested in?  

I'm sorry, but there are currently no guesthouses available with free parking included. Would you like me to search for guesthouses with paid parking options?
----
SYSTEM: Unfortunately, I can't find any that are expensive. Let's try a different price range.

Great, I found the perfect guesthouse for you! The Acorn Guest House has free parking and is in a higher price range. Would you like me to confirm your booking?
----
SYSTEM: Do you want a cheap hotel or a cheap guesthouse?

Great, I found the Acorn Guest House located at 154 Chesterton Road, CB4 1DA. It has free parking and falls within the cheap price range. Would you like me to confirm a booking for you?
----
SYSTEM: How about Alexander Bed and Breakfast?

I found a cheap guesthouse with free wifi. Would you like me to confirm the booking for you?
----
SYSTEM: Booked. Your reference number is

In [77]:
e2e_preds = {}
delex_column = "preds"

e2e_results[f"delexicalized_{delex_column}"] = e2e_results[f"delexicalized_{delex_column}"].apply(lambda x: x.replace("value_", ""))

for idx, row in e2e_results.iterrows():
    cur_id = row["dialogue_id"].lower().split(".")[0]    
    resp = {"response": row[f"delexicalized_{delex_column}"]}
    if cur_id not in e2e_preds:
        e2e_preds[cur_id] = []
    else:
        e2e_preds[cur_id].append(resp)


In [63]:
from mwzeval.metrics import Evaluator

e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(e2e_preds)
print(f"Inform: {results['success']['inform']}")
print(f"Success: {results['success']['success']}")

Inform: {'taxi': 100.0, 'total': 92.2, 'restaurant': 98.3, 'hotel': 84.1, 'train': 84.8, 'attraction': 90.9}
Success: {'taxi': 100.0, 'total': 48.9, 'restaurant': 31.7, 'hotel': 31.7, 'train': 27.3, 'attraction': 63.6}


In [78]:
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(e2e_preds)
print(f"BLEU: {results}")

BLEU: {'bleu': {'mwz22': 2.623162407981584}, 'success': {'inform': {'taxi': 100.0, 'total': 84.5, 'restaurant': 86.7, 'hotel': 81.0, 'train': 60.6, 'attraction': 90.9}, 'success': {'taxi': 100.0, 'total': 49.8, 'restaurant': 35.0, 'hotel': 31.7, 'train': 24.2, 'attraction': 72.7}}, 'richness': {'entropy': 7.02149085683963, 'cond_entropy': 2.286556615178964, 'avg_lengths': 17.851897184822523, 'msttr': 0.7617182130584194, 'num_unigrams': 618, 'num_bigrams': 2487, 'num_trigrams': 4112}, 'dst': None}


In [66]:
#Multi only
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(e2e_preds)
print(f"BLEU: {results}")

BLEU: {'bleu': {'mwz22': 4.7695355465894895}, 'success': {'inform': {'total': 61.3, 'hotel': 77.4, 'restaurant': 82.1, 'attraction': 74.0, 'train': 86.4, 'taxi': 100.0}, 'success': {'total': 8.4, 'hotel': 17.0, 'restaurant': 14.9, 'attraction': 31.5, 'train': 43.2, 'taxi': 20.8}}, 'richness': {'entropy': 7.21454933835474, 'cond_entropy': 2.67021809782643, 'avg_lengths': 24.645856980703744, 'msttr': 0.7280184331797237, 'num_unigrams': 1149, 'num_bigrams': 4307, 'num_trigrams': 7245}, 'dst': None}


In [71]:
#All
e = Evaluator(bleu=True, success=True, richness=True)
results = e.evaluate(e2e_preds)
print(f"BLEU: {results}")

Unknown slot name: insert phone number. Please use another slot names or customize the slot mapping!
Unknown slot name: insert address. Please use another slot names or customize the slot mapping!
Unknown slot name: insert postcode. Please use another slot names or customize the slot mapping!
BLEU: {'bleu': {'mwz22': 4.386530260899267}, 'success': {'inform': {'total': 80.2, 'taxi': 100.0, 'restaurant': 86.6, 'hotel': 81.4, 'train': 88.3, 'attraction': 73.8}, 'success': {'total': 18.0, 'taxi': 16.3, 'restaurant': 15.0, 'hotel': 22.1, 'train': 42.9, 'attraction': 33.3}}, 'richness': {'entropy': 7.182286639286662, 'cond_entropy': 2.761211032021644, 'avg_lengths': 24.197848176927675, 'msttr': 0.7241779975278119, 'num_unigrams': 1374, 'num_bigrams': 5907, 'num_trigrams': 10849}, 'dst': None}


In [72]:
e2e_preds["pmul0267"]

[{'response': "how about the golden dragon restaurant in the heart of cambridge? it's known for its high-end FOOD cuisine. would you like me to make a reservation for you?",
  'state': {'restaurant': {'food': 'chinese', 'pricerange': 'expensive'}},
  'active_domains': ['restaurant']},
 {'response': 'sure based on your preferences i recommend the golden dragon restaurant located at 25 high street cambridge POST. their phone number is 01223 364 333. would you like me to confirm a booking for you?',
  'state': {'restaurant': {'area': 'centre',
    'food': 'chinese',
    'pricerange': 'expensive'}},
  'active_domains': ['restaurant']},
 {'response': 'sure i can confirm a booking for you at NAME. the address is 123 main street postcode 12345. the phone number is 555-555-5555. would you like me to proceed with the booking?',
  'state': {'restaurant': {'area': 'centre',
    'food': 'chinese',
    'pricerange': 'expensive'}},
  'active_domains': ['restaurant']},
 {'response': "sure i can help 

In [386]:
e2e_results

Unnamed: 0,id,dialogue_id,dialogue_context,turn,prompt_dst,prompt_dst_update,prompt_rg,prompt_e2e,domains,turn_domain,gold_turn_bs,gold_bs,gold_act,gold_response,gold_database_result,domain_length,domain_count,instructod_preds,dial_id,preds_augpt,preds_galaxy,preds_pptod,preds_soloist,preds_ubar,delexicalized_instructod_preds
0,341,SNG0991.json,USER: I want to find a guesthouse with free parking included. Can you help?\n,0,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: home from home, address: 7...","[hotel, restaurant]",hotel,"{'Hotel-Inform': [['Type', 'guesthouse'], ['Parking', 'yes']]}","{'hotel-parking': 'yes', 'hotel-type': 'guesthouse'}","{'Hotel-Request': [['Area', '?'], ['Price', '?']], 'Hotel-Inform': [['Choice', '21'], ['Parking', 'none'], ['Type', 'guesthouses']]}",SYSTEM: There are 21 guesthouses that have parking available. Is there a certain price range or part of the city you are interested in? \n,,2,2,"I'm sorry, but there are currently no guesthouses available with free parking included. Would you like me to search for guesthouses with paid parking options?",sng0991,there are 21 [value_type] with free parking. do you have a price range or area of town you prefer?,there are [value_choice] [value_type] with free parking . do you have a price range or area you would like to stay in ?,there are [value_choice] [value_type] with free parking . do you have a price range or area preference ?,"sure , there are [value_count] guesthouses with free parking . do you have a price range or area you want to stay in ?","i have [value_choice] [value_type] with free parking, do you have a price range or area you would like to stay in?",i'm sorry but there are currently [internet] [type]s available with [internet] parking included. would you like me to search for [type]s with paid parking options?
2,342,SNG0991.json,USER: I would really like something expensive. \n,1,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\naddress: Crowne Plaza Hotel 20 D...","[hotel, restaurant]",restaurant,"{'Hotel-Inform': [['Price', 'expensive']]}","{'hotel-parking': 'yes', 'hotel-pricerange': 'expensive', 'hotel-type': 'guesthouse'}","{'Restaurant-Request': [['Price', '?']], 'Restaurant-NoOffer': [['Price', 'expensive']]}","SYSTEM: Unfortunately, I can't find any that are expensive. Let's try a different price range.\n",,2,2,"Great, I found the perfect guesthouse for you! The Acorn Guest House has free parking and is in a higher price range. Would you like me to confirm your booking?",sng0991,"i'm sorry, but i don't have anything that matches your criteria. would you like to try a [value_price range] [value_type]?","i am sorry , but there are no [value_price] [value_type] .","i am sorry , but i am not finding anything that meets your criteria . would you like to try a different price range or a [value_type] ?","i have [value_count] guesthouses that fit that criteria . [value_count] in the [value_area] , [value_count] in the [value_area] , and [value_count] in the [value_area] . which area would you like to stay in ?",how about [value_name] in the [value_area] ? it is [value_price] and has excellent reviews.,great i found the perfect guesthouse for you! the acorn guest house has yes parking and is in a higher price range. would you like me to confirm your booking?
4,343,SNG0991.json,"USER: Alright, let's try for a cheap hotel.\n",2,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: home from home, address: 7...","[hotel, restaurant]",hotel,"{'Hotel-Inform': [['Price', 'cheap']]}","{'hotel-parking': 'yes', 'hotel-pricerange': 'cheap', 'hotel-type': 'hotel'}","{'Hotel-Select': [['Price', 'cheap'], ['Price', 'cheap'], ['Type', 'hotel'], ['Type', 'guesthouse']]}",SYSTEM: Do you want a cheap hotel or a cheap guesthouse?\n,,2,2,"Great, I found the Acorn Guest House located at 154 Chesterton Road, CB4 1DA. It has free parking and falls within the cheap price range. Would you like me to confirm a booking for you?",sng0991,i have 9 [value_price range] [value_type] available. do you have a preference for the area?,[value_name] meets your criteria . would you like me to book you a room ?,there are [value_choice] [value_price] [value_type] with free parking . do you have a preference for the area ?,i have [value_count] [value_pricerange] hotel -s. [value_count] in the [value_area] and [value_count] in the [value_area] . which area do you prefer ?,what area would you like to stay in?,great i found the [name] located at [address] [postcode]. it has [internet] parking and falls within the [pricerange] price range. would you like me to confirm a booking for you?
6,344,SNG0991.json,USER: I meant a cheap guesthouse with free wifi.\n,3,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: home from home, address: 7...","[hotel, restaurant]",hotel,"{'Hotel-Inform': [['Type', 'guesthouse'], ['Price', 'cheap'], ['Internet', 'yes']]}","{'hotel-parking': 'yes', 'hotel-pricerange': 'cheap', 'hotel-internet': 'yes', 'hotel-type': 'guesthouse'}","{'Hotel-Recommend': [['Name', 'Alexander Bed and Breakfast']]}",SYSTEM: How about Alexander Bed and Breakfast?\n,,2,2,I found a cheap guesthouse with free wifi. Would you like me to confirm the booking for you?,sng0991,i have 9 [value_price range] [value_type] with free wifi. do you have a preference for the area of [value_type] or [value_type]?,[value_name] meets your criteria . would you like me to book it for you ?,i have [value_choice] [value_type] that meet your criteria . do you have a preference for the area of town you would like to stay in ?,i have [value_count] guesthouses that meet your criteria . [value_count] is in the [value_area] and [value_count] is in the [value_area] . do you have a preference ?,i have [value_choice] [value_type] that meet your needs. i would recommend [value_name] with [value_stars] stars.,i found a [pricerange] [type] with [internet] wifi. would you like me to confirm the booking for you?
8,345,SNG0991.json,"USER: Sounds good, let's book it! 8 people for 4 nights, starting Friday.\n",4,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: home from home, address: 7...","[hotel, restaurant]",hotel,"{'Hotel-Inform': [['Stay', '4'], ['Day', 'friday'], ['People', '8']]}","{'hotel-stay': '4', 'hotel-day': 'friday', 'hotel-people': '8', 'hotel-name': 'alexander bed and breakfast', 'hotel-parking': 'yes', 'hotel-pricerange': 'cheap', 'hotel-internet': 'yes', 'hotel-type': 'guesthouse'}","{'general-reqmore': [['none', 'none']], 'Booking-Book': [['Ref', '2V7EJVA7']]}",SYSTEM: Booked. Your reference number is 2V7EJVA7. Is there anything else you need?\n,,2,2,Great! I can confirm your booking for Alexander Bed and Breakfast for 8 people starting Friday for 4 nights. Would you like me to proceed with the booking?,sng0991,"i'm sorry, but the alexander bed and breakfast isn't available for your requested stay. would you like to try a different day or shorter stay?",booking was successful . reference number is : [value_reference] . is there anything else i can do for you ?,i have made those reservations and your reference number is [value_reference] .,i have your party of [value_count] booked at the [value_name] for [value_count] nights starting [value_day] . your reference number is [value_reference] . can i help you with anything else ?,booking was successful. reference number is : [value_reference] . is there anything else i can help you with?,great! i can confirm your booking for [name] for [people] starting [day] for [stars]. would you like me to proceed with the booking?
10,346,SNG0991.json,"USER: Nope, that'll do it. Thank you so much for your help!\n",5,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nhotel-day\nhotel-people\nhotel-stay\nhotel-area\nhotel-internet\nhotel-name\nhotel-parking\nhotel-pricerange\nhotel-stars\nhotel-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-a...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: home from home, address: 7...","[hotel, restaurant]",hotel,"{'general-thank': [['none', 'none']]}","{'hotel-stay': '4', 'hotel-day': 'friday', 'hotel-people': '8', 'hotel-name': 'alexander bed and breakfast', 'hotel-parking': 'yes', 'hotel-pricerange': 'cheap', 'hotel-internet': 'yes', 'hotel-type': 'guesthouse'}","{'general-bye': [['none', 'none']]}",SYSTEM: You're welcome. Hope you have a pleasant stay.\n,,2,2,"You're welcome! If you need any further assistance, don't hesitate to reach out. Have a great day!",sng0991,you are welcome. enjoy your stay.,you are welcome . have a great day !,you are welcome . have a great day !,you are welcome . is there anything else i can help you with ?,thank you for using our services.,you're welcome! if you need any further assistance don't hesitate to reach out. have a great day!
12,4,PMUL4648.json,USER: Please find a restaurant called Nusha.\n,0,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\naddress: Crowne Plaza Hotel 20 D...","[restaurant, attraction]",restaurant,"{'Restaurant-Inform': [['none', 'none']], 'Attraction-Inform': [['Name', 'Nusha']]}",{},"{'Restaurant-Request': [['Food', '?']], 'Restaurant-NoOffer': [['Name', 'Nusha']]}",SYSTEM: I don't seem to be finding anything called Nusha. What type of food does the restaurant serve?\n,,2,2,"Sure, I can help you with that. Would you like me to make a reservation for you at Nusha?",pmul4648,[value_name] is an [value_price range] [value_food] restaurant in the [value_area]. would you like me to book you a table?,[value_name] is an [value_price] [value_food] restaurant in the [value_area] . would you like me to make a reservation for you ?,[value_name] is a [value_food] restaurant in the [value_area] . would you like me to book a table for you ?,[value_name] is a [value_food] restaurant in the [value_area] of town . it is [value_pricerange] .,[value_name] is a restaurant. the phone number is [value_phone] .,sure i can help you with that. would you like me to make a reservation for you at nusha?
13,5,PMUL4648.json,USER: I am not sure of the type of food but could you please check again and see if you can find it? Thank you.\n,1,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\naddress: Crowne Plaza Hotel 20 D...","[restaurant, attraction]",restaurant,"{'general-thank': [['none', 'none']]}",{},"{'Restaurant-Request': [['Name', '?']], 'Restaurant-Inform': [['Name', 'Nandos']]}",SYSTEM: Could you double check that you've spelled the name correctly? The closest I can find is Nandos.\n,,2,2,"I apologize for the inconvenience. I will search again for Nusha. If I am unable to find it, would you like me to suggest some similar restaurants in the area?",pmul4648,"i'm sorry, nusha is a restaurant serving [value_food] food. would you like to book a table there?",it is a [value_food] restaurant .,[value_name] is a [value_food] restaurant in the [value_area] .,[value_name] is a [value_food] restaurant in the [value_area] . it is [value_pricerange] . would you like to book a table ?,[value_name] serves [value_food] food,i apologize for the inconvenience. i will search again for nusha. if i am unable to find it would you like me to suggest some similar restaurants in the area?
14,6,PMUL4648.json,"USER: It's not a restaurant, it's an attraction. Nusha.\n",2,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: great saint mary's church,...","[restaurant, attraction]",attraction,"{'Attraction-Inform': [['Name', 'nusha']]}",{'attraction-name': 'nusha'},"{'general-reqmore': [['none', 'none']], 'Attraction-Inform': [['Type', 'entertainment'], ['Addr', 'unit g6'], ['Addr', 'cambridge leisure park'], ['Addr', 'clifton road'], ['Area', 'south']]}","SYSTEM: oh its okay. that is an entertainment type located in the south at unit g6, cambridge leisure park, clifton road. do you need their phone number?\n",,2,2,Thank you for clarifying. I have found the attraction called Nusha and it is classified as entertainment. Is there anything else I can help you with?,pmul4648,[value_name] is a [value_type] in the [value_area] of town. their address is [value_address].,[value_name] is [value_price] to enter .,[value_name] is a [value_type] in the [value_area] . would you like their phone number ?,"ok , i have found it . it is a nightclub in the [value_area] . it is located at [value_count] [value_address] . can i help you with anything else ?",is there anything else i can help you with?,thank you for clarifying. i have found the attraction called [name] and it is classified as [type]. is there anything else i can help you with?
15,7,PMUL4648.json,"USER: No, but please confirm their address again and their postcode. \n",3,"Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","Generate the dialogue state of the following dialogue between a USER and a task-oriented dialogue SYSTEM. The results should be in a single python dictionary following this format: {""domain1-slot1"":value1, ""domain2-slot2"":""value2""}. Use the provided domain and slots, and nothing else:\n\nSLOTS:\nattraction-area\nattraction-name\nattraction-type\nrestaurant-day\nrestaurant-people\nrestaurant-time\nrestaurant-area\nrestaurant-food\nrestaurant-name\nrestaurant-pricerange\n\nYou can follow this ...","In a task oriented dialogue setting, generate a SYSTEM response to the USER query in the conversation provided in CONTEXT. You should follow the information provided in ACT to generate this answer. Do not answer with anything other than what is provided in the dialogue act:\n\nYou can follow this example:\nUSER: How much does the banana cost?\nACT: Inform the user that the price is 10$, the promotion is 80%, and the choice is 5, and request the amount that user wants.\nSYSTEM: There are 5 to...","Generate the answer of the SYSTEM in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You can either request more details to the user that is available in the knowledge base to complete the goal, or simply answer the user's request. Do not provide multiple choice for the user to choose, just recommend one, and generate nothing other that the SYSTEM reply. Use the following knowledge base to interact with the user:\n\nname: great saint mary's church,...","[restaurant, attraction]",attraction,"{'Attraction-Request': [['Addr', '?'], ['Post', '?']]}",{'attraction-name': 'nusha'},"{'general-reqmore': [['none', 'none']], 'Attraction-Inform': [['Post', 'CB17DY'], ['Addr', 'Unit g6'], ['Addr', 'Cambridge Leisure Park'], ['Addr', 'Clifton road .']]}","SYSTEM: Their address is Unit g6, Cambridge Leisure Park, Clifton road. The postcode is CB17DY. Is their anything else I can do for you?\n",,2,2,"Sure, the address for Nusha is Unit G6, Cambridge Leisure Park, Clifton Road, and the postcode is CB1 7DY. Is there anything else I can assist you with?",pmul4648,"[value_name] is located at [value_address], postcode [value_postcode]. is there anything else i can help you with?",the address is [value_address] and the postcode is [value_postcode] . is there anything else i can help you with ?,"[value_name] is located at [value_address] , postcode [value_postcode] . can i help you with anything else today ?","sure , it s located at [value_address] . postcode [value_postcode] .",the address is [value_address] and the postcode is [value_postcode] .,sure the address for [name] is unit g6 cambridge leisure [type] clifton road and the postcode is [postcode]. is there anything else i can assist you with?


In [369]:
L_augpt = [resp["response"] for k, v in augpt.items() for resp in v]

In [370]:
L_augpt

['NAME is an PRICE FOOD restaurant in the AREA. would you like me to book you a table?',
 "i'm sorry, nusha is a restaurant serving FOOD food. would you like to book a table there?",
 'NAME is a TYPE in the AREA of town. their address is ADDRESS.',
 'NAME is located at ADDRESS, postcode POST. is there anything else i can help you with?',
 "i'm sorry, there aren't any FOOD restaurants in the AREA. would you like to try a different type of food?",
 'i have several. do you have a preference for area?',
 'the address for NAME is ADDRESS.',
 'NAME serves FOOD food.',
 'thank you for using our system today!',
 'thank you for using our system today!',
 'there are several PRICE restaurants. do you have a preference for area or type of food?',
 'there are 21 restaurants in the AREA of town that are PRICE. do you have a preference for type of food?',
 'NAME serves FOOD food and the postcode is POST.',
 'NAME serves FOOD food and the postcode is POST.',
 "i'm sorry, there are no TYPE in the AREA 

In [364]:
L = list(e2e_results["delexicalized_instructod_preds"])
for i in range(len(L)):
    print(L[i])
    print("---")

i'm sorry but there are currently [internet] [type]s available with [internet] parking included. would you like me to search for [type]s with paid parking options?
---
great i found the perfect guesthouse for you! the acorn guest house has yes parking and is in a higher price range. would you like me to confirm your booking?
---
great i found the [name] located at [address] [postcode]. it has [internet] parking and falls within the [pricerange] price range. would you like me to confirm a booking for you?
---
i found a [pricerange] [type] with [internet] wifi. would you like me to confirm the booking for you?
---
great! i can confirm your booking for [name] for [people] starting [day] for [stars]. would you like me to proceed with the booking?
---
you're welcome! if you need any further assistance don't hesitate to reach out. have a great day!
---
sure i can help you with that. would you like me to make a reservation for you at nusha?
---
i apologize for the inconvenience. i will search