In [3]:
import os
import json
import logging
import pandas as pd
import numpy as np

from tqdm import tqdm
from langchain import PromptTemplate

from src.DST.config import CONFIG
# from src.DST.config import CONFIG


import os

# CONFIG = {
#     "openai_api_key": os.environ["OPENAI_API_KEY"], #Put your own there
#     "openai_organization": os.environ["OPENAI_API_ORG"], #Put your own there (needed afaik if using gpt4)
#     #NEED TO RENAME THESE TO SLOT FREE DST (no need to even know slot name)
#     "INSTRUCTION_PROMPTS":["""Using the following context, generate the belief states of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting in a json format:\n""",
#                            """You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following knowledge base as grounding for acts, slots and values, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with 'domain', 'act' and 'belief_state' as information:\n""",
#                            """You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following knowledge base as grounding for acts, slots and values, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with 'domain', 'act' and 'belief_state' as primary keys, for example {'domain':domain, 'act':act, 'belief_state':{first slot: first value, second slot: second value, etc...}:\n""",
#                            """You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following SLOTS provided, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with the slot name as the primary key, and the retrieved value associated to the slot, for example {'slot1':'value1', 'slot2':'value2', etc...}:\n"""],
#     "CORRECTION_INSTRUCTION_PROMPTS": ["""You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following knowledge base as grounding for acts, slots and values, another faulty system already generated the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. If you think the belief states are inaccurate, you should generate new correct belief state. The dialogue states should only be related to the information the USER is looking for, and not what the SYSTEM should give. If you think the given belief states are accurate, then just say 'correct'. The newly generated results should follow the same json format with 'domain', 'act' and 'belief_state' as information:\n""",
#                                        """You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following knowledge base as grounding for acts, slots and values, another faulty system already generated the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You should generate the new and correct belief state. The belief state should only be related to the information the USER is looking for, and not what the SYSTEM needs to give. The newly generated results should follow the same json format with 'domain', 'act' and 'belief_state' as primary keys:\n"""],
#     "SLOTS_GENERATION_INSTRUCTION_PROMPTS": [""""""],
#     "PROMPT_TEMPLATE":{"template": """{instruction}\n\nKnowledge base:\n{ontology}\n\nContext:\n{dialogue}""",
#                        "input_variables": ["instruction", "ontology", "dialogue"]},
#     "PROMPT_TEMPLATE_WITHOUT_ONTOLOGY":{"template": """{instruction}\n\nContext:\n{dialogue}""",
#                                         "input_variables": ["instruction", "dialogue"]},
#     "CORRECTION_PROMPT_TEMPLATE":{"template": """{instruction}\n\nKnowledge base:\n{ontology}\n\nContext:\n{dialogue}\n\nPredicted Belief State:\n{belief_state}""",
#                                   "input_variables": ["instruction", "ontology", "dialogue", "belief_state"]},
#     "CORRECTION_PROMPT_TEMPLATE_WITHOUT_ONTOLOGY":{"template": """{instruction}\n\nContext:\n{dialogue}\n\nPredicted Belief State:\n{belief_state}""",
#                                                    "input_variables": ["instruction", "dialogue", "belief_state"]},
#     "PROMPT_TEMPLATE_SLOTS_GENERATION":{"template": """Complete the following python dictionnary named SLOTS of the most general slots that a USER can query in a task-oriented dialogue setting for belief state tracking in the following domain: {domains_string}. The dictionnary should hav for primary keys the possible domains, for example "SLOTS = {{"domain1":[slot1, slot2, etc...], "domain2":[slot1, slot2, etc...]}}\n\nSLOTS = {{""",
#                                         "input_variables": ["domains_string"]}, #Generates dictionnary SLOTS = {domain:[list of slots]}
# }


class PromptConstructor():
    def __init__(self, 
                 config, 
                 domains, 
                 path_to_db, 
                 with_ontology, 
                 max_values_per_slot):

        if domains == "all":
            self.domains = ["taxi", "restaurant", "hotel", "train", "attraction"]
        else:
            self.domains = domains
        self.prompt_config = {domain:{"instruction":"",
                                      "context_prompt":""} for domain in self.domains}

        self.config = config
        self.PROMPT = ""
        for domain in self.domains:
            df = self.get_df_from_domain(domain, path_to_db)
            extracted_ontology = self.extract_slot_from_kb(df, max_values_per_slot)
            context_prompt = self.construct_context_prompt_from_ontology(extracted_ontology, domain, mode=1)
            self.prompt_config[domain]["instruction"] = config["INSTRUCTION_PROMPTS"][2]
            self.prompt_config[domain]["context_prompt"] = context_prompt

        if with_ontology:
            self.prompt_template = PromptTemplate(
                input_variables=config["PROMPT_TEMPLATE"]["input_variables"],
                template = config["PROMPT_TEMPLATE"]["template"]
            )
            self.correct_prompt_template = PromptTemplate(
                input_variables=config["CORRECTION_PROMPT_TEMPLATE"]["input_variables"],
                template = config["CORRECTION_PROMPT_TEMPLATE"]["template"]
            )
        else:
            self.prompt_template = PromptTemplate(
                input_variables=config["PROMPT_TEMPLATE_WITHOUT_ONTOLOGY"]["input_variables"],
                template = config["PROMPT_TEMPLATE_WITHOUT_ONTOLOGY"]["template"]
            )
            self.correct_prompt_template = PromptTemplate(
                input_variables=config["CORRECTION_PROMPT_TEMPLATE_WITHOUT_ONTOLOGY"]["input_variables"],
                template = config["CORRECTION_PROMPT_TEMPLATE_WITHOUT_ONTOLOGY"]["template"]
            )

    def get_df_from_domain(self, domain, path_to_db):
        json_path = os.path.join(path_to_db, f"{domain}_db.json")
        with open(json_path, 'r') as f:
            db = json.load(f)
        df = pd.DataFrame(db)
            
        return df

    def extract_slot_from_kb(self, df, max_values_per_slot):
        column_names = df.columns
        extracted_ontology = {}
        for column_name in column_names:
            exceeded_max_values_per_slot = False
            temp_memory = []
            for value in df[column_name]:
                str_value = str(value)
                if str_value not in temp_memory:
                    temp_memory.append(str_value)
                    if len(temp_memory) >= max_values_per_slot:
                        possible_values = ["anything"]
                        exceeded_max_values_per_slot = True
                        break
            if not exceeded_max_values_per_slot:
                possible_values = temp_memory
            extracted_ontology[column_name] = possible_values
            
        return extracted_ontology

    def _construct_acts_from_domain(self, domain):
        dialog_acts = {
        'restaurant': ['inform', 'request'], #'nooffer', 'recommend', 'select', 'offerbook', 'offerbooked', 'nobook'],
        'hotel': ['inform', 'request'], #'nooffer', 'recommend', 'select', 'offerbook', 'offerbooked', 'nobook'],
        'attraction': ['inform', 'request'], #'nooffer', 'recommend', 'select'],
        'train': ['inform', 'request'], #'nooffer', 'offerbook', 'offerbooked', 'select'],
        'taxi': ['inform', 'request']}
        prompt_acts = f"domain: general, possible acts: greet, bye, thanks\n"
        prompt_acts += f"domain: {domain}, possible acts: "
        for act in dialog_acts[domain]:
            prompt_acts += act + ", "
        return prompt_acts[:-2]

    def construct_context_prompt_from_ontology(self, ontology_json, domain, mode):
        """
        mode=0: key:value
        mode=1: 'slot':key, 'possible values':value
        """
        acts = self._construct_acts_from_domain(domain)
        context_prompt = f"""{acts}\n"""
        ontology_json = self._fix_values(ontology_json, domain)
        for key, values in ontology_json.items():
            if mode == 0:
                context_prompt += key + ": "
                for idx, value in enumerate(values):
                    if value:
                        context_prompt += value + ", "
                context_prompt = context_prompt[:-2] + "\n"

            elif mode == 1:
                context_prompt += "slot:" + key + ", possible values: "
                for idx, value in enumerate(values):
                    if value:
                        context_prompt += value + ", "
                context_prompt = context_prompt[:-2] + "\n"
        
        return context_prompt

    def _fix_values(self, ontology_json, domain):
        if domain=="restaurant":
            ontology_json.pop("signature")
            ontology_json.pop("type")
        elif domain=="hotel":
            ontology_json.pop("location")
        elif domain=="taxi":
            ontology_json["taxi_phone"] = ["anything"]
        elif domain=="train":
            ontology_json["price"] = ["anything"]
            ontology_json["duration"] = ["anything"]
        elif domain=="attraction":
            ontology_json["entrance fee"] = ["anything"]
        return ontology_json



class MWOZ_DST(PromptConstructor):
    def __init__(self, 
                 config=CONFIG,
                 path_to_db="/home/willy/InstrucTOD/MultiWOZ_2.1/",
                 with_ontology=True,
                 max_values_per_slot=25,
                 mwoz_path="MultiWOZ_2.1/", 
                 dialog_history_limit=0, 
                 single_domain_only=True,
                 domains="all",
                 debug=True):
        PromptConstructor.__init__(self, 
                                   config, 
                                   domains, 
                                   path_to_db, 
                                   with_ontology, 
                                   max_values_per_slot)
        # self.PROMPT = "test"
        if domains == "all":
            self.domains = ["taxi", "restaurant", "hotel", "train", "attraction"]
        else:
            self.domains = domains

        self.single_domain_data = {"utterances":[],
                                      "dialog_acts":[],
                                      "domains":[],
                                      "ids":[],
                                      "speakers":[]}
        
        self.multi_domain_data = {"utterances":[],
                                     "dialog_acts":[],
                                     "domains":[],
                                     "ids":[],
                                     "speakers":[]}


        data_path = os.path.join(mwoz_path, "data.json")
        testListFile_path = os.path.join(mwoz_path, "testListFile.txt")

        with open(data_path, "r") as f:
            self.all_data = json.load(f)
            
        with open(testListFile_path, "r") as f:
            testfiles = f.read()
        testfiles = testfiles.split("\n")

        print("Processing mwoz...")
        for sample in tqdm(self.all_data):
            if sample in testfiles:
                if "MUL" not in sample:
                    self._get_acts_and_utterances_from_dialogue_log(data=self.single_domain_data,
                                                                    sample_id=sample)
                else:
                    self._get_acts_and_utterances_from_dialogue_log(data=self.multi_domain_data,
                                                                    sample_id=sample)
        
        if single_domain_only:
            self.dataset = self._build_dataset(data=self.single_domain_data,
                                dialog_history_limit=dialog_history_limit)

        else:
            self.dataset = self._build_dataset(data=self.multi_domain_data,
                                dialog_history_limit=dialog_history_limit)

        self.dataset = pd.DataFrame(self.dataset)

        if debug:
            for i in range(500):
                print(self.dataset["prompts"][i])
                print(self.dataset["golds"][i])
                print(self.dataset["ids"][i])
                print(self.dataset["domains"][i])
                print("=======")
            print(len(self.dataset["prompts"]))
            print(len(self.dataset["golds"]))
            print(len(self.dataset["ids"]))
            print(len(self.dataset["domains"]))
            # for i in range(10):
            #     if self.single_domain_data["speakers"][2*i] == "user":
            #         print(self.dataset[i])
            #         print(self.single_domain_data["utterances"][2*i])     
            #         print(self.single_domain_data["ids"][2*i])
            #         print(self.single_domain_data["domains"][2*i])
            #         print("=======")
            # print(len(self.dataset))
            # print(len(self.single_domain_data["utterances"]))
            # print(len(self.single_domain_data["speakers"]))
            # print(len(self.single_domain_data["golds"]))
            # print(len(self.single_domain_data["dialog_acts"]))
            # print(len(self.single_domain_data["ids"]))
            # print(len(self.single_domain_data["domains"]))
            # for i in range(10):
            #     if self.multi_domain_data["speakers"][2*i] == "user":
            #         print(self.dataset[i])
            #         print(self.multi_domain_data["utterances"][2*i])     
            #         print(self.multi_domain_data["ids"][2*i])
            #         print(self.multi_domain_data["domains"][2*i])
            #         print("=======")
            # print(len(self.dataset))
            # print(len(self.multi_domain_data["utterances"]))
            # print(len(self.multi_domain_data["speakers"]))
            # print(len(self.multi_domain_data["golds"]))
            # print(len(self.multi_domain_data["dialog_acts"]))
            # print(len(self.multi_domain_data["ids"]))
            # print(len(self.multi_domain_data["domains"]))

    def _get_acts_and_utterances_from_dialogue_log(self, data, sample_id):

        dialog_acts_data = data["dialog_acts"]
        utterances_data = data["utterances"]
        domains_data = data["domains"]
        ids_data = data["ids"]
        speakers_data = data["speakers"]
        dialogue_log = self.all_data[sample_id]["log"]
        dialogue_goal = self.all_data[sample_id]["goal"]
        domains = []

        for domain in self.domains:
            if dialogue_goal[domain]:
                domains.append(domain)
        
        for idx, turn in enumerate(dialogue_log):
            ids_data.append(sample_id)
            domains_data.append(domains)
            dialog_acts_data.append(turn["dialog_act"])
            if idx % 2 == 1:
                utterances_data.append("SYSTEM: " + turn["text"] +"\n")
                speakers_data.append("system")
            else:
                utterances_data.append("USER: " + turn["text"] +"\n")
                speakers_data.append("user")
                
    def _build_dataset(self, data, dialog_history_limit):

        dataset = {"prompts":[],
                    "golds":[],
                    "domains":[],
                    "ids":[]}

        dialog_history_memory = []
        dialog_history = ""
        dialog_acts = data["dialog_acts"]
        utterances = data["utterances"]
        ids = data["ids"]
        domains = data["domains"]
        current_id = ids[0]
        L = len(dialog_acts)

        for idx in tqdm(range(L)):
            new_id = ids[idx]
            dialog_act = dialog_acts[idx]
            utterance = utterances[idx]
            domain = domains[idx]

            speaker = utterance.split(":")[0]
            if speaker == "USER":
                if current_id != new_id:
                    #flush dialogue history when we change dialogue sample
                    dialog_history_memory = []
                    dialog_history = ""
                    current_id = new_id
                instruction_prompt, context_prompt = self._get_instruction_and_context_prompts(domain)
                cur_PROMPT = self.prompt_template.format(
                    instruction=instruction_prompt,
                    ontology=context_prompt,
                    dialogue=dialog_history+utterance
                )
                # cur_PROMPT = instruction_prompt + context_prompt + "Context:\n" + dialog_history + utterance
                dataset["prompts"].append(cur_PROMPT)
                dataset["golds"].append(dialog_act)
                dataset["ids"].append(current_id)
                dataset["domains"].append(domain)
                
            if dialog_history_limit != 0:
                if dialog_history_limit == -1:
                    dialog_history_limit = len(utterance)

                if len(dialog_history_memory) >= dialog_history_limit:
                    dialog_history_memory.pop(0)
                dialog_history_memory.append(utterance)
                dialog_history = ""
                for dialog_history_utterance in dialog_history_memory:
                    dialog_history += dialog_history_utterance

        return dataset
    
    def _get_instruction_and_context_prompts(self, domains):
        context_prompt = ""
        instruction_prompt = self.prompt_config[domains[0]]["instruction"] #Need to change if we change instruction per domain
        for domain in domains:
            context_prompt += self.prompt_config[domain]["context_prompt"]
        return instruction_prompt, context_prompt


def fix_typos(pred):
    pred = pred.replace("'", '"')
    pred = pred.replace('Catherine"s', "Catherine's")
    return pred

def unpack_belief_state(belief_state):
    domain = belief_state["domain"]
    act = belief_state["act"]
    bs = belief_state["belief_state"]
    slot_values = ""
    if bs:
        for k, v in bs.items():
            if v:
                v = str(v)
                slot_values += k + "-" + v + ", "
            else:
                slot_values += k + "-None, "
    else:
        slot_values += "None-None, "
    return domain.lower(), act.lower(), slot_values[:-2].lower() 

def process_preds_dst(results_df):
    preds = results_df["preds"]
    all_preds = []
    for pred in preds:
        pred = fix_typos(pred)
        belief_state = json.loads(pred)
        domain, act, slot_values = unpack_belief_state(belief_state)
        all_preds.append(domain+"-"+act+"||"+slot_values)
    return all_preds


def generate(model, prompt):
    if model in ["gpt-3.5-turbo", "gpt-4"]:
        completion = openai.ChatCompletion.create(
          model=model, 
          messages=[{"role": "system", "content": "You are a helpful assistant that interact as a Task-Oriented Dialogue System that is especially knowledgeable in doing dialogue state tracking"},
                    {"role": "user", "content": prompt}
                    ],
         temperature=0,
        )
        return completion["choices"][0]["message"]["content"], completion
    else:
        return "Only gpt-3.5 and gpt-4 currently"

def predict(dataset, model, save_path, nb_samples, save_every):
    dataset = dataset[:nb_samples]
    prompts = dataset["prompts"]
    golds = dataset["golds"]
    domains = dataset["domains"]
    ids = dataset["ids"]
    outputs = {"prompt":prompts, "preds":[], "golds":golds, "domains":domains, "dialogue_ids":ids, "completion_info":[], "model_used":[]}
    model_used = []
    preds = []
    completion_info = []
    temp_prompts = []
    
    for idx, prompt in tqdm(enumerate(prompts)):
        pred, completion = generate(model, prompt)
        model_used.append(model)
        preds.append(pred)
        completion_info.append(completion)
        #TO DELETE LATER
        temp_prompts.append(prompt)
        if idx % save_every == 0:
            temp_save_path = save_path[:-4] + "_latestSave.csv"
            temp_df = pd.DataFrame({"prompts":temp_prompts,
                                "preds":preds,
                                "completion_info":completion_info})
            temp_df.to_csv(temp_save_path)
    
    outputs["model_used"] = model_used
    outputs["preds"] = preds
    outputs["completion_info"] = completion_info
    df = pd.DataFrame(outputs)
    df.to_csv(save_path)
    return df

MWOZ_dataset_single = MWOZ_DST(config=CONFIG,
                        path_to_db="/home/willy/InstrucTOD/MultiWOZ_2.1/",
                        with_ontology=True,
                        max_values_per_slot=25,
                        mwoz_path="MultiWOZ_2.1/",
                        dialog_history_limit=0,
                        single_domain_only=True,
                        domains="all",
                        debug=False)
dataset_single = MWOZ_dataset_single.dataset

FileNotFoundError: [Errno 2] No such file or directory: '/home/willy/InstrucTOD/MultiWOZ_2.1/taxi_db.json'

In [9]:
dataset_single["golds"][0]

{'Taxi-Inform': [['Dest', 'pizza hut fen ditton'],
  ['Depart', "saint john 's college"]]}

In [22]:
mwoz_path = "/home/willy/InstrucTOD/MultiWOZ_2.1/"
data_path = os.path.join(mwoz_path, "data.json")
testListFile_path = os.path.join(mwoz_path, "testListFile.txt")
ontology_path = os.path.join(mwoz_path, "ontology.json")
system_acts_path =  os.path.join(mwoz_path, "system_acts.json")

with open(data_path, "r") as f:
    all_data = json.load(f)

with open(ontology_path, "r") as f:
    ontology = json.load(f)

with open(system_acts_path, "r") as f:
    system_acts = json.load(f)

with open(testListFile_path, "r") as f:
    testfiles = f.read()
testfiles = testfiles.split("\n")

In [35]:
testfiles

['MUL0484.json',
 'PMUL4462.json',
 'PMUL0320.json',
 'MUL2155.json',
 'PMUL0815.json',
 'PMUL3263.json',
 'PMUL3672.json',
 'SNG0423.json',
 'SNG0296.json',
 'PMUL0079.json',
 'PMUL1484.json',
 'SNG0840.json',
 'PMUL0089.json',
 'PMUL2859.json',
 'PMUL2009.json',
 'SNG0528.json',
 'SNG01367.json',
 'PMUL3858.json',
 'SNG1076.json',
 'PMUL2166.json',
 'PMUL2436.json',
 'MUL0225.json',
 'PMUL1966.json',
 'PMUL4239.json',
 'SNG01290.json',
 'SNG0888.json',
 'SNG1150.json',
 'MUL2646.json',
 'PMUL4247.json',
 'SNG0589.json',
 'MUL2089.json',
 'SNG01434.json',
 'MUL0237.json',
 'PMUL4125.json',
 'MUL1588.json',
 'MUL0432.json',
 'SNG0253.json',
 'PMUL4998.json',
 'PMUL1323.json',
 'SNG0580.json',
 'PMUL2719.json',
 'PMUL3224.json',
 'PMUL4440.json',
 'PMUL4840.json',
 'SNG0081.json',
 'SNG02172.json',
 'PMUL0550.json',
 'PMUL3558.json',
 'PMUL2275.json',
 'PMUL3600.json',
 'SNG0323.json',
 'MUL0323.json',
 'MUL1137.json',
 'MUL2525.json',
 'MUL0409.json',
 'SNG01359.json',
 'PMUL1259.json'

In [31]:
all_data["PMUL0089.json"]["log"][3]

{'text': 'No.  They all have 4 stars.',
 'metadata': {'taxi': {'book': {'booked': []},
   'semi': {'leaveAt': '',
    'destination': '',
    'departure': '',
    'arriveBy': ''}},
  'police': {'book': {'booked': []}, 'semi': {}},
  'restaurant': {'book': {'booked': [], 'people': '', 'day': '', 'time': ''},
   'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}},
  'bus': {'book': {'booked': [], 'people': ''},
   'semi': {'leaveAt': '',
    'destination': '',
    'day': '',
    'arriveBy': '',
    'departure': ''}},
  'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
  'hotel': {'book': {'booked': [], 'people': '', 'day': '', 'stay': ''},
   'semi': {'name': 'not mentioned',
    'area': 'east',
    'parking': 'not mentioned',
    'pricerange': 'cheap',
    'stars': '2',
    'internet': 'not mentioned',
    'type': 'guesthouse'}},
  'attraction': {'book': {'booked': []},
   'semi': {'type': '', 'name': '', 'area': ''}},
  'train': {'book': {'booked': [], 'peopl

In [19]:
all_data["PMUL0089.json"]["log"][2]

{'text': 'Does it have a star rating of 2?',
 'metadata': {},
 'dialog_act': {'Hotel-Inform': [['Stars', '2']]},
 'span_info': [['Hotel-Inform', 'Stars', '2', 7, 7]]}

In [29]:
logs = all_data["PMUL0089.json"]["log"]
for idx, turn in enumerate(logs):
    print(idx, turn)

0 {'text': 'Can you help me find a cheap place to stay in the east part of town?', 'metadata': {}, 'dialog_act': {'Hotel-Inform': [['Price', 'cheap'], ['Area', 'east']]}, 'span_info': [['Hotel-Inform', 'Price', 'cheap', 6, 6], ['Hotel-Inform', 'Area', 'east', 12, 12]]}
1 {'text': "Sure. There are three guesthouses there. I'd be happy to book one for you if you like. ", 'metadata': {'taxi': {'book': {'booked': []}, 'semi': {'leaveAt': '', 'destination': '', 'departure': '', 'arriveBy': ''}}, 'police': {'book': {'booked': []}, 'semi': {}}, 'restaurant': {'book': {'booked': [], 'people': '', 'day': '', 'time': ''}, 'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}}, 'bus': {'book': {'booked': [], 'people': ''}, 'semi': {'leaveAt': '', 'destination': '', 'day': '', 'arriveBy': '', 'departure': ''}}, 'hospital': {'book': {'booked': []}, 'semi': {'department': ''}}, 'hotel': {'book': {'booked': [], 'people': '', 'day': '', 'stay': ''}, 'semi': {'name': 'not mentioned', 'area': '

In [113]:
#NEW
CONFIG = {
    "openai_api_key": os.environ["OPENAI_API_KEY"], #Put your own there
    "openai_organization": os.environ["OPENAI_API_ORG"], #Put your own there (needed afaik if using gpt4)
    "INSTRUCTIONS":{"instruction_with_extracted_ontology":"""You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following knowledge base as grounding for acts, slots and values, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with 'domain', 'act' and 'belief_state' as primary keys, for example {'domain':domain, 'act':act, 'belief_state':{first slot: first value, second slot: second value, etc...}:""",
                   "instruction_with_slots":"""You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following SLOTS provided, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with the slot name as the primary key, and the retrieved value associated to the slot, for example {'slot1':'value1', 'slot2':'value2', etc...}:""",
                   "instruction_with_slots_recorrect":"""You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following SLOTS provided, another faulty system already generated the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. You should generate the new and correct belief state. The results should be in json format with the slot name as the primary key, and the retrieved value associated to the slot, for example {'slot1':'value1', 'slot2':'value2', etc...}:""",
                   "instruction_query_database":"""""", #TODO
                   "instruction_response_generation":"""""" #TODO
                   }, 
    "PROMPT_TEMPLATES":{"template_with_extracted_ontology":{"template": """{instruction}\n\nKNOWLEDGE BASE:\n{ontology}\n\nCONTEXT:\n{dialogue_context}""",
                                                           "input_variables": ["instruction", "ontology", "dialogue_context"]},
                       "template_with_slots":{"template": """{instruction}\n\nSLOTS:\n{slots}\n\nCONTEXT:\n{dialogue_context}""",
                                                           "input_variables": ["instruction", "slots", "dialogue_context"]},
                       "template_with_slots_recorrect":{"template": """{instruction}\n\nSLOTS:\n{slots}\n\nCONTEXT:\n{dialogue_context}\n\nBELIEF STATES:\n{belief_states}\n\n\n""",
                                                           "input_variables": ["instruction", "slots", "dialogue_context, belief_states"]},
                       "template_query_database":{"template": """{instruction}\n\nBELIEF STATES:\n{belief_states}\n\nSELECT * FROM""",
                                                           "input_variables": ["instruction", "belief_states"]},
                       "template_response_generation":{"template": """{instruction}\n\nDIALOGUE ACTS:\n{dialogue_acts}\n\nCONTEXT:\n{dialogue_context}""",
                                                           "input_variables": ["instruction", "dialogue_acts", "dialogue_context"]},
                      },
    "multiwoz21":{
                "requestable_slots" : {"taxi": ["car", "phone"],
                                    "police": ["postcode", "address", "phone"],
                                    "hospital": ["address", "phone", "postcode"],
                                    "hotel": ["address", "postcode", "internet", "phone", "parking", "type", "pricerange", "stars", "area", "reference"],
                                    "attraction": ["price", "type", "address", "postcode", "phone", "area", "reference"],
                                    "train": ["time", "leave", "price", "arrive", "id", "reference"],
                                    "restaurant": ["phone", "postcode", "address", "pricerange", "food", "area", "reference"]
                                    },
                "informable_slots" : {"taxi": ["leave", "destination", "departure", "arrive"],
                                    "police": [],
                                    "hospital": ["department"],
                                    "hotel": ["type", "parking", "pricerange", "internet", "stay", "day", "people", "area", "stars", "name"],
                                    "attraction": ["area", "type", "name"],
                                    "train": ["destination", "day", "arrive", "departure", "people", "leave"],
                                    "restaurant": ["food", "pricerange", "area", "name", "time", "day", "people"]
                                    },
                "all_requestable_slots":["car", "address", "postcode", "phone", "internet",  "parking", "type", "pricerange", "food",
                                "stars", "area", "reference", "time", "leave", "price", "arrive", "id"],
                "all_informable_slots":["type", "parking", "pricerange", "internet", "stay", "day", "people", "area", "stars", "name",
                                "leave", "destination", "departure", "arrive", "department", "food", "time"],
                }
}


class PromptConstructor():
    def __init__(self, 
                 config):
        self.config = config
        self.instructions = config["INSTRUCTIONS"]
        self.prompt_templates = config["PROMPT_TEMPLATES"]
        
    def _get_slots_from_domains(self, domains):
        if domains == "all":
            req_slots = ", ".join(self.config["multiwoz21"]["all_requestable_slots"])
            inf_slots = ", ".join(self.config["multiwoz21"]["all_informable_slots"])
        elif not isinstance(domains, list):
            raise ValueError("""Provided domain should be either 'all' or list of valid domain names:
                                - for multiwoz2.1 and 2.4: taxi, restaurant, hotel, train, attraction 
                                - for SGD: To-do""")
        else:
            req_slots = ""
            inf_slots = ""
            domain_req_slots = []
            domain_inf_slots = []
            for domain in domains:
                domain_req_slots += self.config["multiwoz21"]["requestable_slots"][domain]
                domain_inf_slots += self.config["multiwoz21"]["informable_slots"][domain]
            domain_req_slots = set(domain_req_slots)
            domain_inf_slots = set(domain_inf_slots)
            req_slots += ", ".join(domain_req_slots)
            inf_slots += ", ".join(domain_inf_slots)
            req_slots, inf_slots = req_slots, inf_slots

        slots_info = f"Requestable slots: {req_slots}\nInformable slots: {inf_slots}"
        return slots_info
    
    
    def _build_prompt(self, mode="", dialogue_context="", ontology="", slots="", dialogue_acts="", belief_states=""):
        prompt = ""
        if mode == "dst":
            instruction = self.instructions["instruction_with_slots"]
            template_variables = self.prompt_templates["template_with_slots"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                     slots=slots,
                                     dialogue_context=dialogue_context)
            
        elif mode == "dst_recorrect":
            instruction = self.instructions["instruction_with_slots_recorrect"]
            template = self.prompt_templates["template_with_slots_recorrect"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])            
            prompt = template.format(instruction=instruction,
                                    slots=slots,
                                    dialogue_context=dialogue_context,
                                    belief_states=belief_states)
            
        elif mode == "database_query":
            instruction = self.instructions["instruction_query_database"]
            template = self.prompt_templates["template_query_database"]
            template = PromptTemplate(input_variables= template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                    belief_states=belief_states)
            
        elif mode == "response_generation":
            instruction = self.instructions["instruction_response_generation"]
            template = self.prompt_templates["template_response_generation"]
            template = PromptTemplate(input_variables = template_variables["input_variables"],
                                      template = template_variables["template"])
            prompt = template.format(instruction=instruction,
                                    dialogue_acts=dialogue_acts,
                                    dialogue_context=dialogue_context)
        elif mode == "dst_extracted_ontology":
            pass

        else:
            raise ValueError("'mode' should be one of: [dst, dst_recorrect, database_query, response_generation]")
        
        return prompt


class MWOZ_Dataset(PromptConstructor):
    def __init__(self,
                 config,
                 mwoz_path,
                 dialog_history_limit):
        PromptConstructor.__init__(self, config)
        self.dataset = {"id":[],
                        "dialogue_id":[],
                        "dialogue_context":[],
                        "turn":[],
                        "prompt":[],
                        "domains":[],
                        "gold_bs":[],
                        "gold_act":[],
                        "gold_response":[],
                        "gold_database_result":[],
                        }
        self.all_data, self.testfiles = self._get_mwoz_data(mwoz_path)
        self.idx = 0
        self.dialog_history_limit = dialog_history_limit
        print("Processing mwoz...")
        for sample in tqdm(self.all_data):
            if sample in testfiles:
                dialogue_log = self.all_data[sample]["log"]
                self._process_dialogue_log(sample=sample,
                                           dialogue_log=dialogue_log)

        self.dataset = pd.DataFrame(self.dataset)

        for index, row in tqdm(self.dataset.iterrows()):
            if len(row["domains"]) != 1:
                self.dataset.drop(index, inplace=True)

    def _get_mwoz_data(self, mwoz_path):
        data_path = os.path.join(mwoz_path, "data.json")
        testListFile_path = os.path.join(mwoz_path, "testListFile.txt")

        with open(data_path, "r") as f:
            all_data = json.load(f)
            
        with open(testListFile_path, "r") as f:
            testfiles = f.read()
        testfiles = testfiles.split("\n")
        return all_data, testfiles
    
    def _process_dialogue_log(self, sample, dialogue_log):

        dialog_history_memory = []
        dialog_history = ""
        domains = self._get_domains_from_log(dialogue_log)
        slots = self._get_slots_from_domains(domains) # or all

        for turn_nb, turn in enumerate(dialogue_log):

            if turn_nb % 2 == 0:
                speaker = "USER"
            else:
                speaker = "SYSTEM"

            utterance = f"""{speaker}: {turn["text"]}\n"""
            dialogue_context = dialog_history + utterance
            dialog_act = turn["dialog_act"]
            prompt = self._build_prompt(mode="dst",
                                        slots=slots,
                                        dialogue_context=dialogue_context) 


            if self.dialog_history_limit != 0:
                if self.dialog_history_limit == -1:
                    self.dialog_history_limit = len(dialogue_log)

                if len(dialog_history_memory) >= self.dialog_history_limit:
                    dialog_history_memory.pop(0)
                dialog_history_memory.append(utterance)
                dialog_history = "".join(dialog_history_memory)

            self.idx += 1
            if turn_nb % 2 == 0:
                self.dataset["gold_bs"].append(dialog_act)
                self.dataset["dialogue_context"].append(dialogue_context)
                self.dataset["gold_database_result"].append(None) 
                self.dataset["turn"].append(turn_nb//2)
                self.dataset["domains"].append(domains)
                self.dataset["id"].append(self.idx//2)
                self.dataset["dialogue_id"].append(sample)
                self.dataset["prompt"].append(prompt)
            else:
                self.dataset["gold_response"].append(utterance)
                self.dataset["gold_act"].append(dialog_act)


    def _get_domains_from_log(self, dialogue_log):
        domains = []
        all_domains = ["restaurant", "taxi", "hotel", "train", "attraction"]
        for log in dialogue_log:
            for domain_act in log["dialog_act"]:
                domain = domain_act.split("-")[0].lower()
                if domain in all_domains and domain not in domains:
                    domains.append(domain)
        return domains
                




In [114]:
mwoz_path = "/home/willy/InstrucTOD/MultiWOZ_2.1/"
dialog_history_limit = 0
mwoz = MWOZ_Dataset(CONFIG, 
                    mwoz_path,
                    dialog_history_limit)

Processing mwoz...


100%|██████████| 10438/10438 [00:00<00:00, 13340.50it/s]
7372it [00:06, 1202.85it/s]


In [115]:
dataset = mwoz.dataset
for key in dataset:
    print(f"""{key}: {len(dataset[key])}""")

id: 1059
dialogue_id: 1059
dialogue_context: 1059
turn: 1059
prompt: 1059
domains: 1059
gold_bs: 1059
gold_act: 1059
gold_response: 1059
gold_database_result: 1059


In [118]:
dataset["prompt"][0]

"You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following SLOTS provided, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with the slot name as the primary key, and the retrieved value associated to the slot, for example {'slot1':'value1', 'slot2':'value2', etc...}:\n\nSLOTS:\nRequestable slots: phone, car\nInformable slots: departure, leave, arrive, destination\n\nCONTEXT:\nUSER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.\n"

In [116]:
for i in range(10):
    for key in dataset:
        print(f"""{key}: {dataset[key][i]}""")
    print("================")

id: 0
dialogue_id: SNG0073.json
dialogue_context: USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.

turn: 0
prompt: You are a task-oriented dialogue system focusing on doing Dialogue State Tracking. Using the following SLOTS provided, generate the belief state of the last dialogue turn in the following conversation between a USER and a SYSTEM in a task-oriented dialogue setting. The results should be in json format with the slot name as the primary key, and the retrieved value associated to the slot, for example {'slot1':'value1', 'slot2':'value2', etc...}:

SLOTS:
Requestable slots: phone, car
Informable slots: departure, leave, arrive, destination

CONTEXT:
USER: I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.

domains: ['taxi']
gold_bs: {'Taxi-Inform': [['Dest', 'pizza hut fen ditton'], ['Depart', "saint john 's college"]]}
gold_act: {'Taxi-Request': [['Leave', '?'], ['Arrive', '?']]}
gold_response: SYSTEM: What time do you want to 

KeyError: 4

In [79]:
logs = all_data["PMUL0089.json"]["log"]
logs

[{'text': 'Can you help me find a cheap place to stay in the east part of town?',
  'metadata': {},
  'dialog_act': {'Hotel-Inform': [['Price', 'cheap'], ['Area', 'east']]},
  'span_info': [['Hotel-Inform', 'Price', 'cheap', 6, 6],
   ['Hotel-Inform', 'Area', 'east', 12, 12]]},
 {'text': "Sure. There are three guesthouses there. I'd be happy to book one for you if you like. ",
  'metadata': {'taxi': {'book': {'booked': []},
    'semi': {'leaveAt': '',
     'destination': '',
     'departure': '',
     'arriveBy': ''}},
   'police': {'book': {'booked': []}, 'semi': {}},
   'restaurant': {'book': {'booked': [], 'people': '', 'day': '', 'time': ''},
    'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}},
   'bus': {'book': {'booked': [], 'people': ''},
    'semi': {'leaveAt': '',
     'destination': '',
     'day': '',
     'arriveBy': '',
     'departure': ''}},
   'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
   'hotel': {'book': {'booked': [], 'people':

In [104]:
print(len(dataset))
copy = dataset.copy(deep=True)
copy.drop(copy[len(copy["domains"]) != 1].index, inplace=True)

7372


KeyError: True

In [112]:
copy[copy["domains"].str.contains(",")]

KeyError: "None of [Float64Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n              ...\n              nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],\n             dtype='float64', length=7372)] are in the [columns]"

In [121]:
dataset

Unnamed: 0,id,dialogue_id,dialogue_context,turn,prompt,domains,gold_bs,gold_act,gold_response,gold_database_result
0,0,SNG0073.json,USER: I would like a taxi from Saint John's co...,0,You are a task-oriented dialogue system focusi...,[taxi],"{'Taxi-Inform': [['Dest', 'pizza hut fen ditto...","{'Taxi-Request': [['Leave', '?'], ['Arrive', '...",SYSTEM: What time do you want to leave and wha...,
1,1,SNG0073.json,USER: I want to leave after 17:15.\n,1,You are a task-oriented dialogue system focusi...,[taxi],"{'Taxi-Inform': [['Leave', '17:15']]}","{'Taxi-Inform': [['Car', 'blue honda'], ['Phon...",SYSTEM: \nBooking completed! your taxi will be...,
2,2,SNG0073.json,USER: Thank you for all the help! I appreciate...,2,You are a task-oriented dialogue system focusi...,[taxi],"{'general-thank': [['none', 'none']]}","{'general-reqmore': [['none', 'none']]}",SYSTEM: You are welcome. Is there anything el...,
3,3,SNG0073.json,"USER: No, I am all set. Have a nice day. Bye.\n",3,You are a task-oriented dialogue system focusi...,[taxi],"{'general-bye': [['none', 'none']]}","{'general-bye': [['none', 'none']]}",SYSTEM: you too! thank you\n,
79,79,SNG01608.json,USER: Are there any Portuguese restaurants in ...,0,You are a task-oriented dialogue system focusi...,[restaurant],"{'Restaurant-Inform': [['Food', 'portuguese']]}","{'Restaurant-Inform': [['Food', 'Portuguese'],...",SYSTEM: Yes there is a Portuguese restaurant i...,
...,...,...,...,...,...,...,...,...,...,...
7337,7337,SNG02319.json,"USER: Thank you, good bye.\n",4,You are a task-oriented dialogue system focusi...,[taxi],"{'general-bye': [['none', 'none']]}","{'general-greet': [['none', 'none']]}",SYSTEM: Thank you for using Cambridge TownInfo...,
7338,7338,SNG0061.json,USER: I would like to book a taxi to leave aft...,0,You are a task-oriented dialogue system focusi...,[taxi],"{'Taxi-Inform': [['Leave', '11:00'], ['Dest', ...","{'Taxi-Request': [['Depart', '?']]}",SYSTEM: Where are you departing from?\n,
7339,7339,SNG0061.json,USER: from the gardenia\n,1,You are a task-oriented dialogue system focusi...,[taxi],"{'Taxi-Inform': [['Depart', 'the gardenia']]}","{'Taxi-Inform': [['Car', 'red skoda'], ['Phone...","SYSTEM: All right, a red skoda will pick you u...",
7340,7340,SNG0061.json,USER: Great! Thank you for your help! \n,2,You are a task-oriented dialogue system focusi...,[taxi],"{'general-thank': [['none', 'none']]}","{'general-reqmore': [['none', 'none']]}",SYSTEM: You're welcome! What else can I do for...,


In [4]:
data = json.load(open("/home/willy/instructod/MultiWOZ_2.1/data.json", "r"))

In [15]:
data["SNG01856.json"]["log"][3]

{'text': 'I found 1 cheap hotel for you that includes parking. Do you like me to book it?',
 'metadata': {'taxi': {'book': {'booked': []},
   'semi': {'leaveAt': '',
    'destination': '',
    'departure': '',
    'arriveBy': ''}},
  'police': {'book': {'booked': []}, 'semi': {}},
  'restaurant': {'book': {'booked': [], 'time': '', 'day': '', 'people': ''},
   'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}},
  'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
  'hotel': {'book': {'booked': [], 'stay': '', 'day': '', 'people': ''},
   'semi': {'name': 'not mentioned',
    'area': 'not mentioned',
    'parking': 'yes',
    'pricerange': 'cheap',
    'stars': 'not mentioned',
    'internet': 'not mentioned',
    'type': 'hotel'}},
  'attraction': {'book': {'booked': []},
   'semi': {'type': '', 'name': '', 'area': ''}},
  'train': {'book': {'booked': [], 'people': ''},
   'semi': {'leaveAt': '',
    'destination': '',
    'day': '',
    'arriveBy': '',
    