In [14]:
import os
import json
import openai
import pandas as pd
from transformers.models.t5.tokenization_t5_fast import T5TokenizerFast

# SGD loader

Loader to parse each user turn of the test set of SGD into:
- dialogue history
- gold belief state
- gold response
- domain of the given turn
- domains of the dialogue

Please see the __Viewing__ section below for more details.

The implementation follows [this repo](https://github.com/cuthalionn/Prompter/tree/main), in particular:
- [read_SGD](https://github.com/cuthalionn/Prompter/blob/f240296c73a0429cf52c1268c7fd2139680370d6/src/prefix_data_loader.py#L78)
- [download_sgd.sh](https://github.com/cuthalionn/Prompter/blob/main/download_sgd.sh)
- [T5 tokenizer used in the repo](https://github.com/cuthalionn/Prompter/blob/main/src/config.py)

__Run `download_sgd.sh` before running this notebook.__

In [1]:
!bash download_sgd.sh

mkdir: cannot create directory ‘data’: File exists
fatal: destination path 'dstc8-schema-guided-dialogue' already exists and is not an empty directory.


In [13]:
def completion(model_args, prompt):
    """Query an OpenAI model with ``prompt`` and return the first choice's text.

    Model names containing "gpt-3.5-turbo" or "gpt-4" are sent through
    ``openai.ChatCompletion.create`` as a single user message with
    ``temperature=0``. Every other name is sent through
    ``openai.Completion.create`` with no sampling parameters.

    Args:
        model_args: object exposing ``model_name_or_path``; any "openai/"
            substring is removed before the name is passed to the API.
        prompt: text to send to the model.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped.
    """
    configured_name = model_args.model_name_or_path
    api_model = configured_name.replace("openai/", "")

    if "gpt-3.5-turbo" not in configured_name and "gpt-4" not in configured_name:
        # Non-chat models: plain completion call.
        result = openai.Completion.create(model=api_model, prompt=prompt)
        return result.choices[0].text.strip()

    # Chat models: the whole prompt is sent as one user message.
    result = openai.ChatCompletion.create(
        model=api_model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return result.choices[0].message.content.strip()

### Helper functions

In [2]:
def adjust_sgd_questions(schema):
    """Overwrite the question text of selected slots with hand-written wording.

    ``schema`` maps service name -> slot name -> ``(question, possible_values)``.
    For each service listed below that is present in ``schema``, the question
    of every listed slot is replaced while the second tuple element is kept.
    A listed slot missing from a present service raises ``KeyError``.

    Args:
        schema: nested dict as described above; it is modified in place.

    Returns:
        The same ``schema`` object.
    """
    rewritten_questions = {
        "Hotels_2": {
            "where_to": "which city are user planning to stay in?",
            "has_laundry_service": "whether the house has laundry service?",
        },
        "Hotels_4": {
            "location": "what is the city or town where the hotel is located?",
            "star_rating": "what is the star rating of the hotel?",
            "place_name": "what is the name of the hotel?",
        },
        "Media_3": {
            "genre": "what type of the movie does user prefer?",
            "starring": "who is the actor in this movie?",
        },
        "Services_4": {
            "city": "what is the city or area where user wants to search for a therapist?",
        },
        "Music_3": {
            "artist": "what is the name of the artist?",
            "album": "what is the album of the song?",
        },
    }

    for service_name, slot_questions in rewritten_questions.items():
        if service_name not in schema:
            continue
        service_slots = schema[service_name]
        for slot_name, question in slot_questions.items():
            # Keep the existing possible-values entry, swap only the question.
            service_slots[slot_name] = (question, service_slots[slot_name][1])
    return schema

# preprocess SGD
def read_SGD(args, path_name, tokenizer, dataset=None):
    """Flatten one SGD split into one record per (user turn, frame, schema slot).

    Reads every ``dialogues_*`` file and ``schema.json`` under
    ``<path_name>/<dataset>``, turns each kept slot description into a
    question, and emits a record for every kept slot of every frame of every
    user turn. Slots without a value in the frame state get the value "none".

    Args:
        args: not used by this function.
        path_name: directory containing one sub-directory per split.
        tokenizer: object exposing ``eos_token``, which is appended to each
            target text.
        dataset: name of the split sub-directory (e.g. "test"). It is joined
            onto ``path_name``, so the default ``None`` is not usable.

    Returns:
        ``(p_data, global_tokens)``. ``p_data`` is a list of dicts with keys
        ID, domains, domain, turn_id, dialog_history, output_text,
        turn_belief, slot_text, value_text, slot_domain and slot_description.
        ``global_tokens`` is ``[]`` unless ``dataset == "train"``.
    """
    split_dir = os.path.join(path_name, dataset)

    # Collect the dialogues of the split. os.listdir gives no ordering
    # guarantee, so the order of the produced records follows the filesystem.
    all_data = []
    for filename in os.listdir(split_dir):
        if filename.startswith("dialogues_"):
            # Explicit encoding so reading does not depend on the platform default.
            with open(os.path.join(split_dir, filename), encoding="utf-8") as f:
                all_data += json.load(f)

    global_tokens = []
    if dataset == "train":
        # NOTE(review): find_global_tokens_SGD is not defined in the visible
        # part of this notebook; confirm it exists before loading "train".
        global_tokens = find_global_tokens_SGD(all_data)

    with open(os.path.join(split_dir, "schema.json"), encoding="utf-8") as f:
        services = json.load(f)

    # Descriptions already containing one of these words are treated as
    # questions and only get a trailing "?".
    check_list = ["what", "how", "whether", "which"]
    schema = {}
    for service in services:
        service_schema = schema[service["service_name"]] = {}

        # Only slots that some intent lists as required or optional are kept.
        used_slots = set()
        for intent in service["intents"]:
            used_slots.update(intent["required_slots"])
            used_slots.update(intent["optional_slots"].keys())

        for slot in service["slots"]:
            if slot["name"] not in used_slots:
                continue
            description = slot["description"].lower()
            if any(c_l in description for c_l in check_list):
                description = f"{description}?"
            else:
                description = f"what is the {description}?"
            service_schema[slot["name"]] = (description, slot["possible_values"])

    schema = adjust_sgd_questions(schema)

    p_data = []
    for dial in all_data:
        # History accumulates over the dialogue and is snapshotted per record.
        dialog_history = ""

        for idx, turn in enumerate(dial["turns"]):
            utterance = fix_number(turn["utterance"])

            if turn["speaker"] == "USER":
                # The user opens the conversation, so user turns sit at even indices.
                assert idx % 2 == 0
                turn_belief_list = generate_belief_list(turn)
                dialog_history += " User: " + utterance

                for frame in turn["frames"]:
                    service_name = frame["service"]
                    for slot_name in schema[service_name]:
                        # First listed value of the slot, or "none" when unset.
                        value_text = frame["state"]["slot_values"].get(slot_name, ['none'])[0]
                        p_data.append({
                            "ID": dial["dialogue_id"],
                            "domains": dial["services"],
                            "domain": service_name,
                            "turn_id": idx,
                            "dialog_history": dialog_history,
                            "output_text": value_text + f" {tokenizer.eos_token}",
                            "turn_belief": turn_belief_list,
                            "slot_text": slot_name,
                            "value_text": value_text,
                            "slot_domain": service_name,
                            "slot_description": schema[service_name][slot_name][0],
                        })
            else:
                # System turns sit at odd indices and are tagged "Speaker:".
                assert idx % 2 == 1
                dialog_history += " Speaker: " + utterance

    return p_data, global_tokens

def fix_number(text):
    """Replace space-delimited number words "one" .. "twelve" with digits.

    A word is replaced only when it is an entire token between single spaces
    (or the start/end of the string) and is written in lower case; tokens
    such as "someone", "Two" or "one," are left untouched. All other
    characters, including repeated spaces, are preserved.

    Processing token by token also converts adjacent repeats ("one one" ->
    "1 1"), which a chained ``str.replace(" one ", " 1 ")`` misses because
    the two matches would share the space between them.

    Args:
        text: the utterance to normalise.

    Returns:
        The utterance with matching number words replaced by digits.
    """
    number_mapper = {"one": "1", "two": "2", "three":"3", "four":"4", "five":"5", "six":"6", "seven":"7", "eight":"8", "nine":"9", "ten":"10", "eleven":"11", "twelve":"12"}
    # Splitting on a literal space (not arbitrary whitespace) keeps the
    # original spacing intact when the tokens are joined back.
    return " ".join(number_mapper.get(token, token) for token in text.split(" "))

def generate_belief_list(turn):
    """Return the belief state of ``turn`` as "service-slot-value" strings.

    Every frame of the turn contributes one string per entry of its
    ``state["slot_values"]``; only the first listed value of each slot is used.

    Args:
        turn: dict with a "frames" list; each frame has "service" and
            ``state["slot_values"]`` mapping slot name -> list of values.

    Returns:
        List of strings in frame order, then slot order.
    """
    return [
        "-".join([service, slot_name, values[0]])
        for frame in turn["frames"]
        # Single-element loop binds the service name once per frame.
        for service in [frame["service"]]
        for slot_name, values in frame["state"]["slot_values"].items()
    ]

## Loading SGD

In [3]:
# T5 fast tokenizer for "t5-small" with custom bos/eos/sep token strings.
# read_SGD appends tokenizer.eos_token ("[eos]") to every target text.
tokenizer = T5TokenizerFast.from_pretrained("t5-small",
                                            bos_token="[bos]",
                                            eos_token="[eos]",
                                            sep_token="[sep]")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Root of the SGD data; read_SGD looks for <path>/<split>/dialogues_* and
# <path>/<split>/schema.json.
path = 'data/dstc8-schema-guided-dialogue'

# data_train, global_tokens = read_SGD(args = None, path_name = path, tokenizer = tokenizer, dataset = "train")
# data_dev,_ = read_SGD(args = None, path_name = path, tokenizer = tokenizer, dataset = "dev")
# Only the test split is loaded. The second return value (global tokens) is
# an empty list for every split other than "train", so it is discarded.
data_test,_ = read_SGD(args = None, path_name = path, tokenizer = tokenizer, dataset = "test")

In [5]:
def get_descriptions(schema_file):
    """Map every "<service_name>-<slot_name>" of a schema file to its description.

    All slots listed under each service's "slots" are included.

    Args:
        schema_file: path to a JSON file holding a list of services, each with
            "service_name" and a "slots" list of dicts carrying "name" and
            "description".

    Returns:
        Dict keyed by "<service_name>-<slot_name>" with the slot description
        as value, in file order.
    """
    # The context manager closes the handle; explicit encoding keeps the
    # read independent of the platform default.
    with open(schema_file, encoding="utf-8") as f:
        schemas = json.load(f)

    descriptions = {}
    for service in schemas:
        service_name = service["service_name"]
        for slot in service["slots"]:
            descriptions["-".join([service_name, slot["name"]])] = slot["description"]
    return descriptions

# Every "<service_name>-<slot_name>" key of the test schema, in file order.
# get_descriptions keeps all slots of each service, whereas read_SGD keeps
# only slots referenced by an intent.
ALL_SLOTS = list(get_descriptions(os.path.join(path,"test","schema.json")).keys())

## Viewing

Loader to parse each user turn of the test set of SGD into: <br>
- [x] dialogue history (__dialog_history__)
- [x] gold belief state (__turn_belief__)
- [x] gold target output (__output_text__): the gold slot value followed by the EOS token
- [x] domain of the given turn (__domain__)
- [x] domains of the dialogue (__domains__)

In [6]:
import pandas as pd

In [7]:
data_test[7]

{'ID': '27_00000',
 'domains': ['Trains_1', 'Events_3', 'Travel_1', 'Hotels_2'],
 'domain': 'Trains_1',
 'turn_id': 2,
 'dialog_history': ' User: I want to find a train. Speaker: What city do you want to leave from and when? Where are you going? User: I want to find tickets to Seattle, WA from Portland next Wednesday.',
 'output_text': 'Portland [eos]',
 'turn_belief': ['Trains_1-date_of_journey-next Wednesday',
  'Trains_1-from-Portland',
  'Trains_1-to-Seattle, WA'],
 'slot_text': 'from',
 'value_text': 'Portland',
 'slot_domain': 'Trains_1',
 'slot_description': 'what is the starting city for train journey?'}

In [22]:
df = pd.DataFrame(data_test)

In [8]:
pd.DataFrame(data_test).iloc[5:10]

Unnamed: 0,ID,domains,domain,turn_id,dialog_history,output_text,turn_belief,slot_text,value_text,slot_domain,slot_description
5,27_00000,"[Trains_1, Events_3, Travel_1, Hotels_2]",Trains_1,0,User: I want to find a train.,none [eos],[],class,none,Trains_1,what is the fare class for train reservation?
6,27_00000,"[Trains_1, Events_3, Travel_1, Hotels_2]",Trains_1,0,User: I want to find a train.,none [eos],[],trip_protection,none,Trains_1,"whether to add trip protection to reservation,..."
7,27_00000,"[Trains_1, Events_3, Travel_1, Hotels_2]",Trains_1,2,User: I want to find a train. Speaker: What c...,Portland [eos],"[Trains_1-date_of_journey-next Wednesday, Trai...",from,Portland,Trains_1,what is the starting city for train journey?
8,27_00000,"[Trains_1, Events_3, Travel_1, Hotels_2]",Trains_1,2,User: I want to find a train. Speaker: What c...,"Seattle, WA [eos]","[Trains_1-date_of_journey-next Wednesday, Trai...",to,"Seattle, WA",Trains_1,what is the ending city for train journey?
9,27_00000,"[Trains_1, Events_3, Travel_1, Hotels_2]",Trains_1,2,User: I want to find a train. Speaker: What c...,next Wednesday [eos],"[Trains_1-date_of_journey-next Wednesday, Trai...",date_of_journey,next Wednesday,Trains_1,what is the date of train journey?


In [9]:
len(pd.DataFrame(data_test)["ID"].unique())

4201

In [10]:
# Preview the first 20 records: the dialogue context is printed once per
# (dialogue, turn) pair, followed by one "slot value" line per record.
count = 0
prev_id = ""
for count, sample in enumerate(data_test, start=1):
    cur_id = sample["ID"] + str(sample["turn_id"])

    # A changed key means the records of a new user turn start here.
    if prev_id != cur_id:
        print("context", sample["dialog_history"])
    print(sample["slot_text"], sample["value_text"])
    prev_id = cur_id

    if count == 20:
        break

context  User: I want to find a train.
from none
to none
date_of_journey none
journey_start_time none
number_of_adults none
class none
trip_protection none
context  User: I want to find a train. Speaker: What city do you want to leave from and when? Where are you going? User: I want to find tickets to Seattle, WA from Portland next Wednesday.
from Portland
to Seattle, WA
date_of_journey next Wednesday
journey_start_time none
number_of_adults none
class none
trip_protection none
context  User: I want to find a train. Speaker: What city do you want to leave from and when? Where are you going? User: I want to find tickets to Seattle, WA from Portland next Wednesday. Speaker: I found 9 trains. There is a train departing at 9:10 am for $61. User: Sounds great. I want to find something cool to do that day.
event_type none
event_name none
date none
number_of_tickets none
city Seattle, WA
from Portland


In [11]:
data_test[0]

{'ID': '27_00000',
 'domains': ['Trains_1', 'Events_3', 'Travel_1', 'Hotels_2'],
 'domain': 'Trains_1',
 'turn_id': 0,
 'dialog_history': ' User: I want to find a train.',
 'output_text': 'none [eos]',
 'turn_belief': [],
 'slot_text': 'from',
 'value_text': 'none',
 'slot_domain': 'Trains_1',
 'slot_description': 'what is the starting city for train journey?'}

In [16]:
len(data_test)

267684

In [23]:
df["ID"].nunique()

4201

In [27]:
df["slot_text"].unique()

array(['from', 'to', 'date_of_journey', 'journey_start_time',
       'number_of_adults', 'class', 'trip_protection', 'event_type',
       'event_name', 'date', 'number_of_tickets', 'city', 'location',
       'category', 'free_entry', 'good_for_kids', 'where_to',
       'check_in_date', 'check_out_date', 'rating', 'has_laundry_service',
       'show_type', 'theater_name', 'show_time', 'show_date', 'genre',
       'movie_name', 'title', 'subtitle_language', 'starring',
       'from_city', 'to_city', 'departure_date', 'departure_time',
       'additional_luggage', 'num_passengers', 'number_of_rooms',
       'stay_length', 'star_rating', 'place_name', 'smoking_allowed',
       'cast', 'directed_by', 'restaurant_name', 'time',
       'has_seating_outdoors', 'has_vegetarian_options',
       'number_of_seats', 'price_range', 'new_alarm_time',
       'new_alarm_name', 'car_type', 'pickup_location', 'start_date',
       'pickup_time', 'end_date', 'add_insurance', 'contact_name',
       'seating

In [25]:
df["domain"].nunique()

21