In [1]:
import pandas as pd
import itertools
from six.moves import cPickle

## Load the frames dataset

In [2]:
# Read the frames dataset from its JSON file into a DataFrame.
data = pd.read_json("frames.json")

# Print the first record as a quick look at the loaded columns.
first_record = data.iloc[0]
print(first_record)

user_id                                              U22HTHYNP
turns        [{'text': 'I'd like to book a trip to Atlantis...
wizard_id                                            U21DKG18C
id                        e2c0fc6c-2134-4891-8353-ef16d8412c9a
labels       {'userSurveyRating': 4.0, 'wizardSurveyTaskSuc...
Name: 0, dtype: object


## Grab all the turns to form one chat list

In [3]:
def _turn_texts(turns):
    """Return the 'text' value of every turn, preserving turn order."""
    return [turn['text'] for turn in turns]


# Reduce each row's list of turn records to a plain list of utterance strings.
data['chat'] = data['turns'].apply(_turn_texts)
print(data['chat'].iloc[0])

["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.", 'Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?', 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.', 'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?', 'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?', 'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?', "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help"]


## Grab the user's utterances and the bot's utterances

In [4]:
# Even positions (0, 2, 4, ...) of each chat become 'user' and odd positions
# (1, 3, 5, ...) become 'bot'.
# NOTE(review): this assumes the two speakers strictly alternate and that the
# user speaks first — confirm against the dataset.
chats = data['chat']
data['user'] = chats.apply(lambda utterances: utterances[::2])
data['bot'] = chats.apply(lambda utterances: utterances[1::2])
print(data['user'].iloc[0], data['bot'].iloc[0])

["I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.", 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.', 'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?', "I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help"] ['Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?', 'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?', 'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?']


## Zip the data to contain (User utterance, Bot utterance)
### And pad with an empty string when the lengths mismatch

In [5]:
def _pair_row(row):
    """Pair the row's user and bot utterances position by position.

    When one side has fewer utterances than the other, the missing slots
    are filled with the empty string.
    """
    paired = itertools.zip_longest(row['user'], row['bot'], fillvalue='')
    return list(paired)


# One list of (user utterance, bot utterance) tuples per row of `data`.
dataset = data.apply(_pair_row, axis=1)
print(dataset.iloc[0])

[("I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.", 'Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?'), ('Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.', 'I checked the availability for this date and there were no trips available.  Would you like to select some alternate dates?'), ('I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How about that?', 'I checked the availability for that date and there were no trips available.  Would you like to select some alternate dates?'), ("I suppose I'll speak with my husband to see if we can choose other dates, and then I'll come back to you.Thanks for your help", '')]


## Create one full dataset of all the (User utterance, Bot utterance) pairs

In [6]:
# Flatten the per-row lists of (user utterance, bot utterance) pairs into one
# flat list. chain.from_iterable does this directly, instead of calling
# Series.apply purely for its side effect and binding the throwaway result
# to `_` (the name IPython also uses for the last cell output).
all_convos = list(itertools.chain.from_iterable(dataset))

## Save the data using cPickle

In [7]:
# Write inside a context manager so the file handle is flushed and closed as
# soon as the dump finishes, rather than whenever it is garbage-collected.
with open("all_convos.pkl", "wb") as pickle_file:
    cPickle.dump(all_convos, pickle_file)