# OC-IA-P10 DATA EXPLORATION

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import json

In [3]:
frames = pd.read_json('frames/frames.json')
frames['turns']


0       [{'text': 'I'd like to book a trip to Atlantis...
1       [{'text': 'Hello, I am looking to book a vacat...
2       [{'text': 'Hello there i am looking to go on a...
3       [{'text': 'Hi I'd like to go to Caprica from B...
4       [{'text': 'Hello, I am looking to book a trip ...
                              ...                        
1364    [{'text': 'Hi I've got 9 days free and I'm loo...
1365    [{'text': 'I need to get to Fortaleza on Septe...
1366    [{'text': 'We're finally going on vacation isn...
1367    [{'text': 'Hi there, I'm looking for a place t...
1368    [{'text': 'I need to book a trip for the whole...
Name: turns, Length: 1369, dtype: object

Let's have a look at a conversation. For ease of reading the values are sent to a json file.

In [4]:
with open('sample.json', "w") as f:
    json.dump(frames.loc[0]['turns'], f, indent=4)

We'll use only user data (not wizard's answers), and among them we'll use the text and the acts_without_refs values. Let's have a look at the values we can find inside:

In [5]:
all_entities = set()
for conversation in tqdm(frames['turns'][:1]):
    for turn in conversation:
        for act in turn['labels']['acts']:
            for arg in act['args']:
                all_entities.add(arg['key'])
       
all_entities

  0%|          | 0/1 [00:00<?, ?it/s]

{'budget',
 'dst_city',
 'end_date',
 'flex',
 'intent',
 'n_adults',
 'or_city',
 'ref',
 'str_date'}


We'll specifically stick to the values we want to process in this MVP:
- departure city ('or_city')
- arrival city ('dest_city')
- departure date ('str_date')
- end date ('end_date')
- budget ('budget')

In [6]:
entities = [
    'budget',
    'or_city',
    'dst_city',
    'str_date',
    'end_date',
    'intent',
 ]


Example of json file format for training LUIS:

```Python
[
  {
    "text": "order a pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "order a large pepperoni pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 14,
        "endCharIndex": 28
      },
      {
        "entityName": "Size",
        "startCharIndex": 8,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "I want two large pepperoni pizzas on thin crust",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 17,
        "endCharIndex": 32
      },
      {
        "entityName": "Size",
        "startCharIndex": 11,
        "endCharIndex": 15
      },
      {
        "entityName": "Quantity",
        "startCharIndex": 7,
        "endCharIndex": 9
      },
      {
        "entityName": "Crust",
        "startCharIndex": 37,
        "endCharIndex": 46
      }
    ]
  }
]
```

Let's extract data into the required json format:

In [7]:
def extract_data(turn):
    """
    Return dict of entities from turn.
    """

    data = dict.fromkeys(['text', 'intentName', 'entityLabels'])
    data['text'] = turn['text']
    intent_name = 'None'
    entity_labels = []
    for act in turn['labels']['acts_without_refs']:

        if 'name' in act:
            intent_name = act['name']

        for item in act['args']:
            entity_name = item['key']
            if entity_name in entities:
                entity_value = item['val']
                if entity_value and entity_value in turn['text']:
                    start_char_index = turn['text'].index(entity_value)
                    end_char_index = start_char_index + len(entity_value)
                    entity_labels.append({'entityName': entity_name, 
                                        'startCharIndex': start_char_index, 
                                        'endCharIndex': end_char_index})
    data['intentName'] = intent_name
    data['entityLabels'] = entity_labels
    return data

In [8]:
extract_data(frames['turns'][0][0])

{'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'intentName': 'inform',
 'entityLabels': [{'entityName': 'intent',
   'startCharIndex': 12,
   'endCharIndex': 16},
  {'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35},
  {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
  {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77},
  {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121}]}

In [9]:
all_data = []
for conversation in tqdm(frames['turns']):
    for turn in conversation:
        if turn['author']=='user':
            data = extract_data(turn)
            all_data.append(data)

  0%|          | 0/1369 [00:00<?, ?it/s]

Let's have a look at the result:

In [10]:
all_data[:5]

[{'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'intentName': 'inform',
  'entityLabels': [{'entityName': 'intent',
    'startCharIndex': 12,
    'endCharIndex': 16},
   {'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35},
   {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
   {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77},
   {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121}]},
 {'text': 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
  'intentName': 'inform',
  'entityLabels': [{'entityName': 'dst_city',
    'startCharIndex': 24,
    'endCharIndex': 33},
   {'entityName': 'budget', 'startCharIndex': 114, 'endCharIndex': 118},
   {'entityName': 'or_city', 'startCharIndex': 39, 'endCharIndex': 46},
   {'entityName': 'str_date', 'startCharIndex': 5

What are the different intentName values?

In [11]:
intents = set()
for data in all_data:
    intents.add(data['intentName'])
intents    

{'None',
 'affirm',
 'confirm',
 'goodbye',
 'greeting',
 'inform',
 'moreinfo',
 'negate',
 'request',
 'request_alts',
 'request_compare',
 'switch_frame',
 'thankyou'}

Let's have a look at examples:

In [12]:
sample_counts = dict.fromkeys(intents, 0)
for data in all_data[100:300]:
    intent = data['intentName']
    if sample_counts[intent] < 2:
        print('\nINTENT:', intent)
        print('TEXT:', data['text'])
        print('NB OF ENTITY LABELS:', len(data['entityLabels']))
        sample_counts[intent] += 1
        




INTENT: inform
TEXT: What about Neverland on the same budget?
NB OF ENTITY LABELS: 1

INTENT: inform
TEXT: No I cannot do another location. What if I increased my budget by $400?
NB OF ENTITY LABELS: 1

INTENT: thankyou
TEXT: Ok thanks.
NB OF ENTITY LABELS: 0

INTENT: switch_frame
TEXT: what is the hotel like?
NB OF ENTITY LABELS: 0

INTENT: request
TEXT: what type of flight is that going to be?
NB OF ENTITY LABELS: 0

INTENT: thankyou
TEXT: Thank you
NB OF ENTITY LABELS: 0

INTENT: switch_frame
TEXT: Do you have any information on what activities are included in the package?
NB OF ENTITY LABELS: 0

INTENT: request
TEXT: Is this a direct flight to Kingston or do we have to make a transfer? Are the tickets business class?
NB OF ENTITY LABELS: 1

INTENT: request_compare
TEXT: How many stars do the hotels have and what's included with the hotel?
NB OF ENTITY LABELS: 0

INTENT: request_alts
TEXT: I prefer 3 stars and above. I'm flexible with my budget, is there anything else in Columbo?
N

In [13]:
all_data_df = pd.DataFrame(all_data)
all_data_df

Unnamed: 0,text,intentName,entityLabels
0,I'd like to book a trip to Atlantis from Capri...,inform,"[{'entityName': 'intent', 'startCharIndex': 12..."
1,"Yes, how about going to Neverland from Caprica...",inform,"[{'entityName': 'dst_city', 'startCharIndex': ..."
2,I have no flexibility for dates... but I can l...,negate,"[{'entityName': 'or_city', 'startCharIndex': 5..."
3,I suppose I'll speak with my husband to see if...,thankyou,[]
4,"Hello, I am looking to book a vacation from Go...",greeting,"[{'entityName': 'intent', 'startCharIndex': 23..."
...,...,...,...
10402,"5 adults and 7 kids! Yup, the lot of us. We wa...",inform,"[{'entityName': 'budget', 'startCharIndex': 71..."
10403,Oh yes! Between September 12 and 26!,inform,"[{'entityName': 'str_date', 'startCharIndex': ..."
10404,"That sounds amazing, and it's within those dat...",request,[]
10405,"Ok perfect, book me!",inform,"[{'entityName': 'intent', 'startCharIndex': 12..."


In [14]:
all_data_df.describe(include='all')

Unnamed: 0,text,intentName,entityLabels
count,10407,10407,10407
unique,9695,13,3565
top,Thanks!,inform,[]
freq,73,6162,4803


We do not need all the utterances to train LUIS: according to the documentation, we can train LUIS on a subset of the utterances, provided that we have a good understanding of the intent. To do so, we'll use the intentName value to filter the utterances and keep those that convey most information: i.e. those where all the entities are present.

Let's extract which entities are present in the utterances:

In [15]:
def available_entities(labels):
    """
    Return list of available entities in data.
    """

    entities_in_labels = dict.fromkeys(entities, False)
    for label in labels:
        entities_in_labels[label['entityName']] = True
    return entities_in_labels    

In [16]:
all_available_entities = [available_entities(labels) for labels in all_data_df['entityLabels']]
all_data_df = pd.concat((all_data_df, pd.DataFrame(all_available_entities)), axis=1)
all_data_df

Unnamed: 0,text,intentName,entityLabels,budget,or_city,dst_city,str_date,end_date,intent
0,I'd like to book a trip to Atlantis from Capri...,inform,"[{'entityName': 'intent', 'startCharIndex': 12...",True,True,True,True,False,True
1,"Yes, how about going to Neverland from Caprica...",inform,"[{'entityName': 'dst_city', 'startCharIndex': ...",True,True,True,True,False,False
2,I have no flexibility for dates... but I can l...,negate,"[{'entityName': 'or_city', 'startCharIndex': 5...",False,True,False,False,False,False
3,I suppose I'll speak with my husband to see if...,thankyou,[],False,False,False,False,False,False
4,"Hello, I am looking to book a vacation from Go...",greeting,"[{'entityName': 'intent', 'startCharIndex': 23...",True,True,True,False,False,True
...,...,...,...,...,...,...,...,...,...
10402,"5 adults and 7 kids! Yup, the lot of us. We wa...",inform,"[{'entityName': 'budget', 'startCharIndex': 71...",True,False,False,False,False,False
10403,Oh yes! Between September 12 and 26!,inform,"[{'entityName': 'str_date', 'startCharIndex': ...",False,False,False,True,True,False
10404,"That sounds amazing, and it's within those dat...",request,[],False,False,False,False,False,False
10405,"Ok perfect, book me!",inform,"[{'entityName': 'intent', 'startCharIndex': 12...",False,False,False,False,False,True


In [17]:
all_data_df['total'] = all_data_df[entities].astype(int).sum(axis=1)

In [18]:
complete_samples = all_data_df[all_data_df['total']>4]
complete_samples.shape

(54, 10)

In [19]:
complete_samples.head()

Unnamed: 0,text,intentName,entityLabels,budget,or_city,dst_city,str_date,end_date,intent,total
0,I'd like to book a trip to Atlantis from Capri...,inform,"[{'entityName': 'intent', 'startCharIndex': 12...",True,True,True,True,False,True,5
274,I would like a vacation for one in Mannheim fr...,inform,"[{'entityName': 'dst_city', 'startCharIndex': ...",True,True,True,True,True,False,5
1034,"Hey, I'm looking to book my honeymoon from Hou...",inform,"[{'entityName': 'intent', 'startCharIndex': 20...",True,True,True,True,False,True,5
1097,Hi. I'd like to book a trip from Nagoya to Pho...,inform,"[{'entityName': 'intent', 'startCharIndex': 16...",False,True,True,True,True,True,5
1119,I'm looking to book a trip to Chicago from Bra...,inform,"[{'entityName': 'intent', 'startCharIndex': 15...",False,True,True,True,True,True,5


In [20]:
train_samples_df = complete_samples.sample(frac=0.5, random_state=42)
test_samples_df = complete_samples.drop(train_samples_df.index)

In [21]:
train_data = [all_data[i] for i in train_samples_df.index]
test_data = [all_data[i] for i in test_samples_df.index]

In [22]:
train_data[:2]

[{'text': "You can help me by booking a flight out of Cordoba to Salvador for me and my friend. Leaving ASAP and back before September 8. Oh and I'll need the best hotels you can get",
  'intentName': 'inform',
  'entityLabels': [{'entityName': 'intent',
    'startCharIndex': 19,
    'endCharIndex': 23},
   {'entityName': 'or_city', 'startCharIndex': 43, 'endCharIndex': 50},
   {'entityName': 'dst_city', 'startCharIndex': 54, 'endCharIndex': 62},
   {'entityName': 'str_date', 'startCharIndex': 93, 'endCharIndex': 97},
   {'entityName': 'end_date', 'startCharIndex': 114, 'endCharIndex': 125}]},
 {'text': 'rome to goiania and i will go from september eight until twenty five\nand i have 4900 dollars in budget',
  'intentName': 'inform',
  'entityLabels': [{'entityName': 'or_city',
    'startCharIndex': 0,
    'endCharIndex': 4},
   {'entityName': 'dst_city', 'startCharIndex': 8, 'endCharIndex': 15},
   {'entityName': 'str_date', 'startCharIndex': 35, 'endCharIndex': 50},
   {'entityName':

In [None]:
with open('train_data.json', "w") as f:
    json.dump(train_data, f, indent=4)

with open('test_data.json', "w") as f:
    json.dump(test_data, f, indent=4)

In [23]:
ask_for = {
    "str_date": "when do you want to go?",
    "end_date": "when do you want to come back?",
    "dst_city": "where do you want to fly to?",
    "or_city": "where do you want to depart from?",
    "budget": "what is your budget?",
    }
