# OC-IA-P10 DATA EXPLORATION

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import json

In [3]:
# Load 'frames/frames.json'; the 'turns' column holds, for each row
# (conversation), the list of turn dicts of that conversation.
frames = pd.read_json('frames/frames.json')
frames['turns']


0       [{'text': 'I'd like to book a trip to Atlantis...
1       [{'text': 'Hello, I am looking to book a vacat...
2       [{'text': 'Hello there i am looking to go on a...
3       [{'text': 'Hi I'd like to go to Caprica from B...
4       [{'text': 'Hello, I am looking to book a trip ...
                              ...                        
1364    [{'text': 'Hi I've got 9 days free and I'm loo...
1365    [{'text': 'I need to get to Fortaleza on Septe...
1366    [{'text': 'We're finally going on vacation isn...
1367    [{'text': 'Hi there, I'm looking for a place t...
1368    [{'text': 'I need to book a trip for the whole...
Name: turns, Length: 1369, dtype: object

Let's have a look at a conversation. For ease of reading the values are sent to a json file.

In [4]:
# Write the turns of the first conversation to 'sample.json', pretty-printed
# so that the nested structure is easy to read.
with open('sample.json', "w") as sample_file:
    json.dump(frames.loc[0, 'turns'], sample_file, indent=4)

We'll use only user data (not wizard's answers), and among them we'll use the text and the acts_without_refs values. Let's have a look at the values we can find inside:

In [5]:
# Collect every distinct argument key found in the dialogue acts.
# NOTE(review): the `[:1]` slice restricts the scan to the first conversation
# only, and the scan reads 'acts' rather than 'acts_without_refs' — confirm
# that both are intended.
all_entities = set()
for conversation in tqdm(frames['turns'][:1]):
    for turn in conversation:
        for act in turn['labels']['acts']:
            all_entities.update(act_arg['key'] for act_arg in act['args'])

all_entities

  0%|          | 0/1 [00:00<?, ?it/s]

{'budget',
 'dst_city',
 'end_date',
 'flex',
 'intent',
 'n_adults',
 'or_city',
 'ref',
 'str_date'}


We'll specifically stick to the values we want to process in this MVP:
- departure city ('or_city')
- arrival city ('dst_city')
- departure date ('str_date')
- end date ('end_date')
- budget ('budget')

In [6]:
# Entity keys kept for the MVP. extract_args() emits its labels in the order of
# this list, so reordering it changes the order of the 'entityLabels' output.
# Keys that are not listed here (e.g. 'n_adults', 'flex') are dropped by
# extract_args().
entities = [
    'budget',
    'or_city',
    'dst_city',
    'str_date',
    'end_date',
    'intent',
 ]



Moreover, since in this MVP we do not want to manage any historical aspect, we'll only keep the first utterance of each conversation.

Example of json file format for training LUIS:

```json
[
  {
    "text": "order a pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "order a large pepperoni pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 14,
        "endCharIndex": 28
      },
      {
        "entityName": "Size",
        "startCharIndex": 8,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "I want two large pepperoni pizzas on thin crust",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 17,
        "endCharIndex": 32
      },
      {
        "entityName": "Size",
        "startCharIndex": 11,
        "endCharIndex": 15
      },
      {
        "entityName": "Quantity",
        "startCharIndex": 7,
        "endCharIndex": 9
      },
      {
        "entityName": "Crust",
        "startCharIndex": 37,
        "endCharIndex": 46
      }
    ]
  }
]
```

In [7]:
def merge_elements(list_of_same_entities):
    """
    Merge several labels of the same entity into one label covering them all.

    E.g. (for the sentence 'I want to go to Paris on the 5th of May'):

    list_of_same_entities = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 31,
        },
        {
            'entityName': 'str_date',
            'startCharIndex': 36,  # index of 'May'
            'endCharIndex': 38,
        },
    ]
    output = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 38,  # end index of 'May'
        },
    ]

    Parameters
    ----------
    list_of_same_entities : list of dict
        Labels with the keys 'entityName', 'startCharIndex' and
        'endCharIndex'. They are expected to share the same 'entityName';
        this is not checked, and the name of the left-most label is used.

    Returns
    -------
    list of dict
        A one-element list whose label starts at the smallest
        'startCharIndex' and ends at the largest 'endCharIndex' of the input,
        or an empty list when the input is empty.
    """
    if not list_of_same_entities:
        return []

    # min() returns the first minimal element, i.e. the same label a stable
    # sort on 'startCharIndex' would put first.
    leftmost = min(list_of_same_entities, key=lambda label: label['startCharIndex'])

    # The right edge must be the largest end index overall: the label that
    # starts last is not necessarily the one that ends last (one label may be
    # contained in another).
    rightmost_end = max(label['endCharIndex'] for label in list_of_same_entities)

    return [{
        'entityName': leftmost['entityName'],
        'startCharIndex': leftmost['startCharIndex'],
        'endCharIndex': rightmost_end,
    }]


# Two disjoint pieces of the same date are merged into one covering span.
assert merge_elements([
    {
        'entityName': 'str_date',
        'startCharIndex': 29,  # index of '5'
        'endCharIndex': 31,
    },
    {
        'entityName': 'str_date',
        'startCharIndex': 36,  # index of 'May'
        'endCharIndex': 38,
    },
]) == [
    {
        'entityName': 'str_date',
        'startCharIndex': 29,  # index of '5'
        'endCharIndex': 38,  # end index of 'May'
    }
]

# A label contained in a longer one must not shorten the merged span.
assert merge_elements([
    {'entityName': 'str_date', 'startCharIndex': 62, 'endCharIndex': 68},
    {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77},
]) == [
    {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}
]

# Nothing to merge.
assert merge_elements([]) == []


In [8]:
def extract_args(text, arg, entity_names=None):
    """
    Locate the values of one dialogue act inside `text` and return them as labels:
    [
        {
            'entityName': '...',
            'startCharIndex': ...,
            'endCharIndex': ...,
        },
    ]

    Parameters
    ----------
    text : str
        The utterance in which the values are searched.
    arg : dict
        A dialogue act; its 'args' entry is a list of {'key': ..., 'val': ...}
        dicts.
    entity_names : iterable of str, optional
        Entity keys to keep, in output order. Defaults to the notebook-level
        `entities` list.

    Returns
    -------
    list of dict
        At most one label per kept entity, in the order of `entity_names`.
        Values that are empty, are not strings, or do not occur verbatim in
        `text` are skipped. Several values of the same entity are merged into
        one span with merge_elements().

    Notes
    -----
    - Only the first occurrence of a value in `text` is used.
    - 'endCharIndex' is start + len(value), i.e. exclusive.
      NOTE(review): the LUIS example shown earlier in this notebook appears to
      use inclusive end indexes — confirm which convention the consumer expects.
    """
    if entity_names is None:
        # Resolved at call time so that the notebook-level list is used.
        entity_names = entities

    labels_by_entity = {}
    for item in arg['args']:
        key = item['key']
        val = item['val']
        if not (val and isinstance(val, str)):
            continue

        # A single scan: find() returns -1 when the value is absent.
        start_index = text.find(val)
        if start_index == -1:
            continue

        labels_by_entity.setdefault(key, []).append({
            'entityName': key,
            'startCharIndex': start_index,
            'endCharIndex': start_index + len(val),
        })

    # Some data, such as dates, may be split into multiple entities,
    # we need to merge them
    merged_results = []
    for entity in entity_names:
        elements_for_entity = labels_by_entity.get(entity, [])
        if len(elements_for_entity) > 1:
            merged_results.extend(merge_elements(elements_for_entity))
        else:
            merged_results.extend(elements_for_entity)
    return merged_results

   
    
def extract_intent_name(arg):
    """Return the name of a dialogue act, i.e. the value under its 'name' key."""
    act_name = arg['name']
    return act_name


def extract_data(sentence):
    """
    Return selected data from sentence
    (a turn of a conversation) in the following format:
    {
        'text': '...',
        'intentName': {'...'},
        'entityLabels': [
            {
                'entityName': '...',
                'startCharIndex': ...,
                'endCharIndex': ...,
            },
        ],
    }

    'text' is the raw utterance, 'intentName' is the set of act names found in
    sentence['labels']['acts_without_refs'], and 'entityLabels' concatenates
    the labels returned by extract_args() for each of these acts, in act order.

    Note that 'intentName' is a set, which the json module cannot serialise
    without a `default` hook.
    """
    utterance = sentence['text']
    acts = sentence['labels']['acts_without_refs']

    act_names = {extract_intent_name(act) for act in acts}

    labels = []
    for act in acts:
        labels.extend(extract_args(utterance, act))

    return {
        'text': utterance,
        'intentName': act_names,
        'entityLabels': labels,
    }

    

In [9]:
# Hand-written fixtures for extract_args(). arg_2 splits the date into three
# separate 'str_date' values and uses the key 'adults', which is not part of
# the `entities` list.
text = "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."
arg_1 = {'args': [
                {'val': 'book', 'key': 'intent'}
                ], 
        'name': 'inform'}
arg_2 = {'args': [
                {'val': 'Caprica', 'key': 'or_city'}, 
                {'val': 'Saturday', 'key': 'str_date'}, 
                {'val': 'August', 'key': 'str_date'}, 
                {'val': '2016', 'key': 'str_date'}, 
                {'val': '8', 'key': 'adults'}, 
                {'val': '1700', 'key': 'budget'}
                ], 
        'name': 'request'}

# A single matching value yields one label whose end index is start + len(value).
assert extract_args(text, arg_1) == [{'entityName': 'intent', 'startCharIndex': 12, 'endCharIndex': 16}]
# The three 'str_date' pieces are merged into one span, 'adults' is dropped,
# and the labels come back in the order of the `entities` list.
assert extract_args(text, arg_2) == [
        {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
        {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
        {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}]

In [10]:
excerpt_1 = {'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'budget': [{'val': '1700.0', 'negated': False}],
     'dst_city': [{'val': 'Atlantis', 'negated': False}],
     'or_city': [{'val': 'Caprica', 'negated': False}],
     'str_date': [{'val': 'august 13', 'negated': False}],
     'n_adults': [{'val': '8', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471272019730.0}

excerpt_2 = {'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'dst_city': [{'val': 'Gotham City', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471273579715.0}


# The act name is read from the 'name' key of each act.
assert extract_intent_name(excerpt_1['labels']['acts'][0]) == 'inform'
assert extract_intent_name(excerpt_2['labels']['acts'][0]) == 'inform'

# Labels are concatenated act by act: the first act contributes 'intent', the
# second contributes its entities in the order of the `entities` list.
# 'n_adults' is not in `entities`, so it is absent from the result.
assert extract_data(excerpt_1) == {

  'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'intentName': {'inform'},
  'entityLabels': [
      {'entityName': 'intent', 'startCharIndex': 12,'endCharIndex': 16},
      {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
      {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
      {'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35},
      {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}
                  ],         
                              }

# The value 'book' does not occur verbatim in this utterance, so no 'intent'
# label is produced; 'intentName' still collects every act name.
assert extract_data(excerpt_2) == {
    'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
    'intentName': {'inform', 'greeting'},
    'entityLabels': [
        {
          'entityName': 'dst_city',
          'startCharIndex': 63,
          'endCharIndex': 74
          }]}

Let's extract data into the required json format:

In [11]:
# Keep only the opening turn of each conversation and convert it to the
# text / intentName / entityLabels record built by extract_data().
all_data = []
for conversation in tqdm(frames['turns'].values):
    all_data.append(extract_data(conversation[0]))

  0%|          | 0/1369 [00:00<?, ?it/s]

What are the different intentName values?

In [12]:
# Union of the act names found across all extracted records.
intents = {name for sample in all_data for name in sample['intentName']}
intents

{'greeting', 'inform', 'request', 'thankyou'}

Let's have a look at examples:

In [13]:

# Show a handful of records: act names, raw text and number of labels found.
for sample in all_data[10:15]:
    print('\nINTENT:', sample['intentName'])
    print('TEXT:', sample['text'])
    print('NB OF ENTITY LABELS:', len(sample['entityLabels']))



INTENT: {'inform'}
TEXT: I'd like to get away from Monday, August 15, 2016 to Wednesday, August 31, 2016. My budget is $3200 and I would leave from Detroit.
NB OF ENTITY LABELS: 4

INTENT: {'inform'}
TEXT: Good day, please book me a trip from Vancouver, Jamaica to Recife. I would like to leave for 17 days on August 24.
NB OF ENTITY LABELS: 4

INTENT: {'inform', 'greeting'}
TEXT: Hi im looking for a nice destination that i could go to from Columbus
NB OF ENTITY LABELS: 1

INTENT: {'inform'}
TEXT: Hi, I have a big family and we can't afford a big vacation, we want to go to Atlantis from Mos Eisley for $3600 at most.
NB OF ENTITY LABELS: 3

INTENT: {'inform'}
TEXT: Hi, I'm leaving from Diagon Alley and going to Coruscant on a budget of $3400. The dates don't matter, but I cannot afford anything over my budget.
NB OF ENTITY LABELS: 3


In [14]:
# One row per extracted record; the columns are the keys returned by
# extract_data() ('text', 'intentName', 'entityLabels').
all_data_df = pd.DataFrame(all_data)
all_data_df

Unnamed: 0,text,intentName,entityLabels
0,I'd like to book a trip to Atlantis from Capri...,{inform},"[{'entityName': 'intent', 'startCharIndex': 12..."
1,"Hello, I am looking to book a vacation from Go...","{inform, greeting}","[{'entityName': 'intent', 'startCharIndex': 23..."
2,Hello there i am looking to go on a vacation w...,"{inform, greeting}","[{'entityName': 'dst_city', 'startCharIndex': ..."
3,"Hi I'd like to go to Caprica from Busan, betwe...","{inform, greeting}","[{'entityName': 'or_city', 'startCharIndex': 3..."
4,"Hello, I am looking to book a trip for 2 adult...","{inform, greeting}","[{'entityName': 'intent', 'startCharIndex': 23..."
...,...,...,...
1364,Hi I've got 9 days free and I'm looking for a ...,{inform},"[{'entityName': 'or_city', 'startCharIndex': 6..."
1365,I need to get to Fortaleza on September 8th or...,{inform},"[{'entityName': 'dst_city', 'startCharIndex': ..."
1366,We're finally going on vacation isn't that ama...,{inform},"[{'entityName': 'budget', 'startCharIndex': 75..."
1367,"Hi there, I'm looking for a place to get away ...",{inform},[]


There seem to be two kinds of "intents" in the data:
- a global intent ("book"...), which is stored as the value of the `intent` entity;
- dialogue-act names for the current utterance ("inform", "greeting"...), which are stored in `intentName`.

What are the different dialogue-act names (i.e. the `intentName` values) found in the first utterances?

In [15]:
intents

{'greeting', 'inform', 'request', 'thankyou'}

Let's visualize these intents and the other entities as columns of our dataframe.

In [30]:
# Add one 0/1 indicator column per act name. `intents` is a set of strings,
# whose iteration order can change between kernel restarts (string hashing is
# randomised per process), so iterate in sorted order to keep the column
# order reproducible.
for intent in sorted(intents):
    all_data_df["intent_" + intent] = all_data_df["intentName"].apply(lambda x: 1 if intent in x else 0)

def entity_is_present(entity_name, entity_labels):
    """Return True if at least one label in `entity_labels` has 'entityName' equal to `entity_name`."""
    return any(label['entityName'] == entity_name for label in entity_labels)

# Add one 0/1 indicator column per tracked entity: 1 when the row's labels
# contain that entity, 0 otherwise.
for entity in entities:
    all_data_df["entity_" + entity] = all_data_df["entityLabels"].apply(
        lambda labels: int(entity_is_present(entity, labels))
    )


So we have different intents, and also the entity "intent", which is confusing. What are these entities "intent" values?

In [32]:
def get_intent(text, entity_labels):
    """
    Return the substring of `text` covered by the first label whose
    'entityName' is 'intent', or None when there is no such label.

    The substring is text[startCharIndex:endCharIndex].
    """
    intent_spans = (
        text[label['startCharIndex']:label['endCharIndex']]
        for label in entity_labels
        if label['entityName'] == 'intent'
    )
    return next(intent_spans, None)
# Sanity check on the first row, then store the text covered by the 'intent'
# label of every row (None when a row has no such label).
assert get_intent(all_data_df.loc[0, 'text'], all_data_df.loc[0, 'entityLabels']) == 'book'
all_data_df['intent'] = all_data_df.apply(
    lambda row: get_intent(row['text'], row['entityLabels']),
    axis=1,
)

In [33]:
all_data_df['intent'].unique()

array(['book', None], dtype=object)

In [34]:
all_data_df[all_data_df['intent']!='book']

Unnamed: 0,text,intentName,entityLabels,intent_greeting,intent_thankyou,intent_inform,intent_request,entity_budget,entity_or_city,entity_dst_city,entity_str_date,entity_end_date,entity_intent,intent,total
2,Hello there i am looking to go on a vacation w...,"{inform, greeting}","[{'entityName': 'dst_city', 'startCharIndex': ...",1,0,1,0,0,0,1,0,0,0,,0
3,"Hi I'd like to go to Caprica from Busan, betwe...","{inform, greeting}","[{'entityName': 'or_city', 'startCharIndex': 3...",1,0,1,0,0,1,1,1,1,0,,0
5,"Hey, i Want to go to St. Louis on the 17th of ...","{inform, greeting}","[{'entityName': 'dst_city', 'startCharIndex': ...",1,0,1,0,0,0,1,1,0,0,,0
6,I'm looking for a trip to Gotham City leaving ...,{inform},"[{'entityName': 'budget', 'startCharIndex': 12...",0,0,1,0,1,1,1,1,0,0,,0
9,"Hi, I need to go to Mos Eisley for a wedding, ...",{inform},"[{'entityName': 'budget', 'startCharIndex': 14...",0,0,1,0,1,0,1,1,1,0,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,I have a CRUCIAL meeting with my investors in ...,{inform},"[{'entityName': 'dst_city', 'startCharIndex': ...",0,0,1,0,0,0,1,0,0,0,,0
1364,Hi I've got 9 days free and I'm looking for a ...,{inform},"[{'entityName': 'or_city', 'startCharIndex': 6...",0,0,1,0,0,1,0,0,0,0,,0
1365,I need to get to Fortaleza on September 8th or...,{inform},"[{'entityName': 'dst_city', 'startCharIndex': ...",0,0,1,0,0,0,1,1,0,0,,0
1366,We're finally going on vacation isn't that ama...,{inform},"[{'entityName': 'budget', 'startCharIndex': 75...",0,0,1,0,1,0,0,0,0,0,,0


We do not need all the utterances to train LUIS: according to the documentation, we can train LUIS on a subset of the utterances, provided that we have a good understanding of the intent. To do so, we'll count the entities present in each utterance (the `entity_*` columns) and keep the utterances that convey the most information, i.e. those where at least five of the six tracked entities are present.

Let's extract which entities are present in the utterances:

In [35]:
# Sum the 0/1 'entity_*' indicator columns to get, for each row, the number
# of tracked entities that were found in the utterance.
entities_cols = [col for col in all_data_df if col.startswith('entity_')]
all_data_df['total'] = all_data_df[entities_cols].sum(axis='columns')

In [36]:
all_data_df

Unnamed: 0,text,intentName,entityLabels,intent_greeting,intent_thankyou,intent_inform,intent_request,entity_budget,entity_or_city,entity_dst_city,entity_str_date,entity_end_date,entity_intent,intent,total
0,I'd like to book a trip to Atlantis from Capri...,{inform},"[{'entityName': 'intent', 'startCharIndex': 12...",0,0,1,0,1,1,1,1,0,1,book,5
1,"Hello, I am looking to book a vacation from Go...","{inform, greeting}","[{'entityName': 'intent', 'startCharIndex': 23...",1,0,1,0,1,1,1,0,0,1,book,4
2,Hello there i am looking to go on a vacation w...,"{inform, greeting}","[{'entityName': 'dst_city', 'startCharIndex': ...",1,0,1,0,0,0,1,0,0,0,,1
3,"Hi I'd like to go to Caprica from Busan, betwe...","{inform, greeting}","[{'entityName': 'or_city', 'startCharIndex': 3...",1,0,1,0,0,1,1,1,1,0,,4
4,"Hello, I am looking to book a trip for 2 adult...","{inform, greeting}","[{'entityName': 'intent', 'startCharIndex': 23...",1,0,1,0,1,1,1,0,0,1,book,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,Hi I've got 9 days free and I'm looking for a ...,{inform},"[{'entityName': 'or_city', 'startCharIndex': 6...",0,0,1,0,0,1,0,0,0,0,,1
1365,I need to get to Fortaleza on September 8th or...,{inform},"[{'entityName': 'dst_city', 'startCharIndex': ...",0,0,1,0,0,0,1,1,0,0,,2
1366,We're finally going on vacation isn't that ama...,{inform},"[{'entityName': 'budget', 'startCharIndex': 75...",0,0,1,0,1,0,0,0,0,0,,1
1367,"Hi there, I'm looking for a place to get away ...",{inform},[],0,0,1,0,0,0,0,0,0,0,,0


In [38]:
# Keep the rows in which more than 4 of the 'entity_*' indicators are set.
# NOTE(review): with six tracked entities this keeps rows that miss one of
# them — confirm that `> 4` is the intended threshold.
complete_samples = all_data_df.loc[all_data_df['total'].gt(4)]
complete_samples.shape

(48, 15)

In [None]:
complete_samples.head()

In [None]:
# Reproducible 50/50 split: sample half of the rows for training (fixed seed)
# and use the remaining rows for testing.
train_samples_df = complete_samples.sample(frac=0.5, random_state=42)
test_samples_df = complete_samples.drop(index=train_samples_df.index)

In [None]:
# Map the selected rows back to the original records. The DataFrame index
# labels are used as list positions in `all_data`.
train_data = [all_data[position] for position in train_samples_df.index]
test_data = [all_data[position] for position in test_samples_df.index]

In [None]:
train_data[:2]

In [None]:
def _json_default(value):
    """
    Fallback used by json.dump for objects it cannot serialise natively.

    The records hold their 'intentName' as a set, which the json module
    rejects with a TypeError; sets are written as sorted lists so that the
    output is valid JSON and deterministic. Any other unsupported type still
    raises TypeError.

    NOTE(review): the LUIS example shown earlier in this notebook uses a
    single string for "intentName" — confirm the shape the consumer expects.
    """
    if isinstance(value, (set, frozenset)):
        return sorted(value)
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Write the training and test records to their respective JSON files.
for path, samples in (('train_data.json', train_data), ('test_data.json', test_data)):
    with open(path, "w") as f:
        json.dump(samples, f, indent=4, default=_json_default)

In [None]:
# Question text associated with each travel entity key.
# NOTE(review): presumably the prompt used to ask the user for an entity that
# is still missing — this dict is not used in the visible cells, confirm.
ask_for = {
    "str_date": "when do you want to go?",
    "end_date": "when do you want to come back?",
    "dst_city": "where do you want to fly to?",
    "or_city": "where do you want to depart from?",
    "budget": "what is your budget?",
    }
