# OC-IA-P10 DATA EXPLORATION

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import json

In [3]:
frames = pd.read_json('frames/frames.json')
frames['turns']


0       [{'text': 'I'd like to book a trip to Atlantis...
1       [{'text': 'Hello, I am looking to book a vacat...
2       [{'text': 'Hello there i am looking to go on a...
3       [{'text': 'Hi I'd like to go to Caprica from B...
4       [{'text': 'Hello, I am looking to book a trip ...
                              ...                        
1364    [{'text': 'Hi I've got 9 days free and I'm loo...
1365    [{'text': 'I need to get to Fortaleza on Septe...
1366    [{'text': 'We're finally going on vacation isn...
1367    [{'text': 'Hi there, I'm looking for a place t...
1368    [{'text': 'I need to book a trip for the whole...
Name: turns, Length: 1369, dtype: object

Let's have a look at a conversation. For ease of reading the values are sent to a json file.

In [4]:
with open('sample.json', "w") as f:
    json.dump(frames.loc[0]['turns'], f, indent=4)

We'll use only user data (not wizard's answers), and among them we'll use the text and the acts_without_refs values.

We'ell specifically stick to the values we want to process in this MVP:
- departure city ('or_city')
- arrival city ('dest_city')
- departure date ('str_date')


In [5]:
all_entities = set()
for conversation in tqdm(frames['turns'][:1]):
    for turn in conversation:
        for act in turn['labels']['acts']:
            for arg in act['args']:
                all_entities.add(arg['key'])

            

  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
all_entities

{'budget',
 'dst_city',
 'end_date',
 'flex',
 'intent',
 'n_adults',
 'or_city',
 'ref',
 'str_date'}

In [16]:
entities = [
    'budget',
    'or_city',
    'dst_city',
    'str_date'
    'end_date',
    'intent',
 ]


In [18]:
all_data = []
for conversation in tqdm(frames['turns']):
    for turn in conversation:
        if turn['author']=='user':
            data = dict.fromkeys(['text', 'intentName', 'entityLabels'])
            data['text'] = turn['text']
            entity_labels = []
            for act in turn['labels']['acts_without_refs']:
                if 'name' in act:
                    intent_name = act['name']
                else:
                    intent_name = 'None'
                for item in act['args']:
                    entity_name = item['key']
                    if entity_name in entities:
                        entity_value = item['val']
                        if entity_value and entity_value in turn['text']:
                            start_char_index = turn['text'].index(entity_value)
                            end_char_index = start_char_index + len(entity_value)
                            entity_labels.append([{'entityName': entity_name, 
                                                'startCharIndex': start_char_index, 
                                                'endCharIndex': end_char_index}])
            data['intentName'] = intent_name
            data['entityLabels'] = entity_labels
            all_data.append(data)

  0%|          | 0/1369 [00:00<?, ?it/s]

In [19]:
all_data

[{'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
  'intentName': 'inform',
  'entityLabels': [[{'entityName': 'intent',
     'startCharIndex': 12,
     'endCharIndex': 16}],
   [{'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35}],
   [{'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48}],
   [{'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121}]]},
 {'text': 'Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900.',
  'intentName': 'inform',
  'entityLabels': [[{'entityName': 'dst_city',
     'startCharIndex': 24,
     'endCharIndex': 33}],
   [{'entityName': 'budget', 'startCharIndex': 114, 'endCharIndex': 118}],
   [{'entityName': 'or_city', 'startCharIndex': 39, 'endCharIndex': 46}]]},
 {'text': 'I have no flexibility for dates... but I can leave from Atlantis rather than Caprica. How a

Example of json file for training:

In [10]:
[
  {
    "text": "order a pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "order a large pepperoni pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 14,
        "endCharIndex": 28
      },
      {
        "entityName": "Size",
        "startCharIndex": 8,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "I want two large pepperoni pizzas on thin crust",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 17,
        "endCharIndex": 32
      },
      {
        "entityName": "Size",
        "startCharIndex": 11,
        "endCharIndex": 15
      },
      {
        "entityName": "Quantity",
        "startCharIndex": 7,
        "endCharIndex": 9
      },
      {
        "entityName": "Crust",
        "startCharIndex": 37,
        "endCharIndex": 46
      }
    ]
  }
]

[{'text': 'order a pizza',
  'intentName': 'ModifyOrder',
  'entityLabels': [{'entityName': 'Order',
    'startCharIndex': 6,
    'endCharIndex': 12}]},
 {'text': 'order a large pepperoni pizza',
  'intentName': 'ModifyOrder',
  'entityLabels': [{'entityName': 'Order',
    'startCharIndex': 6,
    'endCharIndex': 28},
   {'entityName': 'FullPizzaWithModifiers',
    'startCharIndex': 6,
    'endCharIndex': 28},
   {'entityName': 'PizzaType', 'startCharIndex': 14, 'endCharIndex': 28},
   {'entityName': 'Size', 'startCharIndex': 8, 'endCharIndex': 12}]},
 {'text': 'I want two large pepperoni pizzas on thin crust',
  'intentName': 'ModifyOrder',
  'entityLabels': [{'entityName': 'Order',
    'startCharIndex': 7,
    'endCharIndex': 46},
   {'entityName': 'FullPizzaWithModifiers',
    'startCharIndex': 7,
    'endCharIndex': 46},
   {'entityName': 'PizzaType', 'startCharIndex': 17, 'endCharIndex': 32},
   {'entityName': 'Size', 'startCharIndex': 11, 'endCharIndex': 15},
   {'entityName': 'Q

Let's transpose our data into the required format.