# OC-IA-P10 - CHATBOT

# DATA EXPLORATION

We are given an existing dataset, more complete than what we need. Let's explore it and extract only the desired information.

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import json

In [3]:
frames = pd.read_json('frames/frames.json')
frames['turns']


0       [{'text': 'I'd like to book a trip to Atlantis...
1       [{'text': 'Hello, I am looking to book a vacat...
2       [{'text': 'Hello there i am looking to go on a...
3       [{'text': 'Hi I'd like to go to Caprica from B...
4       [{'text': 'Hello, I am looking to book a trip ...
                              ...                        
1364    [{'text': 'Hi I've got 9 days free and I'm loo...
1365    [{'text': 'I need to get to Fortaleza on Septe...
1366    [{'text': 'We're finally going on vacation isn...
1367    [{'text': 'Hi there, I'm looking for a place t...
1368    [{'text': 'I need to book a trip for the whole...
Name: turns, Length: 1369, dtype: object

Let's have a look at a conversation. For ease of reading the values are sent to a json file.

In [4]:
with open('sample.json', "w") as f:
    json.dump(frames.loc[0]['turns'], f, indent=4)

We'll use only user data (not wizard's answers), and among them we'll use the `text` and the `acts_without_refs` values. Let's have a look at the values we can find inside:

In [5]:
all_entities = set()
for conversation in tqdm(frames['turns'][:1]):
    for turn in conversation:
        for act in turn['labels']['acts']:
            for arg in act['args']:
                all_entities.add(arg['key'])
       
all_entities

100%|██████████| 1/1 [00:00<00:00, 11554.56it/s]


{'budget',
 'dst_city',
 'end_date',
 'flex',
 'intent',
 'n_adults',
 'or_city',
 'ref',
 'str_date'}


We'll specifically stick to the values we want to process in this MVP:
- departure city ('or_city')
- arrival city ('dest_city')
- departure date ('str_date')
- end date ('end_date')
- budget ('budget')

Note : a first pass on this part showed that extracting the "intent" entity is not interesting at this stage of our work : this intent can be "book" or "None", which is not useful for us.


Moreover, since in this MVP we do not want to manage any historical aspect, we'll only keep the first utterance of each conversation.

Example of json file format for training LUIS:

```Python
[
  {
    "text": "order a pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "order a large pepperoni pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 14,
        "endCharIndex": 28
      },
      {
        "entityName": "Size",
        "startCharIndex": 8,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "I want two large pepperoni pizzas on thin crust",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 17,
        "endCharIndex": 32
      },
      {
        "entityName": "Size",
        "startCharIndex": 11,
        "endCharIndex": 15
      },
      {
        "entityName": "Quantity",
        "startCharIndex": 7,
        "endCharIndex": 9
      },
      {
        "entityName": "Crust",
        "startCharIndex": 37,
        "endCharIndex": 46
      }
    ]
  }
]
```

In [6]:
entities = [
    'budget',
    'or_city',
    'dst_city',
    'str_date',
    'end_date',
 ]

def merge_elements(list_of_same_entities):
    """
    Merge entities of same category into one. E.g. (for the sentence
    'I want to go to Paris on the 5th of May'):

    list_of_same_entities = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 31,
        },
        {
            'entityName': 'str_date',
            'startCharIndex': 36,  # index of 'May'
            'endCharIndex': 38,
        },  
    ]
    output = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 38, # end index of 'May'
        },  
    ]   
        
    """
    list_of_same_entities = sorted(list_of_same_entities, key=lambda x: x['startCharIndex'])
    entity_name = list_of_same_entities[0]['entityName']
    start_index = list_of_same_entities[0]['startCharIndex']
    end_index = list_of_same_entities[-1]['endCharIndex']
    result = [{    
        'entityName': entity_name,
        'startCharIndex': start_index,
        'endCharIndex': end_index,
    }]
    return result 
    

def extract_args(text, arg, entities=entities):
    """
    Formats args of arg into a list of dicts as follows:
    [
        {
            'entityName': '...',
            'startCharIndex': ...,
            'endCharIndex': ...,
        },  
    ]
    If entities is provided (default behaviour), only those entities are returned.
    Otherwise, all entities are returned.
    """
    results = []
    for item in arg['args']:
        key = item['key']
        val = item['val']
        if key not in entities:
            continue
        if not (val and isinstance(val, str) and val in text):
            continue
        start_index = text.index(val)
        end_index = start_index + len(val)
        results.append({
            'entityName': key,
            'startCharIndex': start_index,
            'endCharIndex': end_index,  
        })
    
    # Some data, such as dates, may be split into multiple entities,
    # we need to merge them
    merged_results = []
    for entity in entities:
        elements_for_entity = [item for item in results 
                                if item['entityName'] == entity]
        if len(elements_for_entity) > 1:
            merged_results.extend(merge_elements(elements_for_entity))
        elif len(elements_for_entity) == 1:
            merged_results.extend(elements_for_entity)
    return merged_results   

    
def extract_intent_name(arg):
    return arg['name'] 


def extract_data(sentence):
    """
    Return selected data from sentence
    (a round of a counversation) in following format:
    {
        'intent': ['...'],
        'entities': [
            {
                'entityName': '...',
                'startCharIndex': ...,
                'endCharIndex': ...,
            },
        ],
    }
    """
    data = dict()
    text = sentence['text']
    args = sentence['labels']['acts_without_refs']
    entities = []
    data['text'] = text
    data['intentName'] = tuple(sorted(list(set([extract_intent_name(arg) for arg in args]))))
    for arg in args:
        entities.extend(extract_args(text, arg))
    data['entityLabels'] = entities
    return data

    

In [7]:
###############################################################################
#  DATA FOR TESTS
###############################################################################

text = "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."

arg_1 = {'args': [
                {'val': 'book', 'key': 'intent'}
                ], 
        'name': 'inform'}

arg_2 = {'args': [
                {'val': 'Caprica', 'key': 'or_city'}, 
                {'val': 'Saturday', 'key': 'str_date'}, 
                {'val': 'August', 'key': 'str_date'}, 
                {'val': '2016', 'key': 'str_date'}, 
                {'val': '8', 'key': 'adults'}, 
                {'val': '1700', 'key': 'budget'}
                ], 
        'name': 'request'}

excerpt_1 = {'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'budget': [{'val': '1700.0', 'negated': False}],
     'dst_city': [{'val': 'Atlantis', 'negated': False}],
     'or_city': [{'val': 'Caprica', 'negated': False}],
     'str_date': [{'val': 'august 13', 'negated': False}],
     'n_adults': [{'val': '8', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471272019730.0}

excerpt_2 = {'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'dst_city': [{'val': 'Gotham City', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471273579715.0}

###############################################################################
# TEST FUNCTIONS
###############################################################################

def test_merge_elements():
    assert merge_elements([
            {
                'entityName': 'str_date',
                'startCharIndex': 29,  # index of '5'
                'endCharIndex': 31,
            },
            {
                'entityName': 'str_date',
                'startCharIndex': 36,  # index of 'May'
                'endCharIndex': 38,
            },  
                        ]) == [
            {
                'entityName': 'str_date',
                'startCharIndex': 29,  # index of '5'
                'endCharIndex': 38, # end index of 'May'
            }   
                            ]

def test_extract_args_unique_args():
    assert extract_args(text, arg_1) == [] # the only entity is the intent, that we don't want to extract

def test_extract_args_multiple_args():
    assert extract_args(text, arg_2) == [
                {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
                {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
                {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}]


def test_extract_intent_name():
  assert extract_intent_name(excerpt_1['labels']['acts'][0]) == 'inform'
  assert extract_intent_name(excerpt_2['labels']['acts'][0]) == 'inform'

def test_extract_data():
  assert extract_data(excerpt_1) == {

    'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
    'intentName': ('inform',),
    'entityLabels': [

        {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
        {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
        {'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35},
        {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}
                    ],         
                                }
  assert extract_data(excerpt_2) == {
      'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
      'intentName': ('greeting', 'inform'),
      'entityLabels': [
          {
            'entityName': 'dst_city',
            'startCharIndex': 63,
            'endCharIndex': 74
            }]}
            
###############################################################################
# RUN TESTS
###############################################################################

err_counter = 0
for test_func in [
test_merge_elements,
test_extract_args_multiple_args,
test_extract_args_unique_args,
test_extract_intent_name,
test_extract_data,
]:
    try:
        test_func()
    except AssertionError:
        print('Test failed: {}'.format(test_func.__name__))
        err_counter += 1
if err_counter == 0:
    print('All tests passed!')        


All tests passed!


Let's extract data into the required json format:

In [8]:
all_data = [extract_data(conversation[0]) for conversation in tqdm(frames['turns'].values)]

100%|██████████| 1369/1369 [00:00<00:00, 144126.56it/s]


What are the different intentName values?

In [9]:
intents = set()
for data in all_data:
    intents.update(data['intentName'])
intents    

{'greeting', 'inform', 'request', 'thankyou'}

Let's have a look at examples:

In [10]:
all_data_df = pd.DataFrame(all_data)
all_data_df

Unnamed: 0,text,intentName,entityLabels
0,I'd like to book a trip to Atlantis from Capri...,"(inform,)","[{'entityName': 'budget', 'startCharIndex': 11..."
1,"Hello, I am looking to book a vacation from Go...","(greeting, inform)","[{'entityName': 'budget', 'startCharIndex': 75..."
2,Hello there i am looking to go on a vacation w...,"(greeting, inform)","[{'entityName': 'dst_city', 'startCharIndex': ..."
3,"Hi I'd like to go to Caprica from Busan, betwe...","(greeting, inform)","[{'entityName': 'or_city', 'startCharIndex': 3..."
4,"Hello, I am looking to book a trip for 2 adult...","(greeting, inform)","[{'entityName': 'budget', 'startCharIndex': 67..."
...,...,...,...
1364,Hi I've got 9 days free and I'm looking for a ...,"(inform,)","[{'entityName': 'or_city', 'startCharIndex': 6..."
1365,I need to get to Fortaleza on September 8th or...,"(inform,)","[{'entityName': 'dst_city', 'startCharIndex': ..."
1366,We're finally going on vacation isn't that ama...,"(inform,)","[{'entityName': 'budget', 'startCharIndex': 75..."
1367,"Hi there, I'm looking for a place to get away ...","(inform,)",[]


What are the values in intentName?

In [11]:
intentName = all_data_df['intentName'].apply(lambda x: tuple(x)).unique()
for item in intentName:
    print(item)

('inform',)
('greeting', 'inform')
('greeting', 'inform', 'request')
('greeting',)
('inform', 'request')
('greeting', 'inform', 'thankyou')
()


Which utterances have empty intentName?

In [12]:
all_data_df[all_data_df['intentName'].apply(lambda x: tuple(x))==()]

Unnamed: 0,text,intentName,entityLabels
526,"Have you ever read the book ""Vernon's Travels""?",(),[]
657,psssstttttt,(),[]
1158,Vacay time woooohooooooo,(),[]


Each utterance should have only one intent. Are the multi-intent common?

In [13]:
all_data_df.groupby('intentName').size()

intentName
()                                3
(greeting,)                     102
(greeting, inform)              297
(greeting, inform, request)       5
(greeting, inform, thankyou)      1
(inform,)                       951
(inform, request)                10
dtype: int64

There are enough sample with only the inform intent, let's keep only those ones.

In [14]:
inform_df = all_data_df[all_data_df['intentName']==('inform',)]
inform_df['intentName']='inform'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inform_df['intentName']='inform'


We'll manually create samples for greeting, thanks, and validation.

Moreover, we do not need all the utterances to train LUIS: according to the documentation, we can train LUIS iteratively on a first small subset of the utterances, provided that we have a good understanding of the intent. To do so, we'll use the utterances with most entities.

In [15]:
inform_df

Unnamed: 0,text,intentName,entityLabels
0,I'd like to book a trip to Atlantis from Capri...,inform,"[{'entityName': 'budget', 'startCharIndex': 11..."
6,I'm looking for a trip to Gotham City leaving ...,inform,"[{'entityName': 'budget', 'startCharIndex': 12..."
9,"Hi, I need to go to Mos Eisley for a wedding, ...",inform,"[{'entityName': 'budget', 'startCharIndex': 14..."
10,"I'd like to get away from Monday, August 15, 2...",inform,"[{'entityName': 'budget', 'startCharIndex': 94..."
11,"Good day, please book me a trip from Vancouver...",inform,"[{'entityName': 'or_city', 'startCharIndex': 3..."
...,...,...,...
1364,Hi I've got 9 days free and I'm looking for a ...,inform,"[{'entityName': 'or_city', 'startCharIndex': 6..."
1365,I need to get to Fortaleza on September 8th or...,inform,"[{'entityName': 'dst_city', 'startCharIndex': ..."
1366,We're finally going on vacation isn't that ama...,inform,"[{'entityName': 'budget', 'startCharIndex': 75..."
1367,"Hi there, I'm looking for a place to get away ...",inform,[]


In [16]:
# Get samples with more than 3 entities in the utterance
inform_df['nb_entities'] = inform_df['entityLabels'].apply(lambda x: len(x))
samples_df = inform_df[inform_df['nb_entities']>3]
samples_df  = samples_df.drop(['nb_entities'], axis=1)

# Convert to train_data and test_data lists of dicts
train_samples_df = samples_df.sample(frac=0.5, random_state=42)
test_samples_df = samples_df.drop(train_samples_df.index)
train_data = [train_samples_df.loc[i].to_dict() for i in train_samples_df.index]
test_data = [test_samples_df.loc[i].to_dict() for i in test_samples_df.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inform_df['nb_entities'] = inform_df['entityLabels'].apply(lambda x: len(x))


In [17]:
len(train_data), len(test_data)

(64, 63)

In [18]:
single_sample = train_data[0]
print('Training sample:')
for key, value in single_sample.items():
    print('"{}": "{}"'.format(key, value))
print()
print('Underlying entities:')

for entity_label in single_sample['entityLabels']:
    print(entity_label['entityName'],
    ':',
    single_sample['text'][entity_label['startCharIndex']:entity_label['endCharIndex']])

Training sample:
"text": "hello, please find me a vacation between saturday august 27 2016 and wednesday september 7 2016 for under 6000$ leaving from madrid"
"intentName": "inform"
"entityLabels": "[{'entityName': 'budget', 'startCharIndex': 106, 'endCharIndex': 111}, {'entityName': 'or_city', 'startCharIndex': 125, 'endCharIndex': 131}, {'entityName': 'str_date', 'startCharIndex': 41, 'endCharIndex': 64}, {'entityName': 'end_date', 'startCharIndex': 69, 'endCharIndex': 95}]"

Underlying entities:
budget : 6000$
or_city : madrid
str_date : saturday august 27 2016
end_date : wednesday september 7 2016


In [19]:
with open('train_data.json', "w") as f:
    json.dump(train_data, f, indent=4)

with open('test_data.json', "w") as f:
    json.dump(test_data, f, indent=4)

In [20]:
ask_for = {
    "str_date": "when do you want to go?",
    "end_date": "when do you want to come back?",
    "dst_city": "where do you want to fly to?",
    "or_city": "where do you want to depart from?",
    "budget": "what is your budget?",
    }


# Training LUIS

In [21]:
from azure.cognitiveservices.language.luis.authoring import LUISAuthoringClient
from azure.cognitiveservices.language.luis.authoring.models import ApplicationCreateObject
from azure.cognitiveservices.language.luis.runtime import LUISRuntimeClient
from msrest.authentication import CognitiveServicesCredentials
from functools import reduce

import json, time
import params

        
authoringKey = params.authoringKey
authoringEndpoint = params.authoringEndpoint
predictionKey = params.predictionKey
predictionEndpoint = params.predictionEndpoint

appName = "Booking recognition"
versionId = "0.1"

client = LUISAuthoringClient(authoringEndpoint, 
                        CognitiveServicesCredentials(authoringKey))

# define app basics
appDefinition = ApplicationCreateObject(
    name=appName, 
    initial_version_id=versionId, 
    culture='en-us')


In [24]:
# access or create app
try:
    app_id = client.apps.add(appDefinition)
    print(f"Created LUIS app with ID {app_id}")
except:
    print(f"App with name {appName} already exists.")
    app_id = client.apps.list()[0].id
    print(f"Using existing app with ID {app_id}")


Created LUIS app with ID b36ecefe-bcd8-4f25-b4a9-fd6e448a44f2


In [25]:
# create intents
for intent in ['inform', 'greeting', 'thankyou', 'confirm']:
    try:
        client.model.add_intent(app_id, versionId, intent)
        print(f"Created intent '{intent}'.")
    except:
        print(f"Intent '{intent}' already exists.")


Created intent 'inform'
Created intent 'greeting'
Created intent 'thankyou'
Created intent 'confirm'


In [26]:
# define machine-learned entities             
entities = [
        "budget",
        "or_city",
        "dst_city",
        "str_date",
        "end_date",
    ]

# add entities to app
for entity in entities:
    try:
        client.model.add_entity(app_id, 
                                versionId, 
                                name=entity) 
        print(f"Created entity '{entity}'.")
    except:
        print(f"Entity '{entity}' already exists.")

Created entity 'budget'.
Created entity 'or_city'.
Created entity 'dst_city'.
Created entity 'str_date'.
Created entity 'end_date'.


In [27]:
# Add utterances examples
for labeled_utterance in tqdm(train_data):
    client.examples.add(app_id, versionId, labeled_utterance)

100%|██████████| 64/64 [00:17<00:00,  3.65it/s]


In [28]:
# Train app on utterances
client.train.train_version(app_id, versionId)

# Check if training is done
waiting = True
while waiting:
    info = client.train.get_status(app_id, versionId)

    # get_status returns a list of training statuses, one for each model. Loop through them and make sure all are done.
    waiting = any(map(lambda x: 'Queued' == x.details.status or 'InProgress' == x.details.status, info))
    if waiting:
        print ("Waiting 10 seconds for training to complete...")
        time.sleep(10)
    else: 
        print ("trained")
        waiting = False

Waiting 10 seconds for training to complete...
trained


Now let's manually add utterances for other intents.


In [44]:
entities_terms = {
'greeting': ["hi", "hello", "hola", "good morning","good evening", "good afternoon", "good night"],
'thankyou': ["thank you", "thanks"], 
'confirm': ["yes", "OK", "true", "indeed", "exactly", "perfect", "that's it", "absolutely"]
}

entities_list = [{"canonicalForm": key, "list": value} for key, value in entities_terms.items()]