# OC-IA-P10 - CHATBOT

# DATA PREPARATION

We are given an existing dataset, more complete than what we need. Let's explore it and extract only the desired information.

In [1]:
# import sys
# !{sys.executable} -m pip install -r requirements.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
import json

In [3]:
frames = pd.read_json('frames/frames.json')
frames['turns']


0       [{'text': 'I'd like to book a trip to Atlantis...
1       [{'text': 'Hello, I am looking to book a vacat...
2       [{'text': 'Hello there i am looking to go on a...
3       [{'text': 'Hi I'd like to go to Caprica from B...
4       [{'text': 'Hello, I am looking to book a trip ...
                              ...                        
1364    [{'text': 'Hi I've got 9 days free and I'm loo...
1365    [{'text': 'I need to get to Fortaleza on Septe...
1366    [{'text': 'We're finally going on vacation isn...
1367    [{'text': 'Hi there, I'm looking for a place t...
1368    [{'text': 'I need to book a trip for the whole...
Name: turns, Length: 1369, dtype: object

Let's have a look at a conversation. For ease of reading the values are sent to a json file.

In [4]:
with open('sample.json', "w") as f:
    json.dump(frames.loc[0]['turns'], f, indent=4)

We'll use only user data (not wizard's answers), and among them we'll use the `text` and the `acts_without_refs` values. Let's have a look at the values we can find inside:

In [5]:
all_entities = set()
for conversation in tqdm(frames['turns'][:1]):
    for turn in conversation:
        for act in turn['labels']['acts']:
            for arg in act['args']:
                all_entities.add(arg['key'])
       
all_entities

100%|██████████| 1/1 [00:00<00:00, 13400.33it/s]


{'budget',
 'dst_city',
 'end_date',
 'flex',
 'intent',
 'n_adults',
 'or_city',
 'ref',
 'str_date'}


We'll specifically stick to the values we want to process in this MVP:
- departure city ('or_city')
- arrival city ('dest_city')
- departure date ('str_date')
- end date ('end_date')
- budget ('budget')

Note : a first pass on this part showed that extracting the "intent" entity is not interesting at this stage of our work : this intent can be "book" or "None", which is not useful for us.


Moreover, since in this MVP we do not want to manage any historical aspect, we'll only keep the first utterance of each conversation.

Example of json file format for training LUIS:

```Python
[
  {
    "text": "order a pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "order a large pepperoni pizza",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 6,
        "endCharIndex": 28
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 14,
        "endCharIndex": 28
      },
      {
        "entityName": "Size",
        "startCharIndex": 8,
        "endCharIndex": 12
      }
    ]
  },
  {
    "text": "I want two large pepperoni pizzas on thin crust",
    "intentName": "ModifyOrder",
    "entityLabels": [
      {
        "entityName": "Order",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "FullPizzaWithModifiers",
        "startCharIndex": 7,
        "endCharIndex": 46
      },
      {
        "entityName": "PizzaType",
        "startCharIndex": 17,
        "endCharIndex": 32
      },
      {
        "entityName": "Size",
        "startCharIndex": 11,
        "endCharIndex": 15
      },
      {
        "entityName": "Quantity",
        "startCharIndex": 7,
        "endCharIndex": 9
      },
      {
        "entityName": "Crust",
        "startCharIndex": 37,
        "endCharIndex": 46
      }
    ]
  }
]
```

In [6]:
entities = [
    'budget',
    'or_city',
    'dst_city',
    'str_date',
    'end_date',
 ]

def merge_elements(list_of_same_entities):
    """
    Merge entities of same category into one. E.g. (for the sentence
    'I want to go to Paris on the 5th of May'):

    list_of_same_entities = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 31,
        },
        {
            'entityName': 'str_date',
            'startCharIndex': 36,  # index of 'May'
            'endCharIndex': 38,
        },  
    ]
    output = [
        {
            'entityName': 'str_date',
            'startCharIndex': 29,  # index of '5'
            'endCharIndex': 38, # end index of 'May'
        },  
    ]   
        
    """
    list_of_same_entities = sorted(list_of_same_entities, key=lambda x: x['startCharIndex'])
    entity_name = list_of_same_entities[0]['entityName']
    start_index = list_of_same_entities[0]['startCharIndex']
    end_index = list_of_same_entities[-1]['endCharIndex']
    result = [{    
        'entityName': entity_name,
        'startCharIndex': start_index,
        'endCharIndex': end_index,
    }]
    return result 
    

def extract_args(text, arg, entities=entities):
    """
    Formats args of arg into a list of dicts as follows:
    [
        {
            'entityName': '...',
            'startCharIndex': ...,
            'endCharIndex': ...,
        },  
    ]
    If entities is provided (default behaviour), only those entities are returned.
    Otherwise, all entities are returned.
    """
    results = []
    for item in arg['args']:
        key = item['key']
        val = item['val']
        if key not in entities:
            continue
        if not (val and isinstance(val, str) and val in text):
            continue
        start_index = text.index(val)
        end_index = start_index + len(val)
        results.append({
            'entityName': key,
            'startCharIndex': start_index,
            'endCharIndex': end_index,  
        })
    
    # Some data, such as dates, may be split into multiple entities,
    # we need to merge them
    merged_results = []
    for entity in entities:
        elements_for_entity = [item for item in results 
                                if item['entityName'] == entity]
        if len(elements_for_entity) > 1:
            merged_results.extend(merge_elements(elements_for_entity))
        elif len(elements_for_entity) == 1:
            merged_results.extend(elements_for_entity)
    return merged_results   

    
def extract_intent_name(arg):
    return arg['name'] 


def extract_data(sentence):
    """
    Return selected data from sentence
    (a round of a counversation) in following format:
    {
        'intent': ['...'],
        'entities': [
            {
                'entityName': '...',
                'startCharIndex': ...,
                'endCharIndex': ...,
            },
        ],
    }
    """
    data = dict()
    text = sentence['text']
    args = sentence['labels']['acts_without_refs']
    entities = []
    data['text'] = text
    data['intentName'] = tuple(sorted(list(set([extract_intent_name(arg) for arg in args]))))
    for arg in args:
        entities.extend(extract_args(text, arg))
    data['entityLabels'] = entities
    return data

    

In [7]:
###############################################################################
#  DATA FOR TESTS
###############################################################################

text = "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."

arg_1 = {'args': [
                {'val': 'book', 'key': 'intent'}
                ], 
        'name': 'inform'}

arg_2 = {'args': [
                {'val': 'Caprica', 'key': 'or_city'}, 
                {'val': 'Saturday', 'key': 'str_date'}, 
                {'val': 'August', 'key': 'str_date'}, 
                {'val': '2016', 'key': 'str_date'}, 
                {'val': '8', 'key': 'adults'}, 
                {'val': '1700', 'key': 'budget'}
                ], 
        'name': 'request'}

excerpt_1 = {'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Atlantis', 'key': 'dst_city'},
     {'val': 'Caprica', 'key': 'or_city'},
     {'val': 'Saturday, August 13, 2016', 'key': 'str_date'},
     {'val': '8', 'key': 'n_adults'},
     {'val': '1700', 'key': 'budget'}],
    'name': 'inform'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'budget': [{'val': '1700.0', 'negated': False}],
     'dst_city': [{'val': 'Atlantis', 'negated': False}],
     'or_city': [{'val': 'Caprica', 'negated': False}],
     'str_date': [{'val': 'august 13', 'negated': False}],
     'n_adults': [{'val': '8', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471272019730.0}

excerpt_2 = {'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
 'labels': {'acts': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'acts_without_refs': [{'args': [{'val': 'book', 'key': 'intent'}],
    'name': 'inform'},
   {'args': [{'val': 'Gotham City', 'key': 'dst_city'}], 'name': 'inform'},
   {'args': [], 'name': 'greeting'}],
  'active_frame': 1,
  'frames': [{'info': {'intent': [{'val': 'book', 'negated': False}],
     'dst_city': [{'val': 'Gotham City', 'negated': False}]},
    'frame_id': 1,
    'requests': [],
    'frame_parent_id': None,
    'binary_questions': [],
    'compare_requests': []}]},
 'author': 'user',
 'timestamp': 1471273579715.0}

###############################################################################
# TEST FUNCTIONS
###############################################################################

def test_merge_elements():
    assert merge_elements([
            {
                'entityName': 'str_date',
                'startCharIndex': 29,  # index of '5'
                'endCharIndex': 31,
            },
            {
                'entityName': 'str_date',
                'startCharIndex': 36,  # index of 'May'
                'endCharIndex': 38,
            },  
                        ]) == [
            {
                'entityName': 'str_date',
                'startCharIndex': 29,  # index of '5'
                'endCharIndex': 38, # end index of 'May'
            }   
                            ]

def test_extract_args_unique_args():
    assert extract_args(text, arg_1) == [] # the only entity is the intent, that we don't want to extract

def test_extract_args_multiple_args():
    assert extract_args(text, arg_2) == [
                {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
                {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
                {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}]


def test_extract_intent_name():
  assert extract_intent_name(excerpt_1['labels']['acts'][0]) == 'inform'
  assert extract_intent_name(excerpt_2['labels']['acts'][0]) == 'inform'

def test_extract_data():
  assert extract_data(excerpt_1) == {

    'text': "I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700.",
    'intentName': ('inform',),
    'entityLabels': [

        {'entityName': 'budget', 'startCharIndex': 117, 'endCharIndex': 121},
        {'entityName': 'or_city', 'startCharIndex': 41, 'endCharIndex': 48},
        {'entityName': 'dst_city', 'startCharIndex': 27, 'endCharIndex': 35},
        {'entityName': 'str_date', 'startCharIndex': 52, 'endCharIndex': 77}
                    ],         
                                }
  assert extract_data(excerpt_2) == {
      'text': 'Hello there i am looking to go on a vacation with my family to Gotham City, can you help me?',
      'intentName': ('greeting', 'inform'),
      'entityLabels': [
          {
            'entityName': 'dst_city',
            'startCharIndex': 63,
            'endCharIndex': 74
            }]}
            
###############################################################################
# RUN TESTS
###############################################################################

err_counter = 0
for test_func in [
test_merge_elements,
test_extract_args_multiple_args,
test_extract_args_unique_args,
test_extract_intent_name,
test_extract_data,
]:
    try:
        test_func()
    except AssertionError:
        print('Test failed: {}'.format(test_func.__name__))
        err_counter += 1
if err_counter == 0:
    print('All tests passed!')        


All tests passed!


Let's extract data into the required json format:

In [8]:
all_data = [extract_data(conversation[0]) for conversation in tqdm(frames['turns'].values)]

100%|██████████| 1369/1369 [00:00<00:00, 83707.54it/s]


What are the different intentName values?

In [9]:
intents = set()
for data in all_data:
    intents.update(data['intentName'])
intents    

{'greeting', 'inform', 'request', 'thankyou'}

Let's have a look at examples:

In [10]:
all_data_df = pd.DataFrame(all_data)
all_data_df

Unnamed: 0,text,intentName,entityLabels
0,I'd like to book a trip to Atlantis from Capri...,"(inform,)","[{'entityName': 'budget', 'startCharIndex': 11..."
1,"Hello, I am looking to book a vacation from Go...","(greeting, inform)","[{'entityName': 'budget', 'startCharIndex': 75..."
2,Hello there i am looking to go on a vacation w...,"(greeting, inform)","[{'entityName': 'dst_city', 'startCharIndex': ..."
3,"Hi I'd like to go to Caprica from Busan, betwe...","(greeting, inform)","[{'entityName': 'or_city', 'startCharIndex': 3..."
4,"Hello, I am looking to book a trip for 2 adult...","(greeting, inform)","[{'entityName': 'budget', 'startCharIndex': 67..."
...,...,...,...
1364,Hi I've got 9 days free and I'm looking for a ...,"(inform,)","[{'entityName': 'or_city', 'startCharIndex': 6..."
1365,I need to get to Fortaleza on September 8th or...,"(inform,)","[{'entityName': 'dst_city', 'startCharIndex': ..."
1366,We're finally going on vacation isn't that ama...,"(inform,)","[{'entityName': 'budget', 'startCharIndex': 75..."
1367,"Hi there, I'm looking for a place to get away ...","(inform,)",[]


What are the values in intentName?

In [11]:
intentName = all_data_df['intentName'].apply(lambda x: tuple(x)).unique()
for item in intentName:
    print(item)

('inform',)
('greeting', 'inform')
('greeting', 'inform', 'request')
('greeting',)
('inform', 'request')
('greeting', 'inform', 'thankyou')
()


Which utterances have empty intentName?

In [12]:
all_data_df[all_data_df['intentName'].apply(lambda x: tuple(x))==()]

Unnamed: 0,text,intentName,entityLabels
526,"Have you ever read the book ""Vernon's Travels""?",(),[]
657,psssstttttt,(),[]
1158,Vacay time woooohooooooo,(),[]


Each utterance should have only one intent. Are the multi-intent common?

In [13]:
all_data_df.groupby('intentName').size()

intentName
()                                3
(greeting,)                     102
(greeting, inform)              297
(greeting, inform, request)       5
(greeting, inform, thankyou)      1
(inform,)                       951
(inform, request)                10
dtype: int64

In [14]:
all_data_df[all_data_df['intentName']==('greeting',)]

Unnamed: 0,text,intentName,entityLabels
40,Hi!,"(greeting,)",[]
48,Heyo!,"(greeting,)",[]
52,Good morning.,"(greeting,)",[]
63,Hello wozbot!,"(greeting,)",[]
106,ay whats up?,"(greeting,)",[]
...,...,...,...
1165,Hi. First time trying this out. What do I do?,"(greeting,)",[]
1170,hi,"(greeting,)",[]
1223,Hi,"(greeting,)",[]
1251,Hi,"(greeting,)",[]


There are enough sample with only the `inform` intent and only the `greeting` intent, let's keep only those ones. We'll build samples for the other intents `agree` and `disagree` that are there to check if the bot correctly understood the user.

In [15]:
agree_utterances = [
"Yes, that's right",
"Exactly",
"I confirm",
"Yes",
"Absolutely",
"Yeah, definitely! So happy!",
"That's good, I'm OK with it",
"Yes, what's the next step?",
"OK, what do you propose?",
"Perfect",
"Sounds good to me",
"Yes, I think it's OK",
"Oh yeah, bring it on!",
"Mmm, I think so",
"Let's say yes...",
"Yes, go on",
"OK",
"That's OK",
"Yes!",
"All good",
"OK for me",
"Let's go!",
"You rock",
"Many thanks",
]

disagree_utterances = [
    "No",
    "Not at all",
    "Not really...",
    "Well, no",
    "What?",
    "Absolutely not",
    "This is wrong",
    "I didn't say that",
    "No! No!",
    "That's bullshit!",
    "Really...?",
    "Are you sure?",
    "Mmmm, I'm afraid not",
    "You completely misunderstood",
    "Is it the best you can do?"
    "I'm disappointed",
    "You're garbage",
    "As a matter of fact, no",
    "You're wrong",
    "You got it wrong",
    "That sucks",
    "I want to chat with a real person please",
    "Damn machine!",
    "I'm not sure",
    "I don't know",
    "I don't think so",
    ]

In [16]:
# Select utterances with intent 'inform' only
inform_df = all_data_df[all_data_df['intentName'] ==('inform',)]
inform_df['intentName'] ='inform'

# Select utterances with intent 'greeting' only
greeting_df = all_data_df[all_data_df['intentName'] == ('greeting',)]
greeting_df['intentName'] = 'greeting'
greeting_df = greeting_df.drop_duplicates(subset=['text'])

# Create dataframe with utterances for intent 'agree' (no entity needed)
agree_df = pd.DataFrame(columns=all_data_df.columns)
agree_df['text'] = agree_utterances
agree_df['intentName'] = "agree"
agree_df['entityLabels'] = np.empty((len(agree_df), 0)).tolist()

# Create dataframe with utterances for intent 'disagree' (no entity needed)
disagree_df = pd.DataFrame(columns=all_data_df.columns)
disagree_df['text'] = disagree_utterances
disagree_df['intentName'] = "disagree"
disagree_df['entityLabels'] = np.empty((len(disagree_df), 0)).tolist()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inform_df['intentName'] ='inform'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  greeting_df['intentName'] = 'greeting'


In [17]:
inform_df.shape, greeting_df.shape, agree_df.shape, disagree_df.shape

((951, 3), (64, 3), (24, 3), (25, 3))

We don't need all the utterances to train LUIS: according to the documentation, we can train LUIS iteratively on a first small subset of the utterances, provided that we have a good understanding of the intent. To do so, we'll use the `inform` utterances with most entities.

In [18]:
# Get samples with more than 3 entities in the utterance
inform_df['nb_entities'] = inform_df['entityLabels'].apply(lambda x: len(x))
inform_df = inform_df[inform_df['nb_entities']>3]
inform_df  = inform_df.drop(['nb_entities'], axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inform_df['nb_entities'] = inform_df['entityLabels'].apply(lambda x: len(x))


In [19]:
inform_df.shape

(127, 3)

In [21]:
# Create train and test dataframes (50% train, 50% test)
frac=0.5
random_state = 42

train_inform_df = inform_df.sample(frac=frac, random_state=random_state)
train_greeting_df = greeting_df.sample(frac=frac, random_state=random_state)
train_agree_df = agree_df.sample(frac=frac, random_state=random_state)
train_disagree_df = disagree_df.sample(frac=frac, random_state=random_state)

test_inform_df = inform_df.drop(train_inform_df.index)
test_greeting_df = greeting_df.drop(train_greeting_df.index)
test_agree_df = agree_df.drop(train_agree_df.index)
test_disagree_df = disagree_df.drop(train_disagree_df.index)

# Convert train an test dataframes to list of dictionaries
train_inform = [train_inform_df.loc[i].to_dict() for i in train_inform_df.index]
train_greeting = [train_greeting_df.loc[i].to_dict() for i in train_greeting_df.index]
train_agree = [train_agree_df.loc[i].to_dict() for i in train_agree_df.index]
train_disagree = [train_disagree_df.loc[i].to_dict() for i in train_disagree_df.index]

test_inform = [test_inform_df.loc[i].to_dict() for i in test_inform_df.index]
test_greeting = [test_greeting_df.loc[i].to_dict() for i in test_greeting_df.index]
test_agree = [test_agree_df.loc[i].to_dict() for i in test_agree_df.index]
test_disagree = [test_disagree_df.loc[i].to_dict() for i in test_disagree_df.index]


Test data must follow a different structure than train data:

In [None]:
def change_keys(entity):
    """Return entity with modified keys to comply to test data format.
    E.g.:
    input:
    {'entityName': 'budget', 'startCharIndex': 0, 'endCharIndex': 5}

    output:
    {'entity': 'budget', 'startPos': 0, 'endPos': 5}
    """
    return {'entity': entity['entityName'], 'startPos': entity['startCharIndex'], 'endPos': entity['endCharIndex']}

test_data_df = pd.concat([test_inform_df, test_greeting_df, test_agree_df, test_disagree_df])
test_data_df = test_data_df.reset_index(drop=True)
test_data_df.columns = ['text', 'intent', 'entities']
test_data_df['entities'] = test_data_df.apply(lambda row: [change_keys(item) for item in row['entities']] , axis=1)
test_data = [test_data_df.loc[i].to_dict() for i in test_data_df.index]

In [None]:
# Export train data

with open ('train_inform.json', 'w') as f:
    json.dump(train_inform, f, indent=4)
with open ('train_greeting.json', 'w') as f:
    json.dump(train_greeting, f, indent=4)
with open ('train_agree.json', 'w') as f:
    json.dump(train_agree, f, indent=4)
with open ('train_disagree.json', 'w') as f:
    json.dump(train_disagree, f, indent=4)

# Export test data
with open('test_data.json', 'w') as f:
    json.dump(test_data, f, indent=4)

Data are ready.