In [1]:
import logging
# Single source of truth for the log line layout (timestamp - logger - level - message).
format_log = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# Everything from DEBUG upward is written to a log file next to the notebook.
logging.basicConfig(filename='process_cross_validated.log', format=format_log, level=logging.DEBUG, datefmt='%m/%d/%Y %I:%M:%S %p')

# Process Dialogflow K-fold Cross Validation

## Overview
This is a Jupyter notebook that runs a stratified K-fold cross validation on Dialogflow training examples. Cross validation tests the model's ability to generalize: in each fold, a random portion of the training data is held out and used only for testing. Testing the model on data it has never seen before helps validate what the model has learned. If you tested only on data the model had already seen, it would be like looking at a test's answers before taking the test, which is not a great evaluation of what was learned.  

To learn more about K-fold view the links below:
+ [YouTube: Machine Learning Video on Cross Validation](https://www.youtube.com/watch?v=fSytzGwwBVw)
+ [Medium: 5 Reasons why you should use Cross-Validation in your Data Science Projects](https://towardsdatascience.com/5-reasons-why-you-should-use-cross-validation-in-your-data-science-project-8163311a1e79)

### Notebook Steps
The high level steps followed in the notebook are listed below:
1. Initialize all the parameters you need to run the notebook
2. Load the functions
3. Get intents, entities, training phrases and knowledge bases from Main Dialogflow agent
4. Create k-folds and Loop through each k-fold partition:
    + Build a dictionary to flatten the intents with phrases
    + Creates a new agent
    + Uploads intents, knowledge base and training phrases to the new agent
    + Trains the new agent
    + Makes intent predictions with the test data held out for validation
    + Puts the test phrase, actual intent, Dialogflow predicted intent and Dialogflow predicted confidence score into a pandas table
    + Appends the pandas table to a list to keep the cross-validation results from all the loops.
5. Save the cross validation results


## 1. Initialize all the parameters you need to run the notebook

In [2]:
# import libraries
logging.info("Processing the k-fold cross-validation started")

import os
import time
import pandas as pd
import numpy as np
import pickle
from typing import List

from datetime import datetime
from tqdm import tqdm
from random import seed
from random import randint
from sklearn.model_selection import StratifiedKFold

#from google.cloud import dialogflow
from google.cloud import dialogflow_v2beta1 as dialogflow

logging.info("Python libraries loaded")

In [3]:
# Google Cloud JSON credentials
# Only a bare file name is given here, so the key file is expected to be
# reachable from the notebook's working directory.
google_cloud_json_credentials_file_name = 'google_cloud_credentials.json'
# Exported through the process environment; presumably picked up by the
# Dialogflow clients created further down — TODO confirm.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = google_cloud_json_credentials_file_name

logging.info("Google Cloud Credentials Loaded")

In [4]:
# project id of the main agent (the source of intents, entities and knowledge bases)
main_project_id = 'burgerbot-bskc'

# project id of the test agent (deleted and re-created during cross validation)
test_project_id = 'burgerbot-bskc-test-agent'

# the display name for the test agent in Dialogflow
test_agent_display_name = 'nlp_testing_agent'

# folder of the Excel file used to filter trainable intents.
# It must end with a path separator: load_trainable_intents concatenates it
# directly with the file name.
trainable_file_path = 'data/input/trainable_intents/'
# file name of the Excel file used to filter trainable intents
trainable_file_name = 'DoloresTraining_Current_4.xlsx'
# sheet name inside that Excel file
trainable_sheet_name = 'EN locale - Training Queries'
# Excel column letters to load (passed to pandas.read_excel as `usecols`)
trainable_use_cols = 'A,C,I,L'

# the default language for the test agent
default_language_code="en"

# the time zone for the test agent
use_time_zone="America/Los_Angeles"

# api version of agent
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
api_version = 'API_VERSION_V2_BETA_1'

# pricing tier to use for the Dialogflow test agent
use_tier="TIER_STANDARD"
#use_tier="TIER_ENTERPRISE"

# number of k-folds to complete
number_of_k_splits = 5

# the pause, in seconds, used between bursts of API requests to avoid API quota issues (recommend 60 sec)
api_sleep_time = 60

# the number of requests to complete before pausing, to avoid API quota issues (recommend 50-55)
api_interval_limit = 50

logging.info("Parameters initialized")

In [5]:
# the file name to save the results to; it is prefixed with the current
# timestamp, so each run gets a new file name
now = datetime.now()
dt_string = now.strftime("%Y_%m_%d_%H_%M_%S")
save_file_name = "{}_cross_valid_results_.pickle".format(dt_string)
save_file_path = "data/processed/"

# creating the resource paths of the main agent and the test agent that will be used for API calls
main_project_path = 'projects/{}'.format(main_project_id)
main_agent_path = 'projects/{}/agent'.format(main_project_id)

test_project_path = 'projects/{}'.format(test_project_id)
test_agent_path = 'projects/{}/agent'.format(test_project_id)

# creating all the clients needed to complete the job
# (one client per Dialogflow resource type; they are used as module-level
# globals by the cells and by some of the helper functions below)
agent_client = dialogflow.AgentsClient()
intent_client = dialogflow.IntentsClient()
entity_type_client = dialogflow.EntityTypesClient()
session_client = dialogflow.SessionsClient()
kb_client = dialogflow.KnowledgeBasesClient()
doc_client = dialogflow.DocumentsClient()

# initializing the intent view that get_training_phrases reads as a global
# when it builds its GetIntentRequest
intent_view = dialogflow.IntentView.INTENT_VIEW_FULL


logging.info("Initial variables created")

## 2. Load Functions

In [6]:
def get_training_phrases(dict_key, intent_client):
    '''
    Overview: A function to get training phrases associated with a Dialogflow intent
    Depends On Function: None
    Constraints:
        1. Reads the notebook-level globals intent_dict, intent_view and dialogflow.
        2. intent_dict must be keyed by intent path. Each value is either
           a dict with an 'intent_name' entry (the shape produced by filter_trainable_intents), or
           a tuple of (display name, intent id) (the shape produced by create_intent_list), e.g.

    'projects/burgerbot-bskc/agent/intents/0026647a-e6a1-46bb-9835-bc7e4962efe5': ('faq1.00-warranty.square.trade','0026647a-e6a1-46bb-9835-bc7e4962efe5'),

    Input:
        1. dict_key (str): The intent path used as the key in intent_dict
        2. intent_client (Dialogflow intent client session): Ex: intent_client = dialogflow.IntentsClient()

    Output:
        1. intent_name (str): The name of intent in string format
        2. phrases (list): All the phrases associated with the intent in list format

    '''
    # Look the entry up with the argument that was passed in, not with whatever
    # global loop variable happens to be called `key` at call time.
    intent_entry = intent_dict[dict_key]
    if isinstance(intent_entry, dict):
        intent_name = intent_entry['intent_name']
    else:
        # Unfiltered dictionaries store (display name, intent id) tuples.
        intent_name = intent_entry[0]

    get_intent_request = dialogflow.GetIntentRequest(name=dict_key, intent_view=intent_view)
    intent_returned = intent_client.get_intent(get_intent_request)
    phrases = list(intent_returned.training_phrases)
    return intent_name, phrases



def upload_training_phrases(dict_key, intent_client):
    '''
    Overview: A function to upload one intent and its training phrases to the test Dialogflow agent
    Depends On Function: None
    Constraints:
        1. Reads the notebook-level globals phrase_dict, test_agent_path and dialogflow.
        2. phrase_dict must be a properly formatted dictionary object: {intent name : [phrase 1, phrase 2, phrase 3...phrase x]}
    Input:
        1. dict_key (str): The intent name; used both as the key into phrase_dict and as the display name of the created intent
        2. intent_client (Dialogflow intent client session): Ex: intent_client = dialogflow.IntentsClient()
    Output:
        1. Intent created in Dialogflow with associated training phrases (nothing is returned)
    '''
    intent_name = dict_key #save intent name

    # Look the phrases up with the argument that was passed in, not with a
    # global loop variable that happens to be called `key`.
    # NOTE(review): each element is passed as the *text* of a single Part. In the
    # k-fold cell phrase_dict appears to be filled with lists of Dialogflow parts —
    # confirm the caller converts them to raw text before calling this function.
    training_phrases = []
    for training_phrases_part in phrase_dict[dict_key]:
        part = dialogflow.Intent.TrainingPhrase.Part(
            text=training_phrases_part)
        # Here we create a new training phrase for each provided part.
        training_phrases.append(dialogflow.Intent.TrainingPhrase(parts=[part]))

    intent = dialogflow.Intent(display_name = intent_name, training_phrases = training_phrases)
    create_intent_request = dialogflow.CreateIntentRequest(parent=test_agent_path, intent=intent)
    try:
        intent_client.create_intent(create_intent_request)
    except Exception as e:
        # Best effort: a failed upload of one intent must not abort the whole fold,
        # so the error is shown and the function returns normally.
        print(e)
    
def batch_upload_training_phrases(dict_key, intent_client):
    '''
    Overview: Placeholder for a batch upload of intents and training phrases. Not implemented yet.
    Depends On Function: None
    Constraints: None
    Input:
        1. dict_key (str): The intent name
        2. intent_client (Dialogflow intent client session): Currently unused
    Output:
        1. None. No API call is made.
    '''
    # Only remembers the intent name locally; nothing else happens yet.
    display_name = dict_key
    return None
    

def random_with_N_digits(n):
    '''
    Overview: A function to create a random integer with exactly n digits (for a Dialogflow session)
    Depends On Function: None
    Constraints: Input must be an integer greater than or equal to 1
    Input:
        1. n (int): The number of digits wanted. Ex: 9
    Output:
        1. A random integer with n digits (int). Ex for n=9: 227454785
    Raises:
        ValueError: if n is smaller than 1
    '''
    # Without this guard 10**(n-1) becomes a float for n < 1 and randint fails
    # with an error that does not mention the real cause.
    if n < 1:
        raise ValueError("n must be a positive integer, got {}".format(n))

    smallest = 10 ** (n - 1)   # e.g. 100000000 for n=9
    largest = (10 ** n) - 1    # e.g. 999999999 for n=9
    return randint(smallest, largest)


def dialogflow_prediction(input_text, language_code, session, query_params):
    '''
    Overview: Ask Dialogflow to classify one phrase and return the winning intent with its confidence
    Depends On Function: None
    Constraints:
        1. Uses the notebook-level session_client to send the request.
        2. A Dialogflow session must exist and the text must be shorter than 768 characters (10/1/21).
    Input:
        1. input_text (str): The phrase you want to predict.
        2. language_code (str): The language of the phrase
        3. session (dialogflow session): The session path the request is sent under
        4. query_params (dialogflow query parameters): Passed through to the request unchanged, e.g. to point at a knowledge base
    Output:
        1. predicted intent name (str), confidence score of prediction (float)

    '''
    # Build the request from the inside out: text -> query input -> detect-intent request.
    detect_request = dialogflow.DetectIntentRequest(
        session=session,
        query_input=dialogflow.QueryInput(
            text=dialogflow.TextInput(text=input_text, language_code=language_code)
        ),
        query_params=query_params,
    )
    query_result = session_client.detect_intent(request=detect_request).query_result
    return query_result.intent.display_name, query_result.intent_detection_confidence

def create_entity_type_list_from_DF_list(dialogflow_entity_list):
    '''
    Overview: Copy the entity types of an agent into fresh EntityType objects that can be uploaded to another agent
    Depends On Function: None
    Constraints: Entity list must be iterable, e.g. the pager returned by
                 entity_type_client.list_entity_types(parent=agent_path)
    Input:
        1. dialogflow_entity_list (iterable): The Dialogflow entity types to copy
    Output:
        1. entity_upload_list (list): New EntityType objects carrying only display_name, kind and entities,
           ready to be sent to Dialogflow in one batch
    '''
    # Materialise the input first so every source entity type is fetched
    # before any copy is built.
    source_entity_types = list(dialogflow_entity_list)

    return [
        dialogflow.EntityType(
            display_name=source.display_name,
            kind=source.kind,
            entities=source.entities,
        )
        for source in source_entity_types
    ]


def batch_create_entity_types(entity_list, entity_client, agent_path):
    '''
    Overview: Send a list of entity types to a Dialogflow agent in a single batch-update request
    Depends On Function: None
    Constraints:
        1. The entity_list must be in the format that create_entity_type_list_from_DF_list outputs.
        2. The entity types must NOT already exist in the agent they are being added to.
    Input:
        1. entity_list (list): The EntityType objects to upload
        2. entity_client (Dialogflow EntityTypesClient): The Dialogflow entity client used to make API calls to entity types
        3. agent_path (str): The path of the agent to upload to. Ex: projects/burgerbot-bskc-test-agent/agent

    Outputs:
        1. response: Whatever entity_client.batch_update_entity_types returns
    '''
    update_request = dialogflow.BatchUpdateEntityTypesRequest(
        parent=agent_path,
        entity_type_batch_inline=dialogflow.EntityTypeBatch(entity_types=entity_list),
    )
    # The call is given a 120 second timeout.
    return entity_client.batch_update_entity_types(update_request, timeout=120)


def create_intent_list(intents, ignore_ml_disabled=True):
    '''
    Overview: A function to create a dictionary of intents to create in the test agent
    Depends On: None
    Constraints: Function has been tested mostly ignoring intents disabled for machine learning.
    Input:
        1. intents (ListIntentsPager): A list of intents from Dialogflow API. Each item needs the
           attributes name, display_name and ml_disabled.
        2. ignore_ml_disabled (bool): When True (default) intents with ml_disabled set are left out.
           When False every intent is included.
    Outputs:
        1. intent_dict (dict): A dictionary containing intent path as the key and the intent display name and intent id as values
        Ex: projects/burgerbot-bskc/agent/intents/01a0d3bf-582a-4f4a-81b2-67763d65f6b4': ('ask_transfer_charge', '01a0d3bf-582a-4f4a-81b2-67763d65f6b4')

    '''
    intent_dict = {}
    for intent in intents:
        # Skip ML-disabled intents only when the caller asked for that.
        if ignore_ml_disabled and intent.ml_disabled:
            continue

        intent_path = intent.name
        # The intent id is the last segment of the resource path.
        intent_id = str(intent_path).split("/")[-1]
        intent_dict[intent_path] = (intent.display_name, intent_id)

    return intent_dict
            

def load_trainable_intents(trainable_file_path, trainable_file_name, trainable_sheet_name, trainable_use_cols, google_sheet=False):
    '''
    Overview: A function to load an Excel file of intents to train and not train in Dialogflow
    Depends On: Having an excel sheet in a certain format of intents to train
    Constraints: The format of the Excel spreadsheet should not change
        - google_sheet=False expects the columns 'intentName', 'text', 'trainable?' and 'utterance_change'
        - google_sheet=True expects the columns 'intent name' and 'Training?'
    Input:
        1. trainable_file_path (str): The path on local computer the excel spreadsheet of trainable intents exists.
           It is concatenated directly with the file name, so it must end with a path separator.
        2. trainable_file_name (str): The file name of the excel spreadsheet of trainable intents
        3. trainable_sheet_name (str): The Excel sheet name to load of the Excel spreadsheet of trainable intents
        4. trainable_use_cols (str): A list of columns to load from the Excel spreadsheet. Ex: A,B,G
        5. google_sheet (bool): Selects which of the two column layouts above the sheet uses. The file is read
           with pandas.read_excel in both cases.
    Output:
        1. train_intent_df (pandas dataframe): A dataframe indexed by 'intent name' with a 'train' column.
           For google_sheet=False only trainable intents are returned and 'train' is always 1;
           for google_sheet=True every row is returned with 'train' set to 1 or 0.
    '''
    sheet_df = pd.read_excel(trainable_file_path+trainable_file_name, sheet_name=trainable_sheet_name, usecols=trainable_use_cols)

    if google_sheet:
        train_intent_df = sheet_df.reset_index(drop=True).rename(columns={'Training?':'train'})
        train_intent_df['train'] = np.where(train_intent_df['train']=='Yes',1,0)
        return train_intent_df.set_index('intent name')

    # .copy() so the column assignment below works on an independent frame and
    # does not trigger pandas' SettingWithCopyWarning.
    trainable_df = sheet_df[sheet_df['trainable?'] == 'Yes'].copy()

    # Normalise the change marker; astype(str) keeps stray non-text cells from
    # raising while the text is stripped and lower-cased.
    trainable_df['utterance_change'] = (
        trainable_df['utterance_change'].fillna('').astype(str).str.strip().str.lower()
    )

    # Utterances marked for removal do not count towards an intent.
    trainable_df = trainable_df[trainable_df['utterance_change'] != 'remove']
    trainable_df = trainable_df.rename(columns={'intentName': 'intent name', 'text':'train'})

    # One row per intent that still has at least one non-empty utterance; flag it as trainable.
    intent_count = trainable_df.groupby('intent name')['train'].count().reset_index()
    intent_count['train'] = 1
    return intent_count.set_index('intent name')


def filter_trainable_intents(dialogflow_intent_dict, trainable_file_path, trainable_file_name, trainable_sheet_name, trainable_use_cols): 
    '''
    Overview: Keep only the Dialogflow intents that the Excel sheet marks as trainable
    Depends On:
        1. Having an excel sheet in a certain format of intents to train
        2. The function load_trainable_intents
    Constraints: None
    Input:
        1. dialogflow_intent_dict (dict): Intent path -> (display name, intent id), as built by create_intent_list
        2. trainable_file_path (str): The path on local computer the excel spreadsheet of trainable intents exists
        3. trainable_file_name (str): The file name of the excel spreadsheet of trainable intents
        4. trainable_sheet_name (str): The Excel sheet name to load of the Excel spreadsheet of trainable intents
        5. trainable_use_cols (str): A list of columns to load from the Excel spreadsheet. Ex: A,B,G
    Output:
        1. (dict): Intent path -> dict of the remaining columns (intent_name, intent_id, train)
           for every intent whose display name is flagged as trainable
    '''
    df_train = load_trainable_intents(trainable_file_path, trainable_file_name, trainable_sheet_name, trainable_use_cols)

    # One row per intent, indexed by display name, joined with the trainable flags.
    merged = (
        pd.DataFrame.from_dict(dialogflow_intent_dict)
        .transpose()
        .reset_index()
        .rename(columns={'index':'intent_path',0:'intent_name',1:'intent_id'})
        .set_index('intent_name')
        .merge(df_train, how='left', left_index=True, right_index=True)
    )

    # Intents missing from the sheet get no flag from the left join; treat them as not trainable.
    merged['train'] = merged['train'].fillna(0).astype(int)

    trainable = (
        merged[merged['train']==1]
        .reset_index()
        .rename(columns={'index':'intent_name'})
        .drop_duplicates(subset=['intent_path'], keep='last')
        .set_index('intent_path')
    )
    return trainable.to_dict(orient='index')

def pepare_df_text_for_prediction(dialogflow_phrases):
    '''
    Overview: A function to transform the training data which is in Dialogflow parts to raw text to make predictions.
    Depends On Function: None
    Constraints: The input must be an iterable of phrases, where each phrase is a sequence of
                 Dialogflow parts (objects with a string attribute `text`)
    Input:
        1. dialogflow_phrases (array): An array of Dialogflow parts of training phrases
    Output:
        1. prediction_list (list): A list of training phrases converted from Dialogflow parts to raw text.
           It has exactly one entry per input phrase; a phrase without parts becomes an empty string.
    '''
    # Concatenating the part texts without a separator gives the same result for
    # one part as for many, and also copes with a phrase that has no parts.
    return ["".join(part.text for part in phrase_parts) for phrase_parts in dialogflow_phrases]


# get existing knowledge ids
def get_existing_knowledge_graph_id(project_id, client, keep_list):
    '''
    Overview: List the knowledge bases of a project and keep only those whose display name is wanted
    Depends On Function: None
    Constraints: None
    Input:
        1. project_id (str): The main agent's project
        2. client (Dialogflow KnowledgeBasesClient): The initialized Dialogflow client to work with knowledge bases
        3. keep_list (list): The display names of the knowledge bases from the Main Agent you want to copy to the test agent.
    Output:
        1. A list of the matching knowledge base objects, in the order the API returned them.
    '''
    all_knowledge_bases = client.list_knowledge_bases(parent='projects/{}'.format(project_id))

    # Filter on the display name; everything not in keep_list is dropped.
    return [
        candidate
        for candidate in all_knowledge_bases
        if candidate.display_name in keep_list
    ]


def get_knowledge_base(project_id, kb_client, knowledge_base_id):
    '''
    Overview: Fetch a single knowledge base from Dialogflow by its id
    Depends On Function: None
    Constraints: None
    Input:
        1. project_id (str): The Google Cloud project the Dialogflow bot is in
        2. kb_client (Dialogflow Knowledge Base Client): The Dialogflow knowledge base client
        3. knowledge_base_id (str): The Dialogflow knowledge base id of the knowledge base you want to retrieve.
    Output:
        1. Whatever kb_client.get_knowledge_base returns for that knowledge base
    '''
    # The client builds the full resource path; the lookup is given a 240 second timeout.
    return kb_client.get_knowledge_base(
        name=kb_client.knowledge_base_path(project_id, knowledge_base_id),
        timeout=240,
    )

def get_knowledge_base_docs(project_id, kb_client, doc_client, knowledge_base_id):
    '''
    Overview: List the documents stored in one Dialogflow knowledge base
    Depends On Function: None
    Constraints: None
    Input:
        1. project_id (str): The Google Cloud project the Dialogflow bot is in
        2. kb_client (Dialogflow Knowledge Base Client): Only used to build the knowledge base resource path
        3. doc_client (Dialogflow Document Client): The Dialogflow document client that performs the listing
        4. knowledge_base_id (str): The Dialogflow knowledge base id of the knowledge base you want to retrieve documents from.
    Output:
        1. Whatever doc_client.list_documents returns for that knowledge base
    '''
    parent_path = kb_client.knowledge_base_path(project_id, knowledge_base_id)
    # The listing is given a 240 second timeout.
    return doc_client.list_documents(parent=parent_path, timeout=240)


def create_empty_knowledge_base(project_id, kb_client, display_name):
    '''
    Overview: Create a knowledge base without any documents in a Dialogflow project
    Depends on Function: None
    Constraints: None
    Input:
        1. project_id (str): The Google Cloud project the Dialogflow bot is in
        2. kb_client (Dialogflow Knowledge Base Client): The Dialogflow knowledge base client
        3. display_name (str): The name you want the new knowledge base to have
    Output:
        1. Whatever kb_client.create_knowledge_base returns for the new knowledge base

    '''
    # Only the display name is set on the new knowledge base; the call is given a 120 second timeout.
    return kb_client.create_knowledge_base(
        parent='projects/{}'.format(project_id),
        knowledge_base=dialogflow.KnowledgeBase(display_name=display_name),
        timeout=120,
    )

def add_doc_to_knowledge_base(project_id, doc_client, knowledge_base_id, display_name, mime_type, raw_content, knowledge_type):
    '''
    Overview: Create a document inside a Dialogflow knowledge base and print its name once it exists
    Depends On Function: None
    Constraints: None
    Input:
        1. project_id (str): The Google Cloud project the Dialogflow bot is in
        2. doc_client (Dialogflow Document Client): The Dialogflow document client
        3. knowledge_base_id (str): The id of the knowledge base the document is added to
        4. display_name (str): The name you want the new document to have
        5. mime_type (str): The MIME type of the Dialogflow document.
        6. raw_content (str): The raw content of the Dialogflow document. This field is only permitted for EXTRACTIVE_QA and FAQ knowledge types.
        7. knowledge_type (str): The name of a member of dialogflow.Document.KnowledgeType, like FAQ or EXTRACTIVE_QA.
    Output:
        1. Document created in Dialogflow knowledge base. Nothing is returned; the display name and
           resource name of the created document are printed.
    '''
    parent_kb_path = dialogflow.KnowledgeBasesClient.knowledge_base_path(project_id, knowledge_base_id)

    new_document = dialogflow.Document(display_name=display_name, mime_type=mime_type, raw_content=raw_content)
    # The knowledge type arrives as a name and is resolved to the enum member here.
    type_member = getattr(dialogflow.Document.KnowledgeType, knowledge_type)
    new_document.knowledge_types.append(type_member)

    # create_document hands back an object whose result is awaited, with a 240 second timeout.
    pending = doc_client.create_document(parent=parent_kb_path, document=new_document)
    created = pending.result(timeout=240)

    print("Created Document:")
    print(" - Display Name: {}".format(created.display_name))
    print(" - Knowledge ID: {}".format(created.name))
    
logging.info("Functions created")   

## 3. Get intents, entities, training phrases and knowledge bases from Main Dialogflow agent

In [7]:
# get list of intents from main agent
# create_intent_list is called with its default flag, so intents that are
# disabled for machine learning are left out of intent_dict.

intent_list = intent_client.list_intents(parent=main_agent_path)
intent_dict = create_intent_list(intent_list)

logging.info("Received list of intents from main agent") 

In [8]:
# if you want to filter from a file, you can use this function. If not, comment it out.
# NOTE: this call replaces intent_dict in place and changes the shape of its values
# from (display name, intent id) tuples to dicts with an 'intent_name' entry, so
# re-running this cell without re-running the previous one feeds it already-filtered data.

intent_dict = filter_trainable_intents(intent_dict, trainable_file_path, trainable_file_name, trainable_sheet_name, trainable_use_cols) 

logging.info("Filtered intents based off Excel file") 

In [9]:
# get list of all entity types from main agent
# The name entity_types_list is used twice: first for the API listing, then for
# the list of fresh EntityType copies that can be uploaded to the test agent.

entity_types_list = entity_type_client.list_entity_types(parent=main_agent_path)
entity_types_list = create_entity_type_list_from_DF_list(entity_types_list)

logging.info("Received entities from main agent") 

In [10]:
# get intent training phrases

train_intent = []
intent_assoc_parts = []

# This loop walks over every intent path in intent_dict and fetches its training
# phrases with get_training_phrases.
#
# To stay under the Dialogflow API quota, the loop counts its requests. Once
# api_interval_limit requests have been made it pauses for api_sleep_time seconds
# and starts counting again.
#
# A phrase from Dialogflow is made of parts (plain text or entities). For every
# phrase, the intent name goes into train_intent and the list of its parts goes
# into intent_assoc_parts, so the two lists line up one-to-one.
#
# An intent with fewer than two training phrases cannot be cross-validated,
# so such intents are skipped.
#
# NOTE: keep the loop variable named `key`; other code in this notebook may read
# the global `key`.
count = 0
for key in tqdm(intent_dict):
    if count >= api_interval_limit:
        time.sleep(api_sleep_time)
        count = 0

    intent_name, phrases = get_training_phrases(key, intent_client)
    count += 1

    if len(phrases) < 2:
        continue

    for phrase in phrases:
        train_intent.append(intent_name)
        intent_assoc_parts.append(list(phrase.parts))

logging.info("Received all intent training phrases")

100%|██████████| 244/244 [05:17<00:00,  1.30s/it]


In [11]:
# get existing knowledge base ids from main agent
# Only the knowledge base with the display name 'QA_Pairs_US' is kept.
existing_kbs = get_existing_knowledge_graph_id(main_project_id, kb_client, keep_list=['QA_Pairs_US'])
# NOTE(review): raises IndexError when the main agent has no knowledge base with that display name.
knowledge_base = existing_kbs[0]
# The knowledge base id is the last segment of its resource name.
knowledge_base_id = knowledge_base.name.split('/')[-1]

logging.info("Received all knowledge base ids from main agent")

In [12]:
# get documents from existing knowledge base in main agent
existing_docs = get_knowledge_base_docs(main_project_id, kb_client, doc_client, knowledge_base_id)
# NOTE(review): the US and UK documents are picked purely by position in the
# listing (index 1 and index 0). This assumes the knowledge base holds at least
# two documents and that they come back in this order — TODO confirm, or select
# them by display name instead.
us_doc = existing_docs.documents[1]
uk_doc = existing_docs.documents[0]

logging.info("Received all relevant documents from main agent")

In [13]:
# Prepares data from the training-phrase loop above to be put in a pandas dataframe:
# one row per training phrase, holding the phrase's parts and its intent name.
input_data = list(zip(intent_assoc_parts, train_intent))
data = pd.DataFrame(data=input_data, columns=['parts','intent'])

# X: the inputs, i.e. the training phrases (each one a list of Dialogflow parts)
# y: the labels to predict, i.e. the intent name of each phrase
X = data['parts'].values
y = data['intent'].values

# NOTE: nothing is split into train and test here; the split happens per fold
# in the k-fold loop further down.
logging.info("Split data into train and test")

In [14]:
#prepare Dialogflow parts for prediction by converting parts to text
#prediction_list = pepare_df_text_for_prediction(X)

## 4. Create k-folds and Loop through each k-fold partition:

In [15]:
#fold the data
# Shuffled stratified folds with a fixed random_state, so the folds are the same on every run.
skf  = StratifiedKFold(n_splits=number_of_k_splits, shuffle=True, random_state=100)

#delete agent if one already exists
print("Deleting old project if one exists")
try:
    agent_client.delete_agent(parent=test_project_path, timeout=120)
except Exception as e:
    # Best effort: the error is reported and the notebook carries on.
    print(e)
    logging.warning("Could not delete the old agent: {}".format(e))
else:
    # Only claim success when the delete call actually went through.
    logging.info("Deleted the old agent")

# wait before starting to prevent API quota issues
print("Sleeping for {} seconds...".format(api_sleep_time))
time.sleep(api_sleep_time)


# loop through the folded data
print("Looping through the folded data")
logging.info("Looping through the folded data")

test_num = 1
results_df = []
for train_index, test_index in skf.split(X, y):
    logging.info("The {} k-fold loop started".format(test_num))
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # build dictionary of intents with values as list of training phrases
    key_set = list(set(y_train))
    phrase_dict = {key.strip() : [] for key in key_set}
        
    for i in range(len(y_train)):
        key = y_train[i].strip()
        value = X_train[i]
        phrase_dict[key].append(value)
        
    logging.info("Built dictionary of intents with training phrases")
        
    #create agent
    print("Creating agent")
    agent = dialogflow.Agent(parent=test_project_path, display_name=test_agent_display_name, default_language_code=default_language_code, time_zone=use_time_zone, tier=use_tier)
    response = agent_client.set_agent(request={"agent": agent}, timeout=240)
    time.sleep(60) #added in case it goes too fast
    logging.info("Created the agent")
    
    # create new empty knowledge base
    new_kb = create_empty_knowledge_base(test_project_id, kb_client, knowledge_base.display_name)
    new_kb_id = new_kb.name.split("/")[-1]
    new_kb_path = dialogflow.KnowledgeBasesClient.knowledge_base_path(test_project_id, new_kb_id)
    time.sleep(10) #added in case it goes too fast
    logging.info("Created a new empty knowledge base")

    # add us document to new knowledge base
    display_name = us_doc.display_name
    mime_type = us_doc.mime_type
    raw_content = us_doc.raw_content
    knowledge_type = 'FAQ'
    add_doc_to_knowledge_base(test_project_id, doc_client, new_kb_id, display_name, mime_type, raw_content, knowledge_type)
    time.sleep(10) #added in case it goes too fast
    logging.info("Added US document to knowledge base")
    
    # add uk document to new knowledge base
    display_name = uk_doc.display_name
    mime_type = uk_doc.mime_type
    raw_content = uk_doc.raw_content
    knowledge_type = 'FAQ'
    add_doc_to_knowledge_base(test_project_id, doc_client, new_kb_id, display_name, mime_type, raw_content, knowledge_type)
    time.sleep(10) #added in case it goes too fast
    logging.info("Added UK document to knowledge base")
    
    # batch upload entity types from main agent to test agent.
    batch_create_entity_types(entity_types_list, entity_type_client, test_agent_path)
    time.sleep(60) #added in case it goes too fast
    logging.info("Uploaded entity types from main agent to test agent")
    
    #batch upload intents and training phrases to new agent
    intent_upload_list = []
    for key,v in tqdm(phrase_dict.items()):
        intent_name = key #save intent name
        if intent_name != 'Default Welcome Intent':
            training_phrases = []
            training_phrases_parts = phrase_dict[key]
            #print(training_phrases_parts)
            for part in training_phrases_parts:
                #print(part)
                training_phrase = dialogflow.Intent.TrainingPhrase(parts=part)
                training_phrases.append(training_phrase)
            intent = dialogflow.Intent(display_name = intent_name, training_phrases = training_phrases)
            intent_upload_list.append(intent)
        else:
            pass

    intents: List[dialogflow.Intent] = intent_upload_list
    intent_batch = dialogflow.IntentBatch(intents=intents)
    batch_update_intent_request = dialogflow.BatchUpdateIntentsRequest(parent=test_agent_path, intent_batch_inline=intent_batch)
    response = intent_client.batch_update_intents(batch_update_intent_request, timeout=120)
    logging.info("Batch uploaded intents and training phrases to new agent")

    # train the agent #1
    print("Training Agent 1")
    train_agent_request = dialogflow.TrainAgentRequest(parent=test_project_path)
    response = agent_client.train_agent(train_agent_request, timeout=240)
    time.sleep(120) #added in case it goes too fast
    logging.info("Trained the agent the 1st time")
    
    # train the agent #2
    print("Training Agent 2")
    train_agent_request = dialogflow.TrainAgentRequest(parent=test_project_path)
    response = agent_client.train_agent(train_agent_request, timeout=240)
    time.sleep(120) #added in case it goes too fast
    logging.info("Trained the agent the 2nd time")

    
    # create a new session with agent
    session = session_client.session_path(test_project_id, random_with_N_digits(9))
    logging.info("Created a new session")
    
    #prepare Dialogflow parts for prediction by converting parts to text
    prediction_list = pepare_df_text_for_prediction(X_test)
    logging.info("Converted Dialogflow parts to text for prediction")

    #predict the intent name and confidence score
    print("Making predictions")
    logging.info("Started making predictions")
    predicted_intent_name = []
    predicted_confid_score = []
    
    query_params = dialogflow.QueryParameters(knowledge_base_names=[new_kb_path])
    
    
    count = 0
    for i in tqdm(range(len(prediction_list))):
        if count < api_interval_limit:
            
            rpc_error = True
            while rpc_error==True:
                try:
                    predicted_intent, confidence_score = dialogflow_prediction(prediction_list[i], default_language_code, session, query_params)
                    rpc_error = False
                except:
                    rpc_error = True
            predicted_intent_name.append(predicted_intent)
            predicted_confid_score.append(confidence_score)
            count +=1   
            
        elif count ==api_interval_limit:
            time.sleep(api_sleep_time)
            
            rpc_error = True
            while rpc_error==True:
                try:
                    predicted_intent, confidence_score = dialogflow_prediction(prediction_list[i], default_language_code, session, query_params)
                    rpc_error = False
                except:
                    rpc_error = True
                
            predicted_intent_name.append(predicted_intent)
            predicted_confid_score.append(confidence_score)
            count = 0
    logging.info("Ended making predictions")
    
    # create dataframe of actual vs predicted
    data = list(zip(prediction_list, y_test, predicted_intent_name, predicted_confid_score))
    columns = ['text','actual_intent','pred_intent','pred_conf']
    df_f1 = pd.DataFrame(data, columns=columns)
    df_f1['test_num'] = test_num
    results_df.append(df_f1)
    logging.info("Created dataframe of actual vs predicted")
    
    #delete agent once done to create a new agent with different training phrases
    print("Deleting Agent")
    response = agent_client.delete_agent(parent=test_project_path, timeout=120)
    logging.info("Agent Deleted")
    
    test_num += 1
    logging.info("The {} k-fold loop completed".format(test_num))
     
    

Deleting old project if one exists
'NoneType' object has no attribute 'Call'
Sleeping for 60 seconds...
Looping through the folded data
Creating agent
Created Document:
 - Display Name: QA_Pairs_US
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/NzI3MTA4MTM4OTU5ODk2NTc2/documents/MTczNDI4MTczNTY5OTIyMTcwODg
Created Document:
 - Display Name: QA_Pairs_UK
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/NzI3MTA4MTM4OTU5ODk2NTc2/documents/OTg0ODgyNzU3NzA0NzcxMTc0NA


100%|██████████| 243/243 [00:00<00:00, 1472.77it/s]


Training Agent 1
Training Agent 2
Making predictions


100%|██████████| 3997/3997 [1:38:10<00:00,  1.47s/it]  


Deleting Agent
Creating agent
Created Document:
 - Display Name: QA_Pairs_US
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTMxMDA3NDgxMTUxNjAzMzQzMzY/documents/MTQwNjg3MDA0Mjc4OTM4NjY0OTY
Created Document:
 - Display Name: QA_Pairs_UK
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTMxMDA3NDgxMTUxNjAzMzQzMzY/documents/MTc3Mzg3MzQ0NTE3MjQxMjQxNg


100%|██████████| 243/243 [00:00<00:00, 1413.87it/s]


Training Agent 1
Training Agent 2
Making predictions


100%|██████████| 3997/3997 [1:37:54<00:00,  1.47s/it]  


Deleting Agent
Creating agent
Created Document:
 - Display Name: QA_Pairs_US
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTI4Mzk1MzkzMzY3NzI4NDU1Njg/documents/MTU3OTgwODI2ODQ4MDQxMzY5NjA
Created Document:
 - Display Name: QA_Pairs_UK
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTI4Mzk1MzkzMzY3NzI4NDU1Njg/documents/MTQ3NDQyNDAzNzE5OTk0NDA4OTY


100%|██████████| 243/243 [00:00<00:00, 1555.69it/s]


Training Agent 1
Training Agent 2
Making predictions


100%|██████████| 3997/3997 [1:38:42<00:00,  1.48s/it]  


Deleting Agent
Creating agent
Created Document:
 - Display Name: QA_Pairs_US
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTgwOTk3NDM3MDE1NDE1ODQ4OTY/documents/MTU1MDk4NTIzMDg2NTI0MjUyMTY
Created Document:
 - Display Name: QA_Pairs_UK
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/MTgwOTk3NDM3MDE1NDE1ODQ4OTY/documents/MTM0NTE3MDcyNzg5NDQxMDg1NDQ


100%|██████████| 243/243 [00:00<00:00, 1556.06it/s]


Training Agent 1
Training Agent 2
Making predictions


100%|██████████| 3996/3996 [1:38:00<00:00,  1.47s/it]  


Deleting Agent
Creating agent
Created Document:
 - Display Name: QA_Pairs_US
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/NDQ5NDM2OTIyNzI1NTMxNjQ4MA/documents/MTA1NzgwMTEwNDQyMDUwMzU1Mg
Created Document:
 - Display Name: QA_Pairs_UK
 - Knowledge ID: projects/burgerbot-bskc-test-agent/knowledgeBases/NDQ5NDM2OTIyNzI1NTMxNjQ4MA/documents/MTgwNTg4ODk2OTc3NDQxMjU5NTI


100%|██████████| 243/243 [00:00<00:00, 1413.83it/s]


Training Agent 1
Training Agent 2
Making predictions


100%|██████████| 3996/3996 [1:38:18<00:00,  1.48s/it]  


Deleting Agent


## 5. Save the cross validation results

In [16]:
# to view the results of a single k-fold test, uncomment the line below
#results_df[3]

In [17]:
# Persist the list of per-fold result DataFrames as one pickle file so it can be
# loaded later for analysis. The target path is the plain concatenation of the
# configured directory prefix and file name.
pd.to_pickle(results_df, save_file_path + save_file_name)

for message in ("The file is saved", "Processing the k-fold cross-validation is complete"):
    logging.info(message)

In [None]:
# NOTE(review): unexecuted scratch cell; the expression uses no notebook state
# and looks like a leftover sanity check -- consider removing.
1+1