# NLU pipeline evaluation
This notebook is specifically designed to evaluate the NLU pipeline of the HMD project. It combines the data generation and the NLU part.

## Setup NLU

In [None]:
%env HF_HOME=/kaggle/working #for Kaggle

In [None]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Get the HF_TOKEN
hf_token = os.getenv('HF_TOKEN')
# Only report whether the token is present: cell output is stored in the
# notebook file, so printing the secret itself would leak it.
print(f"HuggingFace token loaded: {hf_token is not None}")

In [2]:
# Hugging Face Hub identifiers, keyed by the short model name used in this notebook.
MODELS = dict(
    llama3="meta-llama/Meta-Llama-3-8B-Instruct",
)

# Prompt layouts keyed by the same short names. Each layout has two positional
# "{}" placeholders: the system prompt first, the user message second.
# NOTE(review): the layout ends right after the assistant header and starts
# with an explicit begin-of-text token; confirm this matches what the
# tokenizer / model expect.
TEMPLATES = dict(
    llama3=(
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n{}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>"
    ),
)

In [3]:
# Flip to True for the very first run so that the model files get downloaded.
FIRST_TIME = False

if FIRST_TIME:
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def download_models(models):
        """Instantiate every model and tokenizer in ``models`` once.

        Args:
            models (dict): Mapping whose values are model identifiers passed
                to ``from_pretrained``.
        """
        for hub_id in models.values():
            # Instantiating the model triggers its download; the returned
            # objects are not kept.
            AutoModelForCausalLM.from_pretrained(
                hub_id, device_map="auto", torch_dtype=torch.float16
            )
            AutoTokenizer.from_pretrained(hub_id)

    download_models(MODELS)

In [4]:
import torch
import json

from typing import Tuple
from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding, PreTrainedTokenizer, PreTrainedModel

def load_model(model_name: str, dtype) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        dtype: Precision selector. Either a ``torch.dtype`` or one of the
            strings "bf16"/"bfloat16", "fp16"/"float16", "fp32"/"float32"
            (case-insensitive). ``None`` selects float32. Any other value
            still falls back to float32, but a warning is emitted so the
            fallback is no longer silent.

    Returns:
        The ``(model, tokenizer)`` pair.
    """
    import warnings

    dtype_aliases = {
        "bf16": torch.bfloat16,
        "bfloat16": torch.bfloat16,
        "fp16": torch.float16,
        "float16": torch.float16,
        "fp32": torch.float32,
        "float32": torch.float32,
    }

    if dtype is None:
        torch_dtype = torch.float32
    elif isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        torch_dtype = dtype_aliases.get(str(dtype).lower())
        if torch_dtype is None:
            warnings.warn(
                f"Unknown dtype {dtype!r}; falling back to torch.float32."
            )
            torch_dtype = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        torch_dtype=torch_dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

def generate(
    model: PreTrainedModel,
    inputs: BatchEncoding,
    tokenizer: PreTrainedTokenizer,
    max_seq_length: int,
) -> str:
    """Run ``model.generate`` on ``inputs`` and return only the newly produced text.

    Args:
        model: Object whose ``generate`` method is called.
        inputs: Tokenized prompt; its ``input_ids`` and ``attention_mask``
            are forwarded to ``model.generate``.
        tokenizer: Used to decode the result; its ``eos_token_id`` is passed
            as ``pad_token_id``.
        max_seq_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded first output sequence with the prompt tokens cut off and
        special tokens skipped.
    """
    prompt_ids = inputs.input_ids
    generated = model.generate(
        prompt_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_seq_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    # The output starts with the prompt itself; keep only what follows it.
    prompt_length = len(prompt_ids[0])
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)

In [None]:
# Precision selector understood by load_model and the length cap handed to
# generate (1024 was tried before).
dtype, max_seq_length = "bf16", 2048

# The short key picks the chat layout and is then replaced by the full
# model identifier.
model_name = "llama3"
chat_template, model_name = TEMPLATES[model_name], MODELS[model_name]

model, tokenizer = load_model(model_name, dtype)

## NLU prompt

In [6]:
# System prompt for the NLU model. Keep the bracketed intent list on the
# "Identify the user intent" line in sync with the per-intent descriptions and
# slot sections below: every intent the prompt describes must be listed there.
NLU_PROMPT = """
You are a Natural Language Understanding component of a human dialogue system.
Identify the user intent from this list: [chicken_ordering, drink_ordering, dessert_ordering, appetizer_ordering, table_reservation, request_information, out_of_domain].
- chicken_ordering, if the user wants to order chicken.
- drink_ordering, if the user wants to order a drink.
- dessert_ordering, if the user wants to order a dessert.
- appetizer_ordering, if the user wants to order an appetizer.
- table_reservation, if the user wants to reserve a table.
- request_information, if the user wants to know about something.
- out_of_domain, if nothing of the above fits.

If the intent is chicken_ordering, extract the following slot values from the user input:
- chicken_type, the type of chicken. Choose from: ["grilled", "roasted"].
- chicken_size, the size of the chicken. Choose from: ["small", "medium", "large"].
- chicken_bones, if the user wants the chicken with bones in it. Choose from: ["yes", "no"].
- sauce_type, the type of sauce that is served with the chicken. Choose from: ["mushroom", "pesto", "none"].
- side_dish, the side dish that is served with the chicken. Choose from: ["carrots", "smashed potatoes", "none"]. 
If no values are present in the user input you have to put "null" as the value. If the value is not among the candidates, then put "null". Output them in a JSON format. Only output the JSON file.
The JSON format is:
{
    "intent": "intent_value",
    "slots": {
        "slot1": "value1",
        "slot2": "value2",
        "slot3": "value3",
        "slot4": "value4",
        "slot5": "value5"
    }
}


If the intent is drink_ordering, extract the following slot values from the user input:
- drink_type: the type of drink. Choose from: ["Coca Cola", "Fanta", "Sprite", "Water"].
- drink_size, the size of the drink. Choose from: ["small", "medium", "large"].
- ice: if the user wants ice in the drink. Choose from: ["yes", "no"].
If no values are present in the user input you have to put "null" as the value. If the value is not among the candidates, then put "null". Output them in a JSON format. Only output the JSON file.


If the intent is dessert_ordering, extract the following slot values from the user input:
- dessert_type, the type of dessert. Choose from: ["tiramisu", "ice cream", "apple crumble pie", "waffles", "smoothie"].
- extra_whipped_cream, if the user wants extra whipped cream. Choose from: ["yes", "no"].
If no values are present in the user input you have to put "null" as the value. If the value is not among the candidates, then put "null". Output them in a JSON format. Only output the JSON file.


If the intent is appetizer_ordering, extract the following slot values from the user input:
- appetizer_type, the type of appetizer. Choose from: ["crisps", "simple salad", "deluxe salad", "tomato soup", "onion soup"].
- appetizer_moment, the time before the main course that the appetizer should arrive. Choose an integer based on the user input.
If no values are present in the user input you have to put "null" as the value. If the value is not among the candidates, then put "null". Output them in a JSON format. Only output the JSON file.


If the intent is table_reservation, extract the following slot values from the user input:
- table_type, the type of table the user wants to reserve. Choose from: ["normal", "business", "romantic"].
- table_size, the number of people that should fit at the table. Choose an integer based on the user input.
- sitting_equipment, if the user wants chairs or benches. Choose from: ["chair(s)", "bench(es)", "mixed chairs and benches", "does not matter"].
- birthday_surprise, if the user is celebrating someone's birthday at the dinner. Choose from ["yes", "no"].
If no values are present in the user input you have to put "null" as the value. If the value is not among the candidates, then put "null". Output them in a JSON format. Only output the JSON file.


If the intent is request_information(entity):
Output it in a JSON format.
Only output the JSON file.
The JSON format is:
{
    "intent": "request_information(entity)"
}


If the intent is out_of_domain:
Output it in JSON format.
Only output the JSON file.
The JSON format is:
{
    "intent": "out_of_domain"
}
"""

## Evaluation methods
Use F1 and Accuracy for the slot values and intents respectively.

In [8]:
from sklearn.metrics import precision_score, recall_score, f1_score

def _slots_of(entry):
    """Return the ``"slots"`` mapping of ``entry``, or ``{}`` if it is missing or not a dict."""
    slots = entry.get("slots") if isinstance(entry, dict) else None
    return slots if isinstance(slots, dict) else {}


def evaluate_slot_predictions(ground_truth, predicted):
    """
    Evaluates slot prediction performance as a single F1 score.

    Every (slot name, value) pair is one item. A ground truth pair that also
    appears in the prediction is a true positive, a ground truth pair that
    does not is a false negative, and a predicted pair that is not in the
    ground truth is a false positive. Counts are accumulated over all
    examples before the score is computed. Slots whose value is None take
    part in the comparison like any other value.

    Pairs are compared with ``==`` per slot name, so unhashable values
    (e.g. a list produced by the model) and a missing / non-dict "slots"
    entry are scored as mismatches instead of raising.

    Args:
        ground_truth (list of dict): List of ground truth dictionaries with slots.
        predicted (list of dict): List of predicted dictionaries with slots.

    Returns:
        float: F1 score for slot evaluation; 0.0 when there is nothing to score.
    """
    true_pos = false_neg = false_pos = 0

    for gt, pred in zip(ground_truth, predicted):
        # Only the slots are scored here; the intent is ignored.
        gt_slots = _slots_of(gt)
        pred_slots = _slots_of(pred)

        for name, value in gt_slots.items():
            if name in pred_slots and pred_slots[name] == value:
                true_pos += 1
            else:
                false_neg += 1

        for name, value in pred_slots.items():
            if not (name in gt_slots and gt_slots[name] == value):
                false_pos += 1

    denominator = 2 * true_pos + false_pos + false_neg
    if denominator == 0:
        return 0.0
    return 2 * true_pos / denominator

In [9]:
def evaluate_intent_accuracy(ground_truth, predicted):
    """
    Counts the predictions whose intent equals the ground truth intent.

    The result is a count, not a ratio: callers divide it by the number of
    evaluated sentences themselves. A prediction without an "intent" key is
    counted as wrong instead of raising.

    Args:
        ground_truth (list of dict): List of ground truth dictionaries with intent.
        predicted (list of dict): List of predicted dictionaries with intent.

    Returns:
        int: Number of correctly predicted intents.
    """
    correct = 0

    for gt, pred in zip(ground_truth, predicted):
        if isinstance(pred, dict) and "intent" in pred and gt["intent"] == pred["intent"]:
            correct += 1

    return correct

In [10]:
import re

def evaluate_intent_accuracy_phrase(ground_truth, predicted, phrase):
    """
    Counts intent hits for requesting information and out of domain.

    A prediction is a hit when ``phrase`` occurs anywhere in its string
    representation. For "request_information" the entity written between
    parentheses in the ground truth intent is checked as well.

    Args:
        ground_truth (list of dict): List of ground truth dictionaries with intent.
        predicted (list of dict): List of predicted dictionaries with intent.
        phrase (str): phrase to check for, must be "request_information" or "out_of_domain"

    Returns:
        int: Number of predictions containing ``phrase`` (any phrase other
            than "request_information").
        tuple of (int, int): For "request_information", the number of
            predictions containing the phrase and the number of predictions
            containing the ground truth entity. Ground truth intents without
            a non-empty "(entity)" part are skipped for the second count.
    """
    assert len(ground_truth) == len(predicted)

    correct = sum(1 for pred in predicted if phrase in str(pred))
    if phrase != "request_information":
        # Only how often the phrase itself was produced matters here.
        return correct

    # Also check what information is being requested.
    correct_info = 0
    for gt, pred in zip(ground_truth, predicted):
        match_gt = re.search(r'request_information\((.*?)\)', gt['intent'])
        if match_gt is None or not match_gt.group(1):
            # No entity to look for; previously a missing match raised AttributeError.
            continue
        if match_gt.group(1) in str(pred):
            correct_info += 1
    return correct, correct_info

In [11]:
def output_analysis(gt, pred, sent):
    """Print every sentence with its ground truth and prediction, separated by blank lines.

    Args:
        gt: Sequence of ground truth entries; its length drives the loop.
        pred: Sequence of predictions, indexed in parallel with ``gt``.
        sent: Sequence of input sentences, indexed in parallel with ``gt``.
    """
    for idx, truth in enumerate(gt):
        print(sent[idx])
        print("GT:", truth)
        print("Prediction", pred[idx])
        print()

## Dialogue Manager
The dialogue state tracker (part of the dialogue manager) is needed to turn the raw NLU output into a Python dictionary.

In [12]:
class DialogueStateTracker:
    """Holds the latest NLU and DM state and parses raw NLU output into a dictionary."""

    def __init__(self):
        self.previous_state = {"NLU": None, "DM": None}
        self.order_confirmed = False

    def update_nlu_state(self, nlu_out: str):
        """Parse ``nlu_out`` and merge it into the stored NLU state.

        Args:
            nlu_out: Raw text produced by the NLU model; it must contain a
                dictionary between its first "{" and its last "}".

        Returns:
            dict: The full state (keys "NLU" and "DM") after the update.

        Raises:
            ValueError: If no dictionary can be located or the located text
                is not a dictionary. Parsing errors from ``ast.literal_eval``
                (e.g. ``SyntaxError``) propagate as well.
        """
        nlu_out_dict = self._parse_dictionary(self.find_dictionary(nlu_out))
        nlu_out_dict = self.change_null(nlu_out_dict)
        if self.previous_state["NLU"] is None:
            self.previous_state["NLU"] = nlu_out_dict
        else:
            self.previous_state["NLU"] = self.deepmerge_dicts(self.previous_state["NLU"], nlu_out_dict)
        return self.previous_state

    def _parse_dictionary(self, text: str) -> dict:
        """Turn the textual dictionary ``text`` into a dict without executing it."""
        import ast
        import json

        try:
            parsed = json.loads(text)
        except ValueError:
            # Not strict JSON (e.g. single-quoted keys or None): accept plain
            # Python literals. Unlike eval, this never runs code found in the
            # model output.
            parsed = ast.literal_eval(text)
        if not isinstance(parsed, dict):
            raise ValueError(f"NLU output is not a dictionary: {text!r}")
        return parsed

    def check_slot_values_null(self):
        """Tell whether the NLU state still has an unfilled (None) slot.

        Returns:
            bool: True when there is no usable "slots" dictionary in the NLU
                state (e.g. for request_information) or when at least one
                slot value is None; False otherwise.
        """
        nlu_state = self.get_state()["NLU"]
        slots = nlu_state.get("slots") if isinstance(nlu_state, dict) else None
        if not isinstance(slots, dict):
            return True  # e.g. for request_information
        return None in slots.values()

    def update_dm_state(self, dm_out: str):
        """Store ``dm_out`` as the DM state and return the full state."""
        self.previous_state["DM"] = dm_out
        return self.previous_state

    def get_state(self):
        """Return the state dictionary with the keys "NLU" and "DM"."""
        return self.previous_state

    def order_confirmation(self):
        """Mark the order as confirmed."""
        self.order_confirmed = True

    def get_confirmation_status(self):
        """Return whether ``order_confirmation`` has been called."""
        return self.order_confirmed

    def change_null(self, dictionary: dict) -> dict:
        """
        Recursively replaces all occurrences of the string "null" with Python's None in the given dictionary.

        The dictionary is modified in place; values inside lists are left untouched.

        Args:
            dictionary (dict): The input dictionary.

        Returns:
            dict: The same dictionary with "null" replaced by None.
        """
        for key, value in dictionary.items():
            if value == "null":
                dictionary[key] = None
            elif isinstance(value, dict):  # If the value is a dictionary, recurse.
                dictionary[key] = self.change_null(value)
        return dictionary

    def deepmerge_dicts(self, d1: dict, d2: dict) -> dict:
        """Merge ``d2`` into ``d1`` recursively and return a new dictionary.

        For keys present in both, nested dictionaries are merged; otherwise
        the value from ``d2`` wins (so the user can change an option) unless
        it is None, in which case the value from ``d1`` is kept. Keys keep
        the order of ``d1`` followed by the keys that only ``d2`` has.
        """
        merged = {}
        ordered_keys = list(d1) + [key for key in d2 if key not in d1]
        for key in ordered_keys:
            if key in d1 and key in d2:
                if isinstance(d1[key], dict) and isinstance(d2[key], dict):
                    merged[key] = self.deepmerge_dicts(d1[key], d2[key])
                elif d2[key] is None:
                    merged[key] = d1[key]
                else:
                    # Take latest value, in case user wants to change option.
                    merged[key] = d2[key]
            elif key in d1:
                merged[key] = d1[key]
            else:
                merged[key] = d2[key]
        return merged

    def find_dictionary(self, text: str) -> str:
        """Return the substring from the first "{" to the last "}" of ``text``.

        Raises:
            ValueError: If ``text`` has no "{" followed by a "}".
        """
        opening_index = text.find("{")
        closing_index = text.rfind("}")
        if opening_index == -1 or closing_index < opening_index:
            raise ValueError(f"No dictionary found in NLU output: {text!r}")
        return text[opening_index:closing_index + 1]

## All predictions

These lists collect the predictions and ground truths of every template; they are needed to calculate the overall F1 and accuracy.

In [13]:
# Notebook-wide accumulators that every template cell appends to.
predictions, ground_truths = [], []
global_exception_counter = 0

In [14]:
from itertools import product
import copy

## Intent 1 - Chicken ordering

### Template 1 - TYPE, SIZE, SAUCE

In [None]:
template = "I would like to order a {} {} chicken with{} sauce."

# Ground-truth skeleton for this template; slots the sentence never mentions stay None.
ds = {
    "intent": "chicken_ordering",
    "slots": {
        "chicken_type": "",
        "chicken_size": "",
        "chicken_bones": None,
        "sauce_type": "",
        "side_dish": None
    },
}
chicken_type = ["grilled", "roasted"]
chicken_size = ["small", "medium", "large"]
# Each fragment is glued directly onto "with" in the template:
# "with mushroom", "with pesto", "with no", "without".
sauce_type = [" mushroom", " pesto", " no", "out"]

# NOTE: `np` holds the NLU predictions of this template (it is not numpy).
np = []
ngt = []
sentences = []
exception_counter = 0

print("Total product", len(chicken_type)*len(chicken_size)*len(sauce_type))

i = 0
for typ, size, sauce in product(chicken_type, chicken_size, sauce_type):
    i = i + 1
    filled_template = template.format(size, typ, sauce)
    ds["slots"]["chicken_type"] = typ
    ds["slots"]["chicken_size"] = size
    # The fragments carry a leading space for sentence building; strip it so the
    # expected value is exactly "mushroom"/"pesto" as listed in the prompt.
    ds["slots"]["sauce_type"] = "none" if sauce in [" no", "out"] else sauce.strip()

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    # Fresh tracker per sentence so earlier predictions are not merged in.
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Unparsable model output: count it (it lowers the accuracy below) and show it.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Sentences whose output could not be parsed count as wrong intents.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct/(len(ngt)+exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
# Observed: problems with None as sauce type in the NLU output.
output_analysis(ngt, np, sentences)

### Template 2 - SIZE, BONES, SIDE

In [None]:
template = "I would like to order a {} chicken {} bones and {} side dish."

# Ground-truth skeleton for this template; slots the sentence never mentions stay None.
ds = {
    "intent": "chicken_ordering",
    "slots": {
        "chicken_type": None,
        "chicken_size": "",
        "chicken_bones": "",
        "sauce_type": None,
        "side_dish": ""
    },
}
chicken_size = ["small", "medium", "large"]
chicken_bones = ["with", "without", "with no"]
side_dish = ["carrots", "smashed potatoes", "no"]

# NOTE: `np` holds the NLU predictions of this template (it is not numpy).
np = []
ngt = []
sentences = []
exception_counter = 0

print("Total product", len(chicken_bones)*len(chicken_size)*len(side_dish))

i = 0
for size, bone, side in product(chicken_size, chicken_bones, side_dish):
    i = i + 1
    filled_template = template.format(size, bone, side)
    ds["slots"]["chicken_size"] = size
    ds["slots"]["chicken_bones"] = "yes" if bone in ["with"] else "no"
    # "no side dish" corresponds to the candidate value "none" in the prompt.
    ds["slots"]["side_dish"] = "none" if side == "no" else side

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    # Fresh tracker per sentence so earlier predictions are not merged in.
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Unparsable model output: count it (it lowers the accuracy below) and show it.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Sentences whose output could not be parsed count as wrong intents.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct/(len(ngt)+exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
# Observed: problems with "no" in the side dish.
output_analysis(ngt, np, sentences)

### Template 3 - TYPE, SIZE, BONES

In [None]:
template = "I'd like a {} chicken, size {}, and {} bones."

# Ground-truth skeleton for this template; slots the sentence never mentions stay None.
ds = {
    "intent": "chicken_ordering",
    "slots": {
        "chicken_type": "",
        "chicken_size": "",
        "chicken_bones": "",
        "sauce_type": None,
        "side_dish": None
    },
}

chicken_type = ["grilled", "roasted"]
chicken_size = ["small", "medium", "large"]
chicken_bones = ["with", "without", "with no"]

# NOTE: `np` holds the NLU predictions of this template (it is not numpy).
np = []
ngt = []
sentences = []
exception_counter = 0

print("Total product", len(chicken_type)*len(chicken_size)*len(chicken_bones))

i = 0
for typ, size, bone in product(chicken_type, chicken_size, chicken_bones):
    i = i + 1
    # Placeholder order in the template is type, size, bones; all three come
    # from this loop.
    filled_template = template.format(typ, size, bone)
    ds["slots"]["chicken_size"] = size
    ds["slots"]["chicken_bones"] = "yes" if bone in ["with"] else "no"
    ds["slots"]["chicken_type"] = typ

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    # Fresh tracker per sentence so earlier predictions are not merged in.
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Unparsable model output: count it (it lowers the accuracy below) and show it.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Sentences whose output could not be parsed count as wrong intents.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct/(len(ngt)+exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
output_analysis(ngt, np, sentences)

### Template 4 - TYPE, BONES, SAUCE

In [None]:
template = "Please prepare a {} chicken {} bones and with{} sauce."

# Ground-truth skeleton for this template; slots the sentence never mentions stay None.
ds = {
    "intent": "chicken_ordering",
    "slots": {
        "chicken_type": "",
        "chicken_size": None,
        "chicken_bones": "",
        "sauce_type": "",
        "side_dish": None
    },
}
chicken_type = ["grilled", "roasted"]
chicken_bones = ["with", "without", "with no"]
# Each fragment is glued directly onto "with" in the template:
# "with mushroom", "with pesto", "with no", "without".
sauce_type = [" mushroom", " pesto", " no", "out"]

# NOTE: `np` holds the NLU predictions of this template (it is not numpy).
np = []
ngt = []
sentences = []
exception_counter = 0

print("Total product", len(chicken_type)*len(chicken_bones)*len(sauce_type))

i = 0
for typ, bone, sauce in product(chicken_type, chicken_bones, sauce_type):
    i = i + 1
    filled_template = template.format(typ, bone, sauce)
    # The fragments carry a leading space for sentence building; strip it so the
    # expected value is exactly "mushroom"/"pesto" as listed in the prompt.
    ds["slots"]["sauce_type"] = "none" if sauce in [" no", "out"] else sauce.strip()
    ds["slots"]["chicken_bones"] = "yes" if bone in ["with"] else "no"
    ds["slots"]["chicken_type"] = typ

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    # Fresh tracker per sentence so earlier predictions are not merged in.
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Unparsable model output: count it (it lowers the accuracy below) and show it.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Sentences whose output could not be parsed count as wrong intents.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct/(len(ngt)+exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
output_analysis(ngt, np, sentences)

## Intent 2 - drink ordering

In [23]:
# Candidate values shared by all drink templates.
drink_type = [
    "Coca Cola",
    "Fanta",
    "Sprite",
    "Water",
]
drink_size = [
    "small",
    "medium",
    "large",
]
# Fragments inserted right after "with" in the templates:
# "with ice", "with no ice", "without ice".
drink_ice = [
    "",
    " no",
    "out",
]

### Template 5 - TYPE, SIZE

In [None]:
template = "I would like a {} {}."

# Expected NLU result for this template; the sentence never mentions ice.
ds = {
    "intent": "drink_ordering",
    "slots": {"drink_type": "", "drink_size": "", "ice": None},
}

print("Total product", len(drink_type) * len(drink_size))

# Per-template records: predictions, ground truths and the generated sentences.
np, ngt, sentences = [], [], []
exception_counter = 0

i = 0
for i, (typ, size) in enumerate(product(drink_type, drink_size), start=1):
    filled_template = template.format(size, typ)
    ds["slots"].update(drink_type=typ, drink_size=size)

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Record the pair both for this template and for the overall totals.
        for preds, truths in ((np, ngt), (predictions, ground_truths)):
            preds.append(nlu_out)
            truths.append(copy.deepcopy(ds))
        sentences.append(filled_template)
    except Exception as e:
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed iterations are part of the denominator, so they count as wrong.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
# Observed: no problems.
output_analysis(ngt, np, sentences)

### Template 6 - TYPE, ICE

In [None]:
template = "I would like a {} with{} ice."

# Expected NLU result for this template; the sentence never mentions a size.
ds = {
    "intent": "drink_ordering",
    "slots": {"drink_type": "", "drink_size": None, "ice": ""},
}

print("Total product", len(drink_type) * len(drink_ice))

# Per-template records: predictions, ground truths and the generated sentences.
np, ngt, sentences = [], [], []
exception_counter = 0

i = 0
for i, (typ, ice) in enumerate(product(drink_type, drink_ice), start=1):
    filled_template = template.format(typ, ice)
    # Only the empty fragment ("with ice") means the user wants ice.
    ds["slots"].update(drink_type=typ, ice="no" if ice else "yes")

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Record the pair both for this template and for the overall totals.
        for preds, truths in ((np, ngt), (predictions, ground_truths)):
            preds.append(nlu_out)
            truths.append(copy.deepcopy(ds))
        sentences.append(filled_template)
    except Exception as e:
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed iterations are part of the denominator, so they count as wrong.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
# Observed: no problems.
output_analysis(ngt, np, sentences)

### Template 7 - SIZE, ICE

In [None]:
template = "I need a {} drink with{} ice."

# Expected NLU result for this template; the sentence never names a drink type.
ds = {
    "intent": "drink_ordering",
    "slots": {"drink_type": None, "drink_size": "", "ice": ""},
}

print("Total product", len(drink_size) * len(drink_ice))

# Per-template records: predictions, ground truths and the generated sentences.
np, ngt, sentences = [], [], []
exception_counter = 0

i = 0
for i, (size, ice) in enumerate(product(drink_size, drink_ice), start=1):
    filled_template = template.format(size, ice)
    # Only the empty fragment ("with ice") means the user wants ice.
    ds["slots"].update(drink_size=size, ice="no" if ice else "yes")

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Record the pair both for this template and for the overall totals.
        for preds, truths in ((np, ngt), (predictions, ground_truths)):
            preds.append(nlu_out)
            truths.append(copy.deepcopy(ds))
        sentences.append(filled_template)
    except Exception as e:
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed iterations are part of the denominator, so they count as wrong.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
output_analysis(ngt, np, sentences)

### Template 8 - TYPE, SIZE, ICE

In [None]:
template = "Please give me a {} {} with{} ice."

# Ground-truth skeleton for this template; every drink slot is mentioned.
ds = {
    "intent": "drink_ordering",
    "slots": {
        "drink_type": "",
        "drink_size": "",
        "ice": ""
    },
}


print("Total product", len(drink_type)*len(drink_size)*len(drink_ice))

i = 0
# NOTE: `np` holds the NLU predictions of this template (it is not numpy).
np = []
ngt = []
sentences = []
exception_counter = 0

for typ, size, ice in product(drink_type, drink_size, drink_ice):
    i = i + 1
    # Size goes before the drink name ("a small Fanta"), as in the
    # "I would like a {} {}." template.
    filled_template = template.format(size, typ, ice)
    ds["slots"]["drink_type"] = typ
    ds["slots"]["drink_size"] = size
    ds["slots"]["ice"] = "yes" if ice == "" else "no"

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    # Fresh tracker per sentence so earlier predictions are not merged in.
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Unparsable model output: count it (it lowers the accuracy below) and show it.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)
f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Sentences whose output could not be parsed count as wrong intents.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct/(len(ngt)+exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
output_analysis(ngt, np, sentences)

## Intent 3 - dessert ordering

In [32]:
# Candidate values shared by all dessert templates.
dessert_type = [
    "tiramisu",
    "ice cream",
    "apple crumble pie",
    "waffles",
    "smoothie",
]
# Fragments inserted right after "with" in the templates:
# "with whipped cream", "without whipped cream", "with no whipped cream".
extra_whipped_cream = [
    "",
    "out",
    " no",
]

### Template 9 - TYPE, CREAM

In [None]:
template = "I would like a {} with{} whipped cream."

# Expected NLU result for this template; both dessert slots are mentioned.
ds = {
    "intent": "dessert_ordering",
    "slots": {"dessert_type": "", "extra_whipped_cream": ""},
}

print("Total product", len(dessert_type) * len(extra_whipped_cream))

# Per-template records: predictions, ground truths and the generated sentences.
np, ngt, sentences = [], [], []
exception_counter = 0

i = 0
# `ice` is the whipped-cream fragment here; the loop variable name is reused
# from the drink templates.
for i, (typ, ice) in enumerate(product(dessert_type, extra_whipped_cream), start=1):
    filled_template = template.format(typ, ice)
    # Only the empty fragment ("with whipped cream") means the user wants it.
    ds["slots"].update(dessert_type=typ, extra_whipped_cream="no" if ice else "yes")

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Record the pair both for this template and for the overall totals.
        for preds, truths in ((np, ngt), (predictions, ground_truths)):
            preds.append(nlu_out)
            truths.append(copy.deepcopy(ds))
        sentences.append(filled_template)
    except Exception as e:
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed iterations are part of the denominator, so they count as wrong.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
# Observed: no problems.
output_analysis(ngt, np, sentences)

### Template 10 - TYPE

In [None]:
template = "For dessert, I would like a {}."

# Expected NLU result for this template; whipped cream is never mentioned.
ds = {
    "intent": "dessert_ordering",
    "slots": {"dessert_type": "", "extra_whipped_cream": None},
}

print("Total product", len(dessert_type))

# Per-template records: predictions, ground truths and the generated sentences.
np, ngt, sentences = [], [], []
exception_counter = 0

i = 0
for i, typ in enumerate(dessert_type, start=1):
    filled_template = template.format(typ)
    ds["slots"].update(dessert_type=typ)

    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)
    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Record the pair both for this template and for the overall totals.
        for preds, truths in ((np, ngt), (predictions, ground_truths)):
            preds.append(nlu_out)
            truths.append(copy.deepcopy(ds))
        sentences.append(filled_template)
    except Exception as e:
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()
print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed iterations are part of the denominator, so they count as wrong.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Print each sentence of the template above with its ground truth and prediction.
output_analysis(ngt, np, sentences)

### Template 11 - TYPE, CREAM

In [None]:
# Template 11: dessert order naming the dessert type and whether whipped cream is wanted.
template = "Please bring me a {} with{} whipped cream."

# Expected dialogue state; both slots are overwritten for every sample below.
ds = {
    "intent": "dessert_ordering",
    "slots": {"dessert_type": "", "extra_whipped_cream": ""},
}

print("Total product", len(dessert_type)*len(extra_whipped_cream))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, ice) in enumerate(product(dessert_type, extra_whipped_cream), start=1):
    filled_template = template.format(typ, ice)
    ds["slots"]["dessert_type"] = typ
    # An empty filler leaves "with whipped cream" in the sentence, i.e. "yes".
    ds["slots"]["extra_whipped_cream"] = "no" if ice != "" else "yes"

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

### Template 12 - TYPE, CREAM

In [None]:
# Template 12: alternative phrasing for dessert type plus whipped-cream preference.
template = "To finish my dinner, I want to have {} with{} whipped cream."

# Expected dialogue state; both slots are overwritten for every sample below.
ds = {
    "intent": "dessert_ordering",
    "slots": {"dessert_type": "", "extra_whipped_cream": ""},
}

print("Total product", len(dessert_type)*len(extra_whipped_cream))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, ice) in enumerate(product(dessert_type, extra_whipped_cream), start=1):
    filled_template = template.format(typ, ice)
    ds["slots"]["dessert_type"] = typ
    # An empty filler leaves "with whipped cream" in the sentence, i.e. "yes".
    ds["slots"]["extra_whipped_cream"] = "no" if ice != "" else "yes"

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

## Intent 4 - table reservation

In [40]:
def word_to_num(word):
    """Convert an English number word to its integer value.

    Matching is case-insensitive and ignores surrounding whitespace. Every
    number word from "zero" through "twenty" is supported, so words such as
    "eleven" no longer fall through to None.

    Args:
        word: Number word, e.g. "two" or "Twenty".

    Returns:
        The matching integer, or None when the word is not recognised.
    """
    number_words = (
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    )
    # The position of each word in the tuple is its numeric value.
    words_to_numbers = {name: value for value, name in enumerate(number_words)}
    return words_to_numbers.get(word.strip().lower())

### Template 13 - SIZE

In [None]:
# Template 13: reservation that mentions only the party size, written as a
# digit string, a number word, or a number word followed by "persons".
template = "I would like to reserve a table for {}."

# Expected dialogue state; "table_size" is overwritten for every sample below.
ds = {
    "intent": "table_reservation",
    "slots": {
        "table_type": None,
        "table_size": "",
        "sitting_equipment": None,
        "birthday_surprise": None
    }
}
table_size = ["two", "three", "three persons", "one", "six", "4", "3", "5", "10", "twenty persons", "five persons"]

print("Total product", len(table_size))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
# Reset the failure count for this template. A bare `exception_counter`
# expression would keep the previous cell's count (or raise NameError on a
# fresh kernel) and skew the accuracy denominator below.
exception_counter = 0
i = 0

for i, size in enumerate(table_size, start=1):
    filled_template = template.format(size)

    # The ground truth is always the size written as a digit string.
    if size.isdigit():
        expected_size = int(size)
    else:
        # Drop the literal word "persons" before the lookup. str.strip("persons")
        # would remove any of those *characters* from both ends, which breaks
        # for words such as "seven" or "one".
        expected_size = word_to_num(size.replace("persons", "").strip())
    ds["slots"]["table_size"] = str(expected_size)

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
output_analysis(ngt, np, sentences)  # numbering issue seen here; should be fixed on rerun

### Template 14 - TYPE, SIZE, EQUIPMENT

In [None]:
# Template 14: reservation naming the table type, party size and seating.
template = "I would like to reserve a {} table for {} {}."

# Expected dialogue state; the three filled slots are overwritten per sample.
ds = {
    "intent": "table_reservation",
    "slots": {
        "table_type": "",
        "table_size": "",
        "sitting_equipment": "",
        "birthday_surprise": None,
    },
}

table_type = ["normal", "business", "romantic"]
table_size = ["4", "5", "10"]
sitting_equipment = ["with chairs", "with luxury benches", "the sitting does not matter"]

print("Total product", len(table_type)*len(table_size)*len(sitting_equipment))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, size, sit) in enumerate(product(table_type, table_size, sitting_equipment), start=1):
    filled_template = template.format(typ, size, sit)
    ds["slots"]["table_type"] = typ
    ds["slots"]["table_size"] = size
    # Map the seating phrase used in the sentence to the expected slot value;
    # any other phrase means the seating does not matter.
    ds["slots"]["sitting_equipment"] = {
        "with chairs": "chair(s)",
        "with luxury benches": "bench(es)",
    }.get(sit, "does not matter")

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

### Template 15 - EQUIPMENT, SIZE, BIRTHDAY

In [None]:
# Template 15: reservation naming seating, party size and an optional birthday sentence.
template = "I would like to reserve a table {} for {} persons.{}"

# Expected dialogue state; the three filled slots are overwritten per sample.
ds = {
    "intent": "table_reservation",
    "slots": {
        "table_type": None,
        "table_size": "",
        "sitting_equipment": "",
        "birthday_surprise": "",
    },
}

table_size = ["1", "22", "6"]
sitting_equipment = ["with chairs", "with luxury benches", "the sitting does not matter"]
birthday_surprise = [" We would like a birthday surprise.", " We are celebrating a birthday.", " Please prepare a birthday surprise", "", " We do not want a birthday surprise."]

print("Total product", len(table_size)*len(sitting_equipment)*len(birthday_surprise))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (size, sit, birthday) in enumerate(product(table_size, sitting_equipment, birthday_surprise), start=1):
    filled_template = template.format(sit, size, birthday)
    ds["slots"]["table_size"] = size

    # A missing or explicitly negated birthday sentence is expected as "no".
    if birthday in ("", " We do not want a birthday surprise."):
        ds["slots"]["birthday_surprise"] = "no"
    else:
        ds["slots"]["birthday_surprise"] = "yes"

    # Map the seating phrase used in the sentence to the expected slot value;
    # any other phrase means the seating does not matter.
    ds["slots"]["sitting_equipment"] = {
        "with chairs": "chair(s)",
        "with luxury benches": "bench(es)",
    }.get(sit, "does not matter")

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

### Template 16 - SIZE, TYPE

In [None]:
# Template 16: reservation naming the party size first, then the table type.
template = "Please reserve for {} persons a {} table"

# Expected dialogue state; "table_type" and "table_size" are overwritten per sample.
ds = {
    "intent": "table_reservation",
    "slots": {
        "table_type": "",
        "table_size": "",
        "sitting_equipment": None,
        "birthday_surprise": None
    }
}

table_type = ["normal", "business", "romantic"]
table_size = ["4", "5", "10", "8", "2"]

print("Total product", len(table_type)*len(table_size))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, size) in enumerate(product(table_type, table_size), start=1):
    # The template's placeholders are (size, type). Passing them as
    # (typ, size) produced sentences like "for normal persons a 4 table",
    # which no longer matched the ground truth set below.
    filled_template = template.format(size, typ)
    ds["slots"]["table_size"] = size
    ds["slots"]["table_type"] = typ

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

## Intent 5 - appetizer ordering

### Template 17 - TYPE, MOMENT

In [None]:
# Template 17: appetizer order naming the appetizer first, then the timing.
template = "I want {} {} minutes before my main course."

# Expected dialogue state; both slots are overwritten for every sample below.
ds = {
    "intent": "appetizer_ordering",
    "slots": {"appetizer_type": None, "appetizer_moment": None},
}

appetizer_type = ["crisps", "simple salad", "deluxe salad", "tomato soup", "onion soup"]
appetizer_moment = ["4", "5", "10", "8", "2"]

print("Total product", len(appetizer_type)*len(appetizer_moment))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, moment) in enumerate(product(appetizer_type, appetizer_moment), start=1):
    filled_template = template.format(typ, moment)
    ds["slots"]["appetizer_moment"] = moment
    ds["slots"]["appetizer_type"] = typ

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

### Template 18 - MOMENT, TYPE

In [None]:
# Template 18: appetizer order naming the timing first, then the appetizer.
template = "{} minutes before the main course, I would like {}."

# Expected dialogue state; both slots are overwritten for every sample below.
ds = {
    "intent": "appetizer_ordering",
    "slots": {"appetizer_type": None, "appetizer_moment": None},
}

appetizer_type = ["crisps", "simple salad", "deluxe salad", "tomato soup", "onion soup"]
appetizer_moment = ["4", "5", "10", "8", "2"]

print("Total product", len(appetizer_type)*len(appetizer_moment))

# Per-template accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, (typ, moment) in enumerate(product(appetizer_type, appetizer_moment), start=1):
    # Placeholders are ordered (moment, type) in this template.
    filled_template = template.format(moment, typ)
    ds["slots"]["appetizer_moment"] = moment
    ds["slots"]["appetizer_type"] = typ

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this template only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Results shared across templates (used for the overall scores).
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        global_exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

f1 = evaluate_slot_predictions(ngt, np)
print(f"F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ngt, np)
acc = correct / (len(ngt) + exception_counter)
print(f"Intent Accuracy: {acc:.2f}")

In [None]:
# Pass this template's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

## Overall scores before we do the request_information and out of domain

In [None]:
# Aggregate scores over everything accumulated in the notebook-wide lists so far.
print("Grand total", len(ground_truths))

# Attempted = samples that produced a prediction plus samples that raised.
attempted = len(ground_truths) + global_exception_counter
print("Attempted", attempted)

f1 = evaluate_slot_predictions(ground_truths, predictions)
print(f"OVERALL F1-score: {f1:.2f}")

# Failed samples stay in the denominator, so they count as intent misses.
correct = evaluate_intent_accuracy(ground_truths, predictions)
acc = correct / attempted
print(f"OVERALL intent Accuracy: {acc:.2f}")

## Intent 6 - request information

In [None]:
# Intent 6: questions asking what the restaurant offers for a given category.
template = "Which {} {}?"

# Expected dialogue state; only the intent is set for this cell.
ds = {}

# requests[k] is the thing asked about and look_up[k] the intent it belongs to.
requests = ["drinks", "beverages", "chickens", "tables", "side dishes", "sauces for the chicken", "drink sizes", "desserts", "sitting equipment"]
look_up = ["drink_ordering", "drink_ordering", "chicken_ordering", "table_reservation", "chicken_ordering", "chicken_ordering", "drink_ordering", "dessert_ordering", "table_reservation"]
technique = ["do you offer", "do you have available", "do you have", "are there", "are there available"]

print("Total product", len(requests)*len(technique))

# Per-cell accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

# Pair every request with its intent up front, then combine with each phrasing.
for i, ((req, domain), te) in enumerate(product(zip(requests, look_up), technique), start=1):
    filled_template = template.format(req, te)
    ds["intent"] = f"request_information({domain})"

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, filled_template)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this cell only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(filled_template)

        # Also recorded in the notebook-wide lists.
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Count the failure and log the offending sample, then keep going.
        exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", filled_template)
        print()

print("Total:", i)

correct, correct_info = evaluate_intent_accuracy_phrase(ngt, np, "request_information")

# Account for the samples that crashed: they stay in the denominator.
attempted = len(ngt) + exception_counter
acc1 = correct / attempted
acc2 = correct_info / attempted

print(f"Accuracy on requesting information: {acc1:.2f}")
print(f"Accuracy on requesting information intent: {acc2:.2f}")

In [None]:
# Pass this intent's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)

## Intent 7 - out of domain

In [None]:
# Intent 7: sentences the system is expected to classify as out of domain.
examples = [
    "I would like to order pizza.",
    "I would like to order fries.",
    "What is the cheapest pizza you have?",
    "What are good examples of Italian dishes?",
    "I do not like you.",
    "Shut up.",
    "Please help me with my project for Human-Machine Dialogue.",
    "Is Apple a good brand?",
    "Is comic sans a good text font?",
    "Would you like to go out with me?",
    "I want to order a notebook.",
    "Tell me some interesting facts about yourself.",
    "I want to cry",
    "How can I take the bus to university?",
    "What are recommendations when building a NLU for a human-machine dialogue system?",
    "What is the time?",
    "Kun je in het Nederlands chatten?",
    "Va bene ragazzi, vorrie mangiare un pollo con patate.",
    "Is Kaggle better than Google Colab?",
    "Please make a custom order for my chicken."
]

# Expected dialogue state; every example shares the same intent.
ds = {}

print("Total product", len(examples))

# Per-cell accumulators: np collects NLU predictions, ngt the ground truths.
np, ngt, sentences = [], [], []
exception_counter = 0
i = 0

for i, ood in enumerate(examples, start=1):
    # Lightweight progress indicator.
    if i % 5 == 0:
        print("Done", i)
    ds["intent"] = "out_of_domain"

    # Wrap the sentence in the chat template and generate the NLU output.
    nlu_text = chat_template.format(NLU_PROMPT, ood)
    nlu_input = tokenizer(nlu_text, return_tensors="pt").to(model.device)
    nlu_output = generate(model, nlu_input, tokenizer, max_seq_length)

    dst = DialogueStateTracker()
    try:
        dst.update_nlu_state(nlu_out=nlu_output)
        nlu_out = dst.get_state()["NLU"]

        # Results for this cell only.
        np.append(nlu_out)
        ngt.append(copy.deepcopy(ds))
        sentences.append(ood)

        # Also recorded in the notebook-wide lists.
        predictions.append(nlu_out)
        ground_truths.append(copy.deepcopy(ds))
    except Exception as e:
        # Increment the failure count. The previous `=+ 1` assigned +1 each
        # time, so the count could never exceed one.
        exception_counter += 1
        print(f"An unexpected error occurred at iteration {i}: {e}")
        print("NLU output:", nlu_output)
        print("NLU input:", ood)
        print()

print("Total:", i)

result = evaluate_intent_accuracy_phrase(ngt, np, "out_of_domain")
# NOTE(review): the request_information cell unpacks two values from this
# helper, so a tuple may come back here as well; confirm against its
# definition. Only the first element (the count of correct intents) is
# relevant for out-of-domain accuracy.
correct = result[0] if isinstance(result, tuple) else result

# Failed samples stay in the denominator, so they count as misses.
acc1 = correct / (len(ngt) + exception_counter)
print(f"Accuracy on out-of-domain: {acc1:.2f}")

In [None]:
# Pass this intent's ground truths, predictions and sentences to output_analysis for inspection.
output_analysis(ngt, np, sentences)