# Analysis

This notebook contains the code to reproduce the experiments presented in the paper.
For details about the methodology and the datasets used, please refer to the paper.


In [None]:
from collections import defaultdict
import random
from statistics import mean, stdev
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

ParticipantTransitionProbs = Dict[str, Dict[str, float]]

## Outcome predictor

The outcome predictor is defined as follows:

$$\hat{o} = \frac{1}{1 + \exp(-h(L, a^t_u, a^t_{CA}, p, i))}$$

where $L$ is the length of the dialogue, $a^t_u$ and $a^t_{CA}$ are the actions of the user and the conversational agent at time $t$, $p$ is the patience, and $i$ is the inclination towards goal completion. The function $h$ is defined as follows:

$$h(L, a^t_u, a^t_{CA}, p, i) = w_1 * \frac{p}{L} + w_2 * \tanh(i) * \mathbb{1}(a^t_u = \text{F}) + w_3 * \mathbb{1}(a^t_{CA} = \text{A})$$

where $w_1$, $w_2$, and $w_3$ are the weights of the features and $\mathbb{1}$ is the indicator function.


In [None]:
def predict_dialogue_outcome(
    patience: float,
    inclination: float,
    dialogue: List[str],
    weights: Optional[List[float]] = None,
) -> int:
    """Predicts the outcome of a dialogue.

    The score is a weighted sum of three features: the patience divided by
    the dialogue length, tanh(inclination) if the last user action contains
    "F" (0 otherwise), and 1 if the last agent action contains "A" (0
    otherwise). The score goes through a sigmoid that is thresholded at 0.5.

    Args:
        patience: User's patience.
        inclination: User's inclination towards goal completion.
        dialogue: Dialogue to predict outcome for. User actions start with
          "U_" and agent actions with "S_". Must not be empty.
        weights: Weights for the three features. Defaults to
          [1.0, 1.0, 0.5].

    Raises:
        ValueError: If the dialogue is empty.

    Returns:
        Outcome of the dialogue (0: failure, 1: success).
    """
    if not dialogue:
        # The first feature divides by the dialogue length.
        raise ValueError("Cannot predict the outcome of an empty dialogue.")
    if weights is None:
        # Built per call so that no list is shared between invocations.
        weights = [1.0, 1.0, 0.5]

    # Most recent action of each participant; "" when the participant never
    # acted, which makes the corresponding indicator 0.
    last_user_action = next(
        (a for a in reversed(dialogue) if a.startswith("U_")), ""
    )
    last_agent_action = next(
        (a for a in reversed(dialogue) if a.startswith("S_")), ""
    )
    user_indicator = 1.0 if "F" in last_user_action else 0.0
    agent_indicator = 1.0 if "A" in last_agent_action else 0.0

    features = [
        patience / len(dialogue),
        np.tanh(inclination) * user_indicator,
        agent_indicator,
    ]
    h = sum(w * f for w, f in zip(weights, features))
    outcome_prob = 1 / (1 + np.exp(-h))
    return 1 if outcome_prob >= 0.5 else 0

## User simulators and conversational agents

From the annotated dialogues, we can extract the transition probabilities for the user simulators and conversational agents. The table below summarizes the different user simulators and conversational agents.

| Dataset     | User simulator | Conversational agent |
| ----------- | -------------- | -------------------- |
| DSTC1       | U1             | A1                   |
| DSTC2       | U2             | A2                   |
| ODE         | U3             | A3                   |
| SCS         | U4             | A4                   |
| MG-ShopDial | U5             | A5                   |


In [None]:
class UserSimulator:
    """User simulator described by its traits and transition probabilities.

    The transition probabilities map an agent action to the distribution of
    user actions that may follow it.
    """

    def __init__(
        self,
        name: str,
        patience: float,
        inclination: float,
        transition_probabilities: Optional[ParticipantTransitionProbs] = None,
    ) -> None:
        """Initializes a user simulator.

        Args:
            name: Name of the user simulator.
            patience: User's patience.
            inclination: User's inclination towards goal completion.
            transition_probabilities: Transition probabilities. Defaults to None.
        """
        self.name = name
        self.patience = patience
        self.inclination = inclination
        self.transition_probabilities = transition_probabilities
        # Defined up front so that the attributes exist even before
        # add_historical_dialogues is called.
        self.historical_dialogues = []
        self.historical_outcomes = []

    def add_historical_dialogues(self, dialogues: List[List[str]]) -> None:
        """Adds historical dialogues and predicts their outcomes.

        The outcome of each dialogue is predicted with the simulator's
        patience and inclination. Previously stored dialogues are replaced.

        Args:
            dialogues: List of dialogues.
        """
        self.historical_dialogues = dialogues
        self.historical_outcomes = [
            predict_dialogue_outcome(self.patience, self.inclination, dialogue)
            for dialogue in dialogues
        ]

    def get_user_actions(self) -> List[str]:
        """Returns the sorted list of possible user actions.

        The list is empty when no transition probabilities are set.
        """
        policy = self.transition_probabilities or {}
        # Sorted to make the order independent of set iteration order.
        return sorted(
            {
                u_action
                for user_responses in policy.values()
                for u_action in user_responses
            }
        )

    def get_agent_actions(self) -> List[str]:
        """Returns the list of possible agent actions, followed by "End"."""
        policy = self.transition_probabilities or {}
        return list(policy) + ["End"]

In [None]:
def preprocess_dialogues(utterances: pd.DataFrame) -> List[List[str]]:
    """Preprocesses utterances to get dialogues.

    Rows are grouped into dialogues by consecutive "case ID". Every label in
    the "+"-separated "new" column is reduced to its first character and
    prefixed with the value of the "resource" column, e.g., "U_I+U_R".
    Utterances whose "new" value contains "Hello" or "Bye" are skipped, and
    consecutive utterances from the same resource are merged into a single
    turn.

    Args:
        utterances: All utterances in dataset, with the columns "case ID",
          "resource" and "new".

    Returns:
        List of non-empty dialogues, each dialogue is a list of turns.
    """
    dialogues = []
    dialogue = []
    current_case = None

    for _, utterance in utterances.iterrows():
        case_id = utterance["case ID"]
        if case_id != current_case:
            # A new case starts: store the dialogue collected so far.
            if dialogue:
                dialogues.append(dialogue)
            dialogue = []
            current_case = case_id

        # The utterance opening a case is processed like any other one.
        labels = utterance["new"]
        if "Hello" in labels or "Bye" in labels:
            continue

        participant = utterance["resource"]
        actions = {label[0] for label in labels.split("+")}
        if dialogue and dialogue[-1].startswith(f"{participant}_"):
            # Same participant as the previous turn: merge both turns.
            actions.update(a[-1] for a in dialogue.pop().split("+"))

        dialogue.append(
            "+".join(f"{participant}_{action}" for action in sorted(actions))
        )

    # The last dialogue is not followed by a case change.
    if dialogue:
        dialogues.append(dialogue)
    return dialogues

In [None]:
def get_transition_probabilities(
    dialogues: List[List[str]],
) -> Dict[str, Dict[str, float]]:
    """Gets transition probabilities for a list of dialogues.

    Each dialogue is framed by the virtual actions "Start" and "End", and
    the transitions between consecutive actions are counted and normalized
    per source action. Empty dialogues are ignored.

    Args:
        dialogues: Dialogues where each dialogue is a list of actions.

    Returns:
        For each action (including "Start"), the probability of each action
        that follows it (including "End").
    """
    transitions = defaultdict(lambda: defaultdict(int))

    for dialogue in dialogues:
        if not dialogue:
            continue
        # Framing the dialogue also counts "Start" for one-action dialogues.
        path = ["Start", *dialogue, "End"]
        for current_action, next_action in zip(path, path[1:]):
            transitions[current_action][next_action] += 1

    probabilities = {}
    for action, counts in transitions.items():
        # Every recorded action has at least one outgoing transition.
        total = sum(counts.values())
        probabilities[action] = {
            next_action: count / total
            for next_action, count in counts.items()
        }

    return probabilities


def get_participants_transition_probs(
    transition_probs: ParticipantTransitionProbs,
) -> Tuple[ParticipantTransitionProbs, ParticipantTransitionProbs]:
    """Gets the transitions probabilities for each participant.

    A participant's transitions are keyed by the actions of the other
    participant: the user reacts to agent actions ("S_" states) and the
    agent reacts to user actions ("U_" states). Other states, e.g., "Start",
    are left out.

    Args:
        transition_probs: Transition probabilities for all actions.

    Returns:
        Transition probabilities of the user and of the agent, in this
        order.
    """
    user_transition_probs = {
        state: transition
        for state, transition in transition_probs.items()
        if state.startswith("S_")
    }
    agent_transition_probs = {
        state: transition
        for state, transition in transition_probs.items()
        if state.startswith("U_")
    }
    return user_transition_probs, agent_transition_probs

In [None]:
# Registries filled while processing the datasets: user simulators keyed by
# simulator name and conversational agents keyed by agent name.
USER_SIMULATORS = {}
CONVERSATIONAL_AGENTS = {}

# Each entry is (user simulator name, patience, inclination, agent name,
# path to the annotated dataset).
datasets = [
    (
        "U1",
        -0.9,
        -0.9,
        "A1",
        "data/annotated_datasets/1_dstc1_updated.csv",
    ),  # Impatient and critical user
    (
        "U2",
        0.9,
        -0.9,
        "A2",
        "data/annotated_datasets/2_dstc2_updated.csv",
    ),  # Patient and critical user
    (
        "U3",
        -0.9,
        0.9,
        "A3",
        "data/annotated_datasets/5_ode_updated.csv",
    ),  # Impatient and cooperative user
    (
        "U4",
        0.9,
        0.9,
        "A4",
        "data/annotated_datasets/4_scs_updated.csv",
    ),  # Patient and cooperative user
    (
        "U5",
        1e-5,
        1e-5,
        "A5",
        "data/annotated_datasets/6_mgshopdial_updated.csv",
    ),  # Neutral user
]

In [None]:
# Dialogue statistics per conversational agent, displayed further below.
data_stats = {}

# Build one user simulator and one conversational agent per dataset.
for user_sim_name, patience, inclination, agent, path in datasets:
    print(f"Processing {path}")
    data = pd.read_csv(path)
    # Rows without a value in the "new" column cannot be preprocessed.
    data = data.dropna(subset=["new"])
    dialogues = preprocess_dialogues(data)

    # Compute statistics on the dialogues: avg. # utterance and std dev
    num_utterances = [len(dialogue) for dialogue in dialogues]
    data_stats[agent] = {
        "# dialogues": len(dialogues),
        "Avg. # utterances": mean(num_utterances),
        "Std. dev. # utterances": stdev(num_utterances),
    }

    # Estimate the transitions on all dialogues, then split them between the
    # user and the agent.
    transition_probabilities = get_transition_probabilities(dialogues)
    user_transition_probs, agent_transition_probs = (
        get_participants_transition_probs(transition_probabilities)
    )

    population = UserSimulator(
        user_sim_name, patience, inclination, user_transition_probs
    )
    # Also predicts the outcome of every historical dialogue.
    population.add_historical_dialogues(dialogues)
    USER_SIMULATORS[user_sim_name] = population

    CONVERSATIONAL_AGENTS[agent] = {
        "transition_probabilities": agent_transition_probs,
    }

Dialogues statistics


In [None]:
# One row per conversational agent, values shown with 3 decimals.
pd.DataFrame(data_stats).transpose().style.format(precision=3)

In [None]:
# Remove the temporary variables created while processing the datasets.
del (
    data,
    dialogues,
    num_utterances,
    transition_probabilities,
    user_transition_probs,
    agent_transition_probs,
    data_stats,
)

## Generation of synthetic dialogues


In [None]:
def sample_next_action(
    current_action: str, transition_probs: ParticipantTransitionProbs
) -> str:
    """Draws the action following the current one.

    Args:
        current_action: Action to continue from.
        transition_probs: Distribution of the next action for each action.

    Raises:
        KeyError: If the current action is not in the transition
          probabilities.

    Returns:
        Sampled next action.
    """
    outgoing = transition_probs[current_action]
    candidates = list(outgoing.keys())
    weights = list(outgoing.values())
    return np.random.choice(candidates, p=weights)


def sample_dialogue(
    agent_transition_probs: ParticipantTransitionProbs,
    user_transition_probs: ParticipantTransitionProbs,
) -> List[str]:
    """Samples a dialogue.

    The first action is drawn uniformly from the states of both
    participants. User actions ("U_" prefix) are answered with the agent's
    transition probabilities, all other actions with the user's ones. When
    an action is unknown to the responding participant, sampling continues
    from the last component of the "+"-joined action. The dialogue ends when
    "End" is sampled, or when no transition is available for the current
    action even after this fallback.

    Args:
        agent_transition_probs: Transition probabilities for the agent.
        user_transition_probs: Transition probabilities for the user.

    Raises:
        IndexError: If both transition probabilities are empty.

    Returns:
        Dialogue as list of actions.
    """
    current_action = random.choice(
        list(agent_transition_probs.keys())
        + list(user_transition_probs.keys())
    )
    dialogue = [current_action]

    while True:
        responder_probs = (
            agent_transition_probs
            if current_action.startswith("U_")
            else user_transition_probs
        )
        try:
            next_action = sample_next_action(current_action, responder_probs)
        except KeyError:
            fallback_action = current_action.split("+")[-1]
            if fallback_action == current_action:
                # The fallback makes no progress: retrying would loop
                # forever, so the dialogue stops here.
                break
            current_action = fallback_action
            continue

        if next_action == "End":
            break
        dialogue.append(next_action)
        current_action = next_action

    return dialogue


def sample_dialogues(
    agent_transition_probs: ParticipantTransitionProbs,
    user_transition_probs: ParticipantTransitionProbs,
    num_dialogues: int,
    patience: float,
    inclination: float,
) -> List[Tuple[List[str], int]]:
    """Samples dialogues and predicts their outcome.

    Args:
        agent_transition_probs: Transition probabilities for the agent.
        user_transition_probs: Transition probabilities for the user.
        num_dialogues: Number of dialogues to sample.
        patience: User's patience.
        inclination: User's inclination towards goal completion.

    Returns:
        Dialogues, each paired with its predicted outcome (0: failure,
        1: success).
    """
    dialogues = []
    for _ in range(num_dialogues):
        dialogue = sample_dialogue(
            agent_transition_probs, user_transition_probs
        )
        outcome = predict_dialogue_outcome(patience, inclination, dialogue)
        dialogues.append((dialogue, outcome))

    return dialogues

## Metrics

This part contains the methods to compute the metrics associated with the training and evaluation objectives.


In [None]:
from scipy.spatial import distance
from rouge_score import rouge_scorer
from itertools import product

### Training

We choose to use Jensen-Shannon divergence (JSD) and ROUGE-L as metrics to assess the similarity between the user population and simulated user populations. These allow us to make an assessment at the utterance- and dialogue-level respectively.


In [None]:
def compute_jsd(
    user_policy: ParticipantTransitionProbs,
    simulated_user_policy: ParticipantTransitionProbs,
) -> float:
    """Computes Jensen-Shannon divergence between user and simulated user
    policies.

    For each state of the user policy, the transition probabilities of both
    policies are compared over the transitions of the user policy, and the
    per-state values are averaged. Transitions missing from the simulated
    policy get a probability of epsilon. Neither policy is modified.

    Note: the per-state value comes from scipy's `jensenshannon`, which
    returns the Jensen-Shannon distance, i.e., the square root of the
    divergence, and normalizes its inputs if they do not sum to 1.

    Args:
        user_policy: User policy.
        simulated_user_policy: Simulated user policy.

    Returns:
        Average per-state value, 0.0 if the user policy has no state.
    """
    if not user_policy:
        return 0.0

    epsilon = 1e-9
    total_jsd = 0.0
    for state, transitions_probabilities in user_policy.items():
        # Read-only lookup: writing the aligned distribution back into
        # simulated_user_policy would alter the caller's policy.
        simulated_transitions = simulated_user_policy.get(state, {})

        probabilities = np.array(list(transitions_probabilities.values()))
        simulated_probabilities = np.array(
            [
                simulated_transitions.get(action, epsilon)
                for action in transitions_probabilities
            ]
        )

        total_jsd += distance.jensenshannon(
            probabilities, simulated_probabilities, base=2
        )
    return total_jsd / len(user_policy)


def compute_rouge_score(
    historical_dialogues: List[List[str]], simulated_dialogues: List[List[str]]
) -> float:
    """Computes ROUGE-L score between historical and simulated dialogues.

    Each dialogue is turned into a string by joining its actions with
    spaces. The score is the average ROUGE-L F-measure over all pairs of
    historical and simulated dialogues.

    Args:
        historical_dialogues: Historical dialogues.
        simulated_dialogues: Simulated dialogues.

    Returns:
        ROUGE-L score, 0.0 if there is no pair to compare.
    """
    historical_texts = [" ".join(d) for d in historical_dialogues]
    simulated_texts = [" ".join(d) for d in simulated_dialogues]

    num_pairs = len(historical_texts) * len(simulated_texts)
    if num_pairs == 0:
        # Without any pair the average is undefined.
        return 0.0

    # NOTE(review): the scorer's default tokenizer presumably lowercases the
    # text and splits it on non-alphanumeric characters, in which case
    # "U_I" is compared as the tokens "u" and "i" - confirm.
    scorer = rouge_scorer.RougeScorer(["rougeL"])
    total_score = sum(
        scorer.score(h, s)["rougeL"].fmeasure
        for h, s in product(historical_texts, simulated_texts)
    )
    return total_score / num_pairs

### Evaluation

We use the success rate as the performance metric to evaluate the conversational agents.


In [None]:
def compute_success_rate(successes: List[int]) -> float:
    """Computes success rate.

    Args:
        successes: Outcome of each dialogue (0: failure, 1: success).

    Returns:
        Fraction of successful dialogues, 0.0 if there is no dialogue.
    """
    if len(successes) == 0:
        # Without any dialogue the rate is undefined.
        return 0.0
    return sum(successes) / len(successes)

## Leave-one-out cross-validation

In this part, we perform a leave-one-out experiment to answer the following question: is the optimal user simulator for training also the best for evaluation, and vice versa?


In [None]:
import time

In [None]:
# Each reference pairs a user population with the conversational agent built
# from the same dataset.
references = [
    ("U1", "A1"),
    ("U2", "A2"),
    ("U3", "A3"),
    ("U4", "A4"),
    ("U5", "A5"),
]
# Number of dialogues sampled for each (agent, user simulator) pair.
num_synthetic_dialogues = 500

# results[agent][user simulator name] -> metrics
results = defaultdict(dict)

for user_pop, agent in references:
    print(f"Reference {agent}")
    user_population = USER_SIMULATORS[user_pop]
    # NOTE(review): the historical success rate of U3 is hard-coded instead
    # of being computed from the predicted outcomes; the origin of 0.92 is
    # not visible here - confirm.
    if user_pop == "U3":
        historical_success_rate = 0.92
    else:
        historical_success_rate = compute_success_rate(
            user_population.historical_outcomes
        )

    for _, user_simulator in USER_SIMULATORS.items():
        # Leave out the simulator built from the reference population.
        if user_pop == user_simulator.name:
            continue

        print(f"{time.ctime()} - User simulator: {user_simulator.name}")

        # Generate synthetic dialogues
        synthetic_dialogues_data = sample_dialogues(
            CONVERSATIONAL_AGENTS[agent]["transition_probabilities"],
            user_simulator.transition_probabilities,
            num_synthetic_dialogues,
            user_simulator.patience,
            user_simulator.inclination,
        )

        # Separate the dialogues from their predicted outcomes.
        simulated_dialogues = []
        simulated_dialogues_success = []
        for dialogue, success in synthetic_dialogues_data:
            simulated_dialogues.append(dialogue)
            simulated_dialogues_success.append(success)

        # Compute ROUGE-L score
        rouge_l_score = compute_rouge_score(
            user_population.historical_dialogues,
            simulated_dialogues,
        )

        # Compute success rate
        success_rate = compute_success_rate(simulated_dialogues_success)

        # Absolute difference success rate
        abs_diff_success_rate = abs(success_rate - historical_success_rate)

        results[agent][user_simulator.name] = {
            "ROUGE-L": rouge_l_score,
            "Success rate": success_rate,
            "Abs. diff. success rate": abs_diff_success_rate,
        }

In [None]:
# Flatten the nested results into one row per (reference agent, user
# simulator) pair.
rows = []
for agent, d in results.items():
    for simulated_user, metrics in d.items():
        rows.append(
            (
                agent,
                simulated_user,
                # Metric values in insertion order; it matches the order of
                # the column names below.
                *(value for _, value in metrics.items()),
            )
        )

summary = pd.DataFrame(
    rows,
    columns=[
        "Reference",
        "User Simulator",
        "ROUGE-L",
        "Success rate",
        "Abs. diff. success rate",
    ],
)
summary.set_index(["Reference", "User Simulator"], inplace=True)

# Display the table with 3 decimals.
summary.style.format(precision=3)

Jensen-Shannon divergence


In [None]:
# jsd_results[user1][user2] -> value of compute_jsd with the policy of user1
# as user policy and the policy of user2 as simulated user policy.
jsd_results = defaultdict(dict)

# All ordered pairs of distinct user simulators.
for user1, user2 in product(USER_SIMULATORS.keys(), repeat=2):
    if user1 != user2:
        user_policy1 = USER_SIMULATORS[user1].transition_probabilities
        user_policy2 = USER_SIMULATORS[user2].transition_probabilities
        jsd = compute_jsd(user_policy1, user_policy2)
        jsd_results[user1][user2] = jsd

In [None]:
# Columns are the first user of each pair (user1), rows the second (user2).
pd.DataFrame(jsd_results).sort_index().style.format(precision=3)