# Model agreement

## Setup

In [1]:
import util.local_config as local_config
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, cohen_kappa_score
import matplotlib.pyplot as plt
from util.helper_functions.classification_report_output_processon import print_classification_report

In [2]:
# Columns that identify a single utterance; every join below is keyed on them.
join_keys = ['dialogue_id', 'utterance_id']

# Ground truth: keep only the utterance keys and the label index.
ground_truth_labels = local_config.GROUND_TRUTH_LABELS[
    join_keys + ['ground_truth_emotion_one_hot_index']
]

# Per-model predictions: keep the keys plus the prediction, and rename the
# prediction column so the two models stay distinguishable after merging.
speech_model_results = (
    local_config.SPEECH_MODEL_RESULTS[join_keys + ['model_prediction']]
    .rename(columns={'model_prediction': 'speech_predicted_emotion'})
)
text_model_results = (
    local_config.TEXT_MODEL_RESULTS[join_keys + ['model_prediction']]
    .rename(columns={'model_prediction': 'text_predicted_emotion'})
)

# One row per utterance with both predictions and the ground truth label.
# merge() defaults to an inner join, so an utterance missing from any of the
# three frames is dropped from model_results.
model_results = (
    text_model_results
    .merge(speech_model_results, on=join_keys)
    .merge(ground_truth_labels, on=join_keys)
)

## Main

### Cohen's kappa score

In [3]:
# Cohen's kappa between the two models' predictions: how strongly the text and
# speech models agree with each other (the ground truth is not involved).
# The result is stored under its own name. Assigning it to `cohen_kappa_score`
# would shadow the function imported from sklearn.metrics, so running this
# cell a second time without restarting the kernel would raise a TypeError.
text_speech_kappa = cohen_kappa_score(
    model_results['text_predicted_emotion'],
    model_results['speech_predicted_emotion'],
)

# NOTE(review): the recorded output of this cell is exactly 1.0, i.e. the two
# prediction columns were identical on every merged row. Confirm that
# local_config.SPEECH_MODEL_RESULTS and local_config.TEXT_MODEL_RESULTS really
# hold the outputs of two different models.
print(f"Cohen's kappa score between text and speech model: {text_speech_kappa}")

Cohen's kappa score between text and speech model: 1.0


### Calculate how many times one or both models had the right prediction

In [4]:
# Total number of utterances (rows of the merged frame)
total_utterances = len(model_results)

# Columns compared below, pulled out once for readability
text_pred = model_results['text_predicted_emotion']
speech_pred = model_results['speech_predicted_emotion']
truth = model_results['ground_truth_emotion_one_hot_index']

# Element-wise boolean masks, one entry per utterance
models_agree = text_pred == speech_pred
models_disagree = text_pred != speech_pred
text_correct = text_pred == truth
text_wrong = text_pred != truth
speech_correct = speech_pred == truth
speech_wrong = speech_pred != truth

# Rows are counted by summing the boolean masks (True counts as 1).
# The previous `.loc[mask].count()[0]` relied on an integer key falling back
# to positional access on a label-indexed Series, which pandas has deprecated,
# and it counted non-null values of the first column rather than rows.

# Both models agreed and had the correct prediction
both_agreed_and_correct_count = (models_agree & text_correct).sum()

# The models disagreed and the text model had the correct prediction
text_model_correct_count = (models_disagree & text_correct).sum()

# The models disagreed and the speech model had the correct prediction
speech_model_correct_count = (models_disagree & speech_correct).sum()

# Both models agreed but had the wrong prediction
both_agreed_but_wrong_count = (models_agree & text_wrong).sum()

# The models disagreed and both had the wrong prediction
both_disagreed_and_wrong_count = (models_disagree & text_wrong & speech_wrong).sum()

In [5]:
# Report each outcome as a share of all utterances.
# The labels say "Percentage", so the ratio is rendered with the `%` format
# type (e.g. 30.18%) instead of being printed as a raw fraction (0.3018...).
outcome_counts = [
    ('agreed and had the correct prediction', both_agreed_and_correct_count),
    ('disagreed and the text model had the correct prediction', text_model_correct_count),
    ('disagreed and the speech model had the correct prediction', speech_model_correct_count),
    ('agreed but had the wrong prediction', both_agreed_but_wrong_count),
    ('disagreed and both had the wrong prediction', both_disagreed_and_wrong_count),
]

for outcome, count in outcome_counts:
    print(f'Percentage of times the text and speech model {outcome}: {count / total_utterances:.2%}')

Percentage of times the text and speech model agreed and had the correct prediction: 0.30184331797235026
Percentage of times the text and speech model disagreed and the text model had the correct prediction: 0.0
Percentage of times the text and speech model disagreed and the speech model had the correct prediction: 0.0
Percentage of times the text and speech model agreed but had the wrong prediction: 0.6981566820276498
Percentage of times the text and speech model disagreed and both had the wrong prediction: 0.0
