In [None]:
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.metrics import classification_report, f1_score, accuracy_score

In [None]:
TRAINED_MODEL_NAME = "2021-12-20T17-54-59_MULT_DistilBERT"
TRAINED_MODEL_DIR = f"../models/{TRAINED_MODEL_NAME}"

# Load the ground-truth labels and the keyword-baseline predictions from the
# trained model's directory.
df_true = pd.read_csv(f"{TRAINED_MODEL_DIR}/labeled_messages_ground_truth.csv")
df_predicted = pd.read_csv(
    f"{TRAINED_MODEL_DIR}/labeled_messages_keywords_baseline_prediction.csv"
)

# Load mappings between labels and label ids.
# The files are opened in `with` blocks so the handles are closed right after
# unpickling instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only point TRAINED_MODEL_DIR at model directories from a trusted source.
with open(f"{TRAINED_MODEL_DIR}/id2label.p", "rb") as id2label_file:
    id2label = pickle.load(id2label_file)
with open(f"{TRAINED_MODEL_DIR}/label2id.p", "rb") as label2id_file:
    label2id = pickle.load(label2id_file)

In [None]:
# Sanity check: the ground-truth table should have as many label columns as
# there are entries in the label -> id mapping. Only the counts are compared.
df_only_label_columns = df_true.drop(columns=["message_hash", "content"])
misconception_columns = list(df_only_label_columns.columns)
n_table_labels = len(misconception_columns)
n_mapped_labels = len(label2id)
if n_table_labels != n_mapped_labels:
    print(
        "Warning: The evaluation will not work as expected because the baseline uses a "
        "different number of labels than the model it is supposed to be compared with"
    )

In [None]:
# Turn every row of label columns into a boolean NumPy vector that is True
# wherever the cell equals 1 -- once for the ground truth, once for the baseline.
true_labels_lists = df_only_label_columns.values.tolist()
true_labels_flat = list(map(np.array, true_labels_lists))
true_bools = [row_vector == 1 for row_vector in true_labels_flat]

df_predicted_label_columns = df_predicted.drop(columns=["message_hash", "content"])
predicted_labels_lists = df_predicted_label_columns.values.tolist()
predicted_labels_flat = list(map(np.array, predicted_labels_lists))
predicted_bools = [row_vector == 1 for row_vector in predicted_labels_flat]

In [None]:
# Print the micro-averaged F1 score and the accuracy, build the per-label
# classification report, save it to the model's results directory and print it.
print("F1-Score: ", f1_score(true_bools, predicted_bools, average="micro"))
print("Accuracy: ", accuracy_score(true_bools, predicted_bools), "\n")
clf_report = classification_report(
    true_bools, predicted_bools, target_names=misconception_columns
)

# Save report. exist_ok=True creates the directory only when it is missing, so
# no separate (race-prone) existence check is needed.
os.makedirs(f"{TRAINED_MODEL_DIR}/results/", exist_ok=True)
with open(f"{TRAINED_MODEL_DIR}/results/keywords_baseline_report.txt", "w") as f:
    f.write(clf_report)

print(clf_report)

In [None]:
# For every message, collect the positions at which its boolean label vector is
# True; these positions are used as keys into id2label to gather label names.
true_label_idxs = [
    np.where(mask)[0].flatten().tolist() for mask in true_bools
]
pred_label_idxs = [
    np.where(mask)[0].flatten().tolist() for mask in predicted_bools
]

In [None]:
# Translate each list of label positions into a list of label names via
# id2label. An empty index list is passed through unchanged (same list object).
# NOTE(review): assumes the column positions of the CSV label columns line up
# with the ids used as keys in id2label -- TODO confirm.
true_label_texts = [
    [id2label[idx] for idx in idxs] if idxs else idxs for idxs in true_label_idxs
]
pred_label_texts = [
    [id2label[idx] for idx in idxs] if idxs else idxs for idxs in pred_label_idxs
]

In [None]:
# Collect the value of the "content" column for every row of the ground-truth
# table (each row of the single-column array is unpacked to its one element).
comment_texts = [text for (text,) in df_true[["content"]].values]

In [None]:
# Assemble one row per message (text, true label names, predicted label names),
# write the table as CSV into the results directory and display it.
comparison_columns = {
    "comment_text": comment_texts,
    "true_labels": true_label_texts,
    "pred_labels": pred_label_texts,
}
comparisons_df = pd.DataFrame(comparison_columns)
comparison_csv_path = (
    f"{TRAINED_MODEL_DIR}/results/keywords_baseline_true_predicted_comparison.csv"
)
comparisons_df.to_csv(comparison_csv_path)
comparisons_df