In [None]:
from lib.dataset_utils import *
import pickle
import numpy as np
import nltk
import string
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from lib.scores import *
from lib.plot_utils import *
from lib.models import bootstrap_test, Bert, Roberta
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Model and dataset identifiers; these exact strings are the keys of the
# lookup tables defined in this notebook.
MODELS = [
    "Bayes",
    "DecisionTree",
    "RandomForest",
    "Bert",
    "Roberta",
    "SocBert",
    "Llama3 Zero Shot",
    "Llama3 Three Shot",
]
DATASETS = ["GoEmotions", "TwitterData", "GoEmotionsGrouped"]
BASE_MODELS_DIR = "./checkpoints/"

# Checkpoint folders. The grouped GoEmotions checkpoints live in the same
# folder as the plain GoEmotions ones.
_GOEMOTIONS_DIR = BASE_MODELS_DIR + "GoEmotions/"
_TWITTER_DIR = BASE_MODELS_DIR + "TwitterData/"

# Model name -> checkpoint file, for the full GoEmotions label set.
GOEMOTIONS_MODELS_PATH = {
    "Bayes": _GOEMOTIONS_DIR + "nb_classifier.pkl",
    "DecisionTree": _GOEMOTIONS_DIR + "dt_classifier.pkl",
    "RandomForest": _GOEMOTIONS_DIR + "rf_classifier.pkl",
    "Bert": _GOEMOTIONS_DIR + "GoEmotions_Bert.pth",
    "Roberta": _GOEMOTIONS_DIR + "GoEmotions_Roberta.pth",
    "SocBert": _GOEMOTIONS_DIR + "GoEmotions_Socbert.pth",
}

# Model name -> checkpoint file, for the grouped GoEmotions labels.
GOEMOTIONS_GROUPED_MODELS_PATH = {
    "Bayes": _GOEMOTIONS_DIR + "GoEmotions_grouped_bayes.pkl",
    "DecisionTree": _GOEMOTIONS_DIR + "GoEmotions_grouped_dt.pkl",
    "RandomForest": _GOEMOTIONS_DIR + "GoEmotions_grouped_rt.pkl",
    "Bert": _GOEMOTIONS_DIR + "GoEmotions_Bert_Ekman.pth",
    "Roberta": _GOEMOTIONS_DIR + "GoEmotions_Grouped_Roberta.pth",
    "SocBert": _GOEMOTIONS_DIR + "GoEmotions_Socbert_Ekman.pth",
}

# Model name -> checkpoint file, for TwitterData.
TWITTER_MODELS_PATH = {
    "Bayes": _TWITTER_DIR + "nb_classifier.pkl",
    "DecisionTree": _TWITTER_DIR + "dt_classifier.pkl",
    "RandomForest": _TWITTER_DIR + "rf_classifier.pkl",
    "Bert": _TWITTER_DIR + "TwitterData_Bert.pth",
    "Roberta": _TWITTER_DIR + "TwitterData_Roberta.pth",
    "SocBert": _TWITTER_DIR + "TwitterData_Socbert.pth",
}

# Dataset name -> its checkpoint table.
DATASET_TO_PATH_DICT = {
    "GoEmotions": GOEMOTIONS_MODELS_PATH,
    "TwitterData": TWITTER_MODELS_PATH,
    "GoEmotionsGrouped": GOEMOTIONS_GROUPED_MODELS_PATH,
}

# Dataset name -> number of labels (passed to the transformer wrapper as
# `n_classes`).
DATASET_N_LABELS = {
    "GoEmotions": 28,
    "TwitterData": 6,
    "GoEmotionsGrouped": 6,
}

In [None]:
def get_goemotions(cleaning=True):
    """Load the GoEmotions dataset through ``load_dataset``.

    Args:
        cleaning: when truthy, load ``DatasetEnum.GoEmotionsCleaned``;
            otherwise load ``DatasetEnum.GoEmotions`` with
            ``k_hot_encode=True``.

    Returns:
        Whatever ``load_dataset`` returns; callers in this notebook unpack
        it as ``(train, val, test)``.
    """
    if not cleaning:
        return load_dataset(DatasetEnum.GoEmotions, k_hot_encode=True)
    return load_dataset(DatasetEnum.GoEmotionsCleaned)

def get_twitterdata(cleaning=True):
    """Load the TwitterData dataset through ``load_dataset``.

    Args:
        cleaning: when truthy, load ``DatasetEnum.TwitterDataCleaned``;
            otherwise load ``DatasetEnum.TwitterData`` with
            ``k_hot_encode=True``.

    Returns:
        Whatever ``load_dataset`` returns; callers in this notebook unpack
        it as ``(train, val, test)``.
    """
    if not cleaning:
        return load_dataset(DatasetEnum.TwitterData, k_hot_encode=True)
    return load_dataset(DatasetEnum.TwitterDataCleaned)

def get_goemotions_grouped(cleaning=True):
    """Load GoEmotions and apply the emotion grouping to each split.

    Args:
        cleaning: forwarded unchanged to ``get_goemotions``.

    Returns:
        A ``(train, val, test)`` tuple in which every split has been passed
        through ``goemotions_apply_emotion_mapping``.
    """
    train_split, val_split, test_split = get_goemotions(cleaning)
    return tuple(
        goemotions_apply_emotion_mapping(split)
        for split in (train_split, val_split, test_split)
    )

# Dataset name -> loader function; each loader takes the `cleaning` flag.
DATASET_LOADERS = dict(
    GoEmotions=get_goemotions,
    TwitterData=get_twitterdata,
    GoEmotionsGrouped=get_goemotions_grouped,
)

# Model name -> value of the `cleaning` flag used when loading the targets
# that model is scored against.
REQUIRES_CLEANING = {
    **dict.fromkeys(("Bayes", "DecisionTree", "RandomForest"), True),
    **dict.fromkeys(("Bert", "Roberta", "SocBert"), False),
    **dict.fromkeys(("Llama3 Zero Shot", "Llama3 Three Shot"), True),
}

def get_dataset(dataset, cleaning=True):
    """Load a dataset by name.

    Args:
        dataset: key of ``DATASET_LOADERS``.
        cleaning: passed positionally to the selected loader.

    Returns:
        The loader's return value.

    Raises:
        KeyError: if ``dataset`` is not a key of ``DATASET_LOADERS``.
    """
    loader = DATASET_LOADERS[dataset]
    return loader(cleaning)

def bayes_predict(dataset):
    """Predict test-set labels with the pickled "Bayes" model of a dataset.

    Args:
        dataset: key of ``DATASET_TO_PATH_DICT``.

    Returns:
        DataFrame of the model's predictions, with columns taken from every
        test-split column after the first.
    """
    checkpoint_path = DATASET_TO_PATH_DICT[dataset]["Bayes"]
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoints from a trusted source.
    with open(checkpoint_path, 'rb') as checkpoint_file:
        classifier = pickle.load(checkpoint_file)
    # Test split from the default (cleaned) variant of the dataset.
    _, _, test_split = get_dataset(dataset)
    raw_predictions = classifier.predict(test_split['text'])
    label_columns = test_split.columns[1:]
    return pd.DataFrame(raw_predictions, columns=label_columns)

def clean_content(text):
    """Tokenize, lemmatize and strip punctuation tokens from ``text``.

    Each token is POS-tagged and the first letter of its tag is mapped to a
    WordNet POS ('n', 'v', 'r', 'a'; anything else falls back to 'n') before
    lemmatization. Lemmas that occur as a substring of
    ``string.punctuation`` are dropped.

    Args:
        text: string to clean.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # NOTE(review): nltk.pos_tag needs a POS-tagger resource; the setup cell
    # only downloads 'punkt' and 'wordnet' - confirm the tagger is installed.
    pos_to_wordnet = {'N': 'n', 'V': 'v', 'R': 'r', 'J': 'a'}
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    kept_lemmas = []
    for token, pos in tagged_tokens:
        lemma = wordnet_lemmatizer.lemmatize(token, pos_to_wordnet.get(pos[0], 'n'))
        # `in` on a str is a substring test, so multi-character tokens such
        # as "..." are kept.
        if lemma not in string.punctuation:
            kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

def clean_df(df, text_col, out_col):
    """Apply ``clean_content`` to one column and store the result in another.

    Args:
        df: DataFrame to process; it is modified in place.
        text_col: name of the column holding the raw text.
        out_col: name of the column that receives the cleaned text (may be
            the same as ``text_col``).

    Returns:
        The same ``df`` object.
    """
    cleaned_text = df[text_col].apply(clean_content)
    df[out_col] = cleaned_text
    return df

def decision_tree_predict(dataset):
    """Predict test-set labels with the pickled "DecisionTree" model.

    The text column is run through ``clean_df`` (overwriting 'text') before
    being handed to the model.

    Args:
        dataset: key of ``DATASET_TO_PATH_DICT``.

    Returns:
        DataFrame of the model's predictions, with columns taken from every
        test-split column after the first.
    """
    checkpoint_path = DATASET_TO_PATH_DICT[dataset]["DecisionTree"]
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoints from a trusted source.
    with open(checkpoint_path, 'rb') as checkpoint_file:
        tree_model = pickle.load(checkpoint_file)
    _, _, test_split = get_dataset(dataset)
    # Lemmatize / strip punctuation in place in the 'text' column.
    cleaned_split = clean_df(test_split, 'text', 'text')
    raw_predictions = tree_model.predict(cleaned_split['text'])
    label_columns = cleaned_split.columns[1:]
    return pd.DataFrame(raw_predictions, columns=label_columns)

def random_forest_predict(dataset):
    """Predict test-set labels with the pickled "RandomForest" model.

    The text column is run through ``clean_df`` (overwriting 'text') before
    being handed to the model.

    Args:
        dataset: key of ``DATASET_TO_PATH_DICT``.

    Returns:
        DataFrame of the model's predictions, with columns taken from every
        test-split column after the first.
    """
    checkpoint_path = DATASET_TO_PATH_DICT[dataset]["RandomForest"]
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoints from a trusted source.
    with open(checkpoint_path, 'rb') as checkpoint_file:
        forest_model = pickle.load(checkpoint_file)
    _, _, test_split = get_dataset(dataset)
    # Lemmatize / strip punctuation in place in the 'text' column.
    cleaned_split = clean_df(test_split, 'text', 'text')
    raw_predictions = forest_model.predict(cleaned_split['text'])
    label_columns = cleaned_split.columns[1:]
    return pd.DataFrame(raw_predictions, columns=label_columns)

def transformer_predict(dataset, model_name):
    """Predict test-set labels with a fine-tuned transformer checkpoint.

    For every dataset except "TwitterData" the decision threshold is tuned
    on the validation split (macro F1) and then applied to the TEST scores.
    For "TwitterData" the arg-max class is taken instead.

    Args:
        dataset: key of ``DATASET_TO_PATH_DICT`` / ``DATASET_N_LABELS``.
        model_name: key of the dataset's checkpoint table
            ("Bert", "Roberta" or "SocBert").

    Returns:
        DataFrame of integer predictions with one row per test example and
        columns taken from every test-split column after the first.
    """
    model_path = DATASET_TO_PATH_DICT[dataset][model_name]
    params = {
        "n_classes": DATASET_N_LABELS[dataset],
    }
    # NOTE(review): every checkpoint, including "Roberta", is loaded through
    # the Bert wrapper while the imported Roberta class is unused - confirm
    # this is intended.
    model = Bert(checkpoint=model_path, model_params_dict=params)
    _, val_df, test_df = get_dataset(dataset, cleaning=False)
    label_columns = test_df.columns[1:]

    test_scores = model.predict(test_df)
    if dataset != "TwitterData":
        # Keep the validation scores in their own variable: they are only
        # used to tune the threshold and must not replace the test scores.
        val_scores = model.predict(val_df)
        thresh, _ = tune_sigmoid_threshold(
            val_df[val_df.columns[1:]],
            val_scores,
            f1_score,
            metric_params={"average": "macro", "zero_division": 0},
        )
        predictions = (test_scores > thresh).astype(int)
    else:
        class_ids = np.asarray(np.argmax(test_scores, axis=1))
        if len(label_columns) > 1:
            # A 1-D class-id vector cannot fill several label columns;
            # one-hot encode it so the frame matches the k-hot targets.
            predictions = np.eye(len(label_columns), dtype=int)[class_ids]
        else:
            predictions = class_ids
    return pd.DataFrame(predictions, columns=label_columns)

def bert_predict(dataset):
    """Return ``transformer_predict`` output for the "Bert" checkpoint of ``dataset``."""
    return transformer_predict(dataset, model_name="Bert")

def socbert_predict(dataset):
    """Return ``transformer_predict`` output for the "SocBert" checkpoint of ``dataset``."""
    return transformer_predict(dataset, model_name="SocBert")

def roberta_predict(dataset):
    """Return ``transformer_predict`` output for the "Roberta" checkpoint of ``dataset``."""
    return transformer_predict(dataset, model_name="Roberta")

# Folder holding the Llama prediction CSV files read by this notebook.
_LLAMA_PREDICTIONS_DIR = "./results/llama_predictions/"

# Dataset name -> CSV of zero-shot Llama3 predictions.
LLAMA_ZERO_CSV = {
    "GoEmotions": _LLAMA_PREDICTIONS_DIR + "llama_multi_0_predictions.csv",
    "TwitterData": _LLAMA_PREDICTIONS_DIR + "llama_single_0_predictions.csv",
    "GoEmotionsGrouped": _LLAMA_PREDICTIONS_DIR + "llama_grouped_0_predictions.csv",
}

# Dataset name -> CSV of three-shot Llama3 predictions.
LLAMA_THREE_CSV = {
    "GoEmotions": _LLAMA_PREDICTIONS_DIR + "llama_multi_396_predictions.csv",
    "TwitterData": _LLAMA_PREDICTIONS_DIR + "llama_single_355_predictions.csv",
    "GoEmotionsGrouped": _LLAMA_PREDICTIONS_DIR + "llama_grouped_377_predictions.csv",
}

def llama3_three_predict(dataset):
    """Read the stored three-shot Llama3 predictions for ``dataset``.

    Args:
        dataset: key of ``LLAMA_THREE_CSV``.

    Returns:
        The CSV loaded with its first column as index, or ``None`` when the
        table maps the dataset to ``None``.
    """
    csv_path = LLAMA_THREE_CSV[dataset]
    if csv_path is not None:
        return pd.read_csv(csv_path, index_col=0)
    return None

def llama3_zero_predict(dataset):
    """Read the stored zero-shot Llama3 predictions for ``dataset``.

    Args:
        dataset: key of ``LLAMA_ZERO_CSV``.

    Returns:
        The CSV loaded with its first column as index, or ``None`` when the
        table maps the dataset to ``None``.
    """
    csv_path = LLAMA_ZERO_CSV[dataset]
    if csv_path is not None:
        return pd.read_csv(csv_path, index_col=0)
    return None

# Model name -> function that takes a dataset name and returns that model's
# predictions.
PREDICTOR_DICT = dict([
    ("Bayes", bayes_predict),
    ("DecisionTree", decision_tree_predict),
    ("RandomForest", random_forest_predict),
    ("Bert", bert_predict),
    ("Roberta", roberta_predict),
    ("SocBert", socbert_predict),
    ("Llama3 Zero Shot", llama3_zero_predict),
    ("Llama3 Three Shot", llama3_three_predict),
])

def predict(model, dataset):
    """Run the predictor registered for ``model`` on ``dataset``.

    Args:
        model: key of ``PREDICTOR_DICT``.
        dataset: dataset name forwarded to the predictor.

    Returns:
        Whatever the selected predictor returns.

    Raises:
        KeyError: if ``model`` is not a key of ``PREDICTOR_DICT``.
    """
    predictor = PREDICTOR_DICT[model]
    return predictor(dataset)

In [None]:
def get_model_scores(model, dataset):
    """Compute the score dictionary of ``model`` on the test split of ``dataset``.

    Args:
        model: model name (key of ``PREDICTOR_DICT`` and ``REQUIRES_CLEANING``).
        dataset: dataset name.

    Returns:
        The value returned by ``get_scores_dict`` for the model's predictions,
        the test split, and the test-split columns after the first.
    """
    model_predictions = predict(model, dataset)
    # Targets come from the dataset variant (cleaned or not) tied to the model.
    needs_cleaning = REQUIRES_CLEANING[model]
    _, _, test_split = get_dataset(dataset, cleaning=needs_cleaning)
    return get_scores_dict(model_predictions, test_split, test_split.columns[1:])

def print_results(model_name, dataset_name, predictions_df, targets_df):
    """Print a header, draw the diagnostic plots and report a model's scores.

    Args:
        model_name: model name, printed and forwarded to ``get_model_scores``.
        dataset_name: dataset name, printed and forwarded to ``get_model_scores``.
        predictions_df: DataFrame of predictions (label columns only).
        targets_df: DataFrame of targets (label columns only); its columns
            label both axes of the plots and the report.

    Returns:
        The scores returned by ``get_model_scores``.
    """
    label_names = targets_df.columns
    for header_line in (
        "----------------------------------------------------------------------",
        f"Model: {model_name}",
        f"Dataset: {dataset_name}",
    ):
        print(header_line)
    # Normalized confusion heatmap, then the per-class score bars.
    plot_multilabel_confusion_heatmap(
        targets_df.values,
        predictions_df.values,
        label_true=label_names,
        label_pred=label_names,
        normalize=True,
    )
    plot_score_barplot(targets_df.values, predictions_df.values, label_names)
    # NOTE(review): get_model_scores calls predict() again, so inference runs
    # a second time even though predictions_df is already available here.
    scores = get_model_scores(model_name, dataset_name)
    custom_classification_report(scores, label_names)
    return scores

def comparison_bar_plot(scores_dict):
    """Draw one bar chart per dataset and metric comparing the models.

    For every name in ``DATASETS`` two figures are shown: macro F1 and macro
    Jaccard, with one bar per model found in ``scores_dict[dataset]``.

    Args:
        scores_dict: mapping dataset -> model -> scores, where the scores
            expose ``['macro avg']['f1-score']`` and ``['macro avg']['jaccard']``.
    """
    metrics = (('f1-score', 'F1 Score'), ('jaccard', 'Jaccard'))
    for dataset in DATASETS:
        for score_id, score_name in metrics:
            macro_by_model = {
                model: model_scores['macro avg'][score_id]
                for model, model_scores in scores_dict[dataset].items()
            }
            model_names = list(macro_by_model.keys())
            macro_values = list(macro_by_model.values())
            plt.figure(figsize=(10, 5))
            sns.barplot(
                x=model_names,
                y=macro_values,
                palette=sns.color_palette("hls", len(macro_by_model)),
            )
            # Model names are long; rotate the tick labels by 90 degrees.
            plt.xticks(rotation=90)
            plt.title(f"Macro {score_name} for {dataset}")
            plt.xlabel("Model")
            plt.ylabel(f"Macro {score_name}")
            plt.show()

def print_dataset_results(dataset):
    """Evaluate every model in ``MODELS`` on ``dataset`` and print the results.

    Models whose predictor returns ``None`` are skipped.

    Args:
        dataset: dataset name.

    Returns:
        Dict mapping model name -> scores returned by ``print_results``.
    """
    scores = {}
    for model in MODELS:
        predictions_df = predict(model, dataset)
        if predictions_df is not None:
            _, _, targets_df = get_dataset(dataset, cleaning=REQUIRES_CLEANING[model])
            label_columns = targets_df.columns[1:]
            # Select prediction columns by name so their order matches the targets.
            scores[model] = print_results(
                model,
                dataset,
                predictions_df[label_columns],
                targets_df[label_columns],
            )
    return scores

## Performance comparison on test data

In [None]:
# Evaluate every model on every dataset; the scores are keyed by dataset
# name, then by model name.
all_scores = dict()
for dataset in DATASETS:
    dataset_scores = print_dataset_results(dataset)
    all_scores[dataset] = dataset_scores

In [None]:
# Bar charts of the macro scores collected in `all_scores`: one figure per
# dataset and metric.
comparison_bar_plot(all_scores)

## Statistical testing

In [None]:
N_TESTS = 100
SAMPLE_SIZE = 200


def _align_to_labels(predictions_df, label_columns):
    """Return predictions with columns selected in label order.

    `print_dataset_results` selects prediction columns by name before
    scoring; doing the same here stops `.values` from comparing mismatched
    columns. If the prediction frame lacks some label column it is returned
    unchanged, so the comparison falls back to positional order.
    """
    if set(label_columns) <= set(predictions_df.columns):
        return predictions_df[label_columns]
    return predictions_df


# For each dataset, pick the model with the highest macro F1 and compare it
# with every other model through a bootstrap test.
best_models = {}
for dataset in DATASETS:
    best_model = max(all_scores[dataset].items(), key=lambda x: x[1]['macro avg']['f1-score'])[0]
    best_models[dataset] = best_model
    # Test-split targets without the first (text) column.
    # NOTE(review): labels come from the default (cleaned) dataset variant,
    # while the transformer predictors load the uncleaned variant - confirm
    # both variants keep the same rows in the same order.
    labels = get_dataset(dataset)[2]
    labels = labels[labels.columns[1:]]
    best_predictions = _align_to_labels(predict(best_model, dataset), labels.columns)
    for model in MODELS:
        if model == best_model:
            continue
        predictions = predict(model, dataset)
        if predictions is None:
            continue
        predictions = _align_to_labels(predictions, labels.columns)
        print(f"Bootstrap test between {best_model} and {model} on {dataset}")
        bootstrap_test(best_predictions.values, predictions.values, labels, n_tests=N_TESTS, sample_size=SAMPLE_SIZE, metric_fun=f1_score, metric_name="Macro F1", metric_params={"average": "macro", "zero_division": 0})
        print("----------------------------------------------------------------------")