# Notebook: Compare Model Performance

In [23]:
from scipy.stats import ttest_rel, wilcoxon
from statsmodels.stats import multitest
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import levene
import statsmodels.api as sm
from scipy import stats
import numpy as np
import json

In [24]:
# Display names for model keys. Keys such as "GPT-3" and "Llama3_70B" are the
# ones passed as the model part of the result file names further below.
# NOTE(review): "Llama2_70B" maps to "llama-3-70B" -- this looks like a
# copy-paste slip for a Llama-2 label; confirm before relying on this entry.
LLMS_ENCODED = {
    "GPT-3": "GPT-3.5-turbo",
    "Llama2_70B": "llama-3-70B",
    "Llama3_70B": "Llama-3-70B",
}

In [25]:
def round_number(num, decimal_places):
    """Return ``num`` as a fixed-point string with ``decimal_places`` decimals.

    The value is formatted, parsed back into a float and formatted once more
    with the same precision; the string from the second pass is returned.
    """
    spec = f".{decimal_places}f"
    first_pass = format(num, spec)
    return format(float(first_pass), spec)

def add_thousand_dots(n_sample):
    """Insert thousands separators into a sample count.

    Despite the name, the separator that is inserted is a comma
    (``"1500"`` -> ``"1,500"``).

    Args:
        n_sample: The count to format. Supported inputs:
            * ``str`` holding an integer, optionally followed by ``.`` and a
              fractional part; only the integer part gets separators, the
              fractional part is copied through verbatim.
            * ``int`` / numpy integer; formatted with separators.
            * ``float`` (this includes ``np.float64``); rounded to one decimal
              and formatted with separators.

    Returns:
        The formatted string for the supported inputs above. Any other value
        (including ``bool`` and ``None``) is returned unchanged.

    Raises:
        ValueError: If a string input is not an integer literal or contains
            more than one ``.``.
    """
    if isinstance(n_sample, str):
        if '.' in n_sample:
            integer_part, decimal_part = n_sample.split('.')
            return f"{int(integer_part):,}.{decimal_part}"
        return f"{int(n_sample):,}"

    # bool is a subclass of int; it is not a count, so pass it through untouched.
    if isinstance(n_sample, bool):
        return n_sample

    # Plain and numpy integers previously fell through unformatted, which made
    # the result unusable in string concatenation at the call sites.
    if isinstance(n_sample, (int, np.integer)):
        return f"{int(n_sample):,}"

    # np.float64 is a float subclass, so this branch covers both.
    if isinstance(n_sample, float):
        return "{:,}".format(round(n_sample, 1))

    return n_sample

In [26]:
def _load_run_results(model, n_real, n_synth, task_name, cond):
    """Read the result JSON of one training run and return the parsed object."""
    json_path = f"../07 train models/results_json/results_{model}_real{n_real}_synth{n_synth}_{task_name}_{cond}.json"
    with open(json_path, 'r') as json_file:
        return json.load(json_file)


def get_test_results(task_name, metric, side, n_real_a, n_synth_a, model_a, cond_a, n_real_b, n_synth_b, model_b, cond_b):
    """Compare two training runs (A vs. B) with a paired significance test.

    Both result files are loaded, the per-split values of ``metric`` are
    extracted, and the paired differences are checked for normality with the
    Shapiro-Wilk test. If normality is not rejected at the 5% level a paired
    t-test is used, otherwise a Wilcoxon signed-rank test.

    Args:
        task_name: Task part of the result file name.
        metric: Key read from every entry of ``"single_split_results"`` and
            from the top level of each result JSON.
        side: Forwarded as ``alternative`` to the scipy test, which receives
            the samples in the order (A, B). With ``"less"`` the alternative
            hypothesis is therefore that A scores lower than B.
        n_real_a, n_synth_a, model_a, cond_a: File name components of run A
            (the baseline in the calls in this notebook).
        n_real_b, n_synth_b, model_b, cond_b: File name components of run B.

    Returns:
        list: ``[label_a, label_b, shapiro_p_value, test_name, p_value,
        score_a, score_b]`` where the labels and scores are strings. The
        scores are the top-level ``metric`` values multiplied by 100 and
        formatted with two decimals (assumes the stored metric is a fraction
        in [0, 1] -- TODO confirm).

    Raises:
        ValueError: If the two runs do not contain the same number of splits,
            since a paired test needs one value per split from each run.
    """
    results_a = _load_run_results(model_a, n_real_a, n_synth_a, task_name, cond_a)
    results_b = _load_run_results(model_b, n_real_b, n_synth_b, task_name, cond_b)

    scores_a = [res[metric] for res in results_a["single_split_results"]]
    scores_b = [res[metric] for res in results_b["single_split_results"]]

    # Guard explicitly: numpy would silently broadcast a single-split run.
    if len(scores_a) != len(scores_b):
        raise ValueError(
            f"Paired test needs equally many splits, got {len(scores_a)} and {len(scores_b)}")

    # Normality check on the paired differences.
    differences = np.array(scores_b) - np.array(scores_a)
    _, p_value_normal = shapiro(differences)

    alpha_normal = 0.05

    if p_value_normal >= alpha_normal:
        # Paired t-test
        test_label = "paired t-test"
        _, p_value = ttest_rel(scores_a, scores_b, alternative=side)
    else:
        # Wilcoxon signed-rank test
        test_label = "Wilcoxon signed-rank test"
        _, p_value = wilcoxon(scores_a, scores_b, alternative=side)

    # Report the sidedness that was actually requested instead of always
    # claiming a one-sided test.
    sidedness = "two-sided" if side == "two-sided" else "one-sided"

    # Both labels use the same thousands formatting for every count.
    label_a = add_thousand_dots(n_real_a) + " real"
    label_b = add_thousand_dots(n_real_b) + " real " + f"+ {add_thousand_dots(n_synth_b)} synth"

    return [
        label_a,
        label_b,
        p_value_normal,
        f"{test_label} ({sidedness})",
        p_value,
        round_number(results_a[metric] * 100, 2),
        round_number(results_b[metric] * 100, 2),
    ]

In [27]:
def holm_corr(p_values):
    """Apply the Holm correction to ``p_values`` and return the adjusted values.

    The result is a plain list with one adjusted p-value per input p-value,
    taken from the second element of what ``multipletests`` returns.
    """
    outcome = multitest.multipletests(p_values, method='holm')
    adjusted = outcome[1]
    return list(adjusted[:len(p_values)])

In [28]:
# LaTeX row layout: real & baseline score & real + synth & score & Shapiro-Wilk p-value & statistical test & p-value & Holm-corrected p-value

In [29]:
def round_number_p(number, n=3):
    """Format a p-value for the results table.

    Args:
        number: The p-value to format.
        n: Number of decimals used when the value is printed numerically
            (i.e. when it is not below 0.01). Defaults to 3.

    Returns:
        str: ``"< .001"`` for values below 0.001, ``"< .01"`` for values below
        0.01, otherwise the value in fixed-point notation with ``n`` decimals.
    """
    if number < 0.001:
        return "< .001"
    if number < 0.01:
        return "< .01"
    # Values at or above 0.01 are all printed the same way; ``n`` controls
    # the precision (it was previously accepted but ignored).
    return f"{number:.{n}f}"

In [30]:
def format_itr_results(split_results, eval_metric):
    """Render per-split scores as a LaTeX subscript list.

    Each score is multiplied by 100 and formatted with two decimals via
    ``round_number``; the values are joined with ", ", wrapped in square
    brackets and placed inside a ``textsubscript`` command.

    Args:
        split_results: Iterable of numeric scores, one per split. An empty
            input yields an empty bracket pair instead of raising.
        eval_metric: Currently unused; kept so existing callers keep working.

    Returns:
        str: The LaTeX snippet.
    """
    formatted_scores = ", ".join(
        round_number(score * 100, 2) for score in split_results)
    return "\\textsubscript{[" + formatted_scores + "]}"

#### ACSA / F1 Micro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `500`

In [31]:
eval_metric = "eval_f1_micro"

# Baseline: 500 real examples. Compared against GPT-3 runs that add
# 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "500", "0", "only_real", "random",
                     "500", n_synth, "GPT-3", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{500 real} & \multirow{3}{*}{84.54}  & 500 real + 500 synth & 86.70 & 0.558 & paired t-test (one-sided) & < .01 & < .01 \\
 &  & 500 real + 1,000 synth & 86.60 & 0.952 & paired t-test (one-sided) & < .01 & < .01 \\
 &  & 500 real + 1,500 synth & 85.94 & 0.115 & paired t-test (one-sided) & < .01 & < .01 \\


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS25: `475, 975, 1,975` vs. real: `500`

In [32]:
eval_metric = "eval_f1_macro"

# Baseline: 500 real examples. Compared against GPT-3 runs ("fixed" condition)
# that add 475 / 975 / 1975 synthetic examples to 25 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "500", "0", "only_real", "random",
                     "25", n_synth, "GPT-3", "fixed")
    for n_synth in ("475", "975", "1975")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{500 real} & \multirow{3}{*}{59.52}  & 25 real + 475 synth & 50.86 & 0.200 & paired t-test (one-sided) & 0.988 & 0.988 \\
 &  & 25 real + 975 synth & 62.56 & 0.178 & paired t-test (one-sided) & 0.172 & 0.344 \\
 &  & 25 real + 1,975 synth & 63.67 & 0.401 & paired t-test (one-sided) & 0.035 & 0.106 \\


#### ACSA / F1 Macro: llama-3-70B + LRS500: `500, 1000, 1,500` vs. real: `500`

In [33]:
eval_metric = "eval_f1_macro"

# Baseline: 500 real examples. Compared against Llama3_70B runs that add
# 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "500", "0", "only_real", "random",
                     "500", n_synth, "Llama3_70B", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{500 real} & \multirow{3}{*}{59.52}  & 500 real + 500 synth & 70.66 & 0.884 & paired t-test (one-sided) & < .001 & < .001 \\
 &  & 500 real + 1,000 synth & 67.96 & 0.011 & Wilcoxon signed-rank test (one-sided) & 0.031 & 0.031 \\
 &  & 500 real + 1,500 synth & 66.52 & 0.076 & paired t-test (one-sided) & < .01 & < .01 \\


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `500`

In [34]:
eval_metric = "eval_f1_macro"

# Baseline: 500 real examples. Compared against GPT-3 runs that add
# 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "500", "0", "only_real", "random",
                     "500", n_synth, "GPT-3", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{500 real} & \multirow{3}{*}{59.52}  & 500 real + 500 synth & 79.00 & 0.734 & paired t-test (one-sided) & < .01 & < .01 \\
 &  & 500 real + 1,000 synth & 78.42 & 0.488 & paired t-test (one-sided) & < .001 & < .001 \\
 &  & 500 real + 1,500 synth & 78.47 & 0.171 & paired t-test (one-sided) & < .001 & < .001 \\


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `1000`

In [35]:
eval_metric = "eval_f1_macro"

# Baseline: 1000 real examples. Compared against GPT-3 runs that add
# 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "1000", "0", "only_real", "random",
                     "500", n_synth, "GPT-3", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{1,000 real} & \multirow{3}{*}{74.64}  & 500 real + 500 synth & 79.00 & 0.634 & paired t-test (one-sided) & 0.117 & 0.204 \\
 &  & 500 real + 1,000 synth & 78.42 & 0.686 & paired t-test (one-sided) & 0.102 & 0.204 \\
 &  & 500 real + 1,500 synth & 78.47 & 0.775 & paired t-test (one-sided) & 0.066 & 0.198 \\


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `2000`

In [36]:
eval_metric = "eval_f1_macro"

# Baseline: 2000 real examples. Compared against GPT-3 runs that add
# 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("aspect_category_sentiment", eval_metric, "less",
                     "2000", "0", "only_real", "random",
                     "500", n_synth, "GPT-3", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{2,000 real} & \multirow{3}{*}{78.86}  & 500 real + 500 synth & 79.00 & 0.870 & paired t-test (one-sided) & 0.480 & 1.000 \\
 &  & 500 real + 1,000 synth & 78.42 & 0.620 & paired t-test (one-sided) & 0.584 & 1.000 \\
 &  & 500 real + 1,500 synth & 78.47 & 0.669 & paired t-test (one-sided) & 0.592 & 1.000 \\


#### E2E-ABSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1,000, 1,500` vs. real: `500`

In [37]:
eval_metric = "eval_f1_macro"

# Baseline: 500 real examples on the end-to-end ABSA task. Compared against
# GPT-3 runs that add 500 / 1000 / 1500 synthetic examples to 500 real ones.
results = [
    get_test_results("end_2_end_absa", eval_metric, "less",
                     "500", "0", "only_real", "random",
                     "500", n_synth, "GPT-3", "random")
    for n_synth in ("500", "1000", "1500")
]

# Holm-adjust the three test p-values as one family.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    # Only the first row carries the \multirow cells for the shared baseline.
    model_baseline_print = (
        "\\multirow{3}{*}{" + res[0] + "} & \\multirow{3}{*}{" + res[5] + "} "
        if res_idx == 0
        else " & "
    )
    print(" ".join([
        model_baseline_print, "&", res[1], "&", res[6], "&",
        round_number_p(res[2], 3), "&", res[3], "&",
        round_number_p(res[4], 3), "&",
        round_number_p(holm_corr_p_values[res_idx], 3), "\\\\",
    ]))

\multirow{3}{*}{500 real} & \multirow{3}{*}{70.07}  & 500 real + 500 synth & 72.27 & 0.479 & paired t-test (one-sided) & 0.020 & 0.059 \\
 &  & 500 real + 1,000 synth & 72.55 & 0.438 & paired t-test (one-sided) & 0.175 & 0.350 \\
 &  & 500 real + 1,500 synth & 71.67 & 0.253 & paired t-test (one-sided) & 0.261 & 0.350 \\
