# Notebook: Compare Model Performance

In [843]:
from scipy.stats import ttest_rel, wilcoxon
from statsmodels.stats import multitest
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import levene
import statsmodels.api as sm
from scipy import stats
import numpy as np
import json

In [844]:
def get_test_results(task_name, metric, side, n_real_a, n_synth_a, model_a, cond_a, n_real_b, n_synth_b, model_b, cond_b):
    json_path_only_real = f"../07 train models/results_json/results_{model_a}_real{n_real_a}_synth{n_synth_a}_{task_name}_{cond_a}.json"
    json_path_with_synth = f"../07 train models/results_json/results_{model_b}_real{n_real_b}_synth{n_synth_b}_{task_name}_{cond_b}.json"

    with open(json_path_only_real, 'r') as json_file:
        results_only_real = json.load(json_file)

    with open(json_path_with_synth, 'r') as json_file:
        results_with_synth = json.load(json_file)

    only_real_f1 = [res[metric]
                    for res in results_only_real["single_split_results"]]
    with_synth_f1 = [res[metric]
                     for res in results_with_synth["single_split_results"]]

    # Normal Distribution

    differences = np.array(with_synth_f1) - np.array(only_real_f1)
    statistic, p_value_normal = shapiro(differences)

    alpha_normal = 0.05

    if p_value_normal >= alpha_normal:
        # Gepaarter t-Test (einseitig)
        t_statistic, p_value = ttest_rel(
            only_real_f1, with_synth_f1, alternative=side)

        return [n_real_a, n_real_b + " real " + f"+ {n_synth_b} synth " + f"(generated by {model_b})", p_value_normal, "t-test", p_value, only_real_f1, with_synth_f1]
    else:
        # Wilcoxon Signed-Rank Test (einseitig)
        statistic, p_value = wilcoxon(
            only_real_f1, with_synth_f1, alternative=side)
        return [n_real_a, n_real_b + " real " + f"+ {n_synth_b} synth " + f"(generated by {model_b})", p_value_normal, "Wilcoxon signed-rank test", p_value, only_real_f1, with_synth_f1]

In [845]:
def holm_corr(p_values):
    reject, corrected_p_values, _, _ = multitest.multipletests(
        p_values, method='holm')
    return [corrected_p_values[i] for i in range(len(p_values))]

In [846]:
# real & real + synth & p-value & shapiro p-value & statistical test & p-value & p-value-corrected-bonferoni 

In [847]:
def round_number_p(number, n=3):
    if number < 0.001:
        result = "< .001"
    elif number < 0.01:
        result = "< .01"
    elif number < 0.05:
        result = f"{number:.3f}"
    else:
        result = f"{number:.3f}"

    return result

In [848]:
def round_number(num, decimal_places):
    formatted_num = "{:.{}f}".format(num, decimal_places)
    rounded_num_str = "{:.{}f}".format(float(formatted_num), decimal_places)
    return rounded_num_str

def add_thousand_dots(n_sample):
    if isinstance(n_sample, str):
        if '.' in n_sample:
            integer_part, decimal_part = n_sample.split('.')
            formatted_integer_part = "{:,}".format(int(integer_part))
            result = f"{formatted_integer_part}.{decimal_part}"
        else:
            result = "{:,}".format(int(n_sample))
    elif isinstance(n_sample, np.float64):
        result = "{:,}".format(round(n_sample, 1))
    else:
        result = n_sample
    
    return result

In [849]:
def format_itr_results(split_results, eval_metric):
    if eval_metric == "eval_f1_macro":
        arr = "F1 Macro Scores: "
    else:
        arr = "F1 Micro Scores: "

    arr += "[" + round_number(split_results[0] * 100, 2)
    for v in split_results[1:]:
        arr += ", " + round_number(v * 100, 2)
    return "\\textsubscript{" + arr + "]" + "}"

#### ACSA / F1 Micro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `500`

In [850]:
eval_metric = "eval_f1_micro"

results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "500", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "1000", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "1500", "GPT-3", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

500\textsubscript{F1 Micro Scores: [84.01, 85.96, 84.05, 86.26, 83.53, 83.43]} & 500 real + 500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Micro Scores: [85.93, 89.34, 86.12, 89.54, 84.31, 84.96]}} & 0.558 & t-test & < .01 & < .01
500\textsubscript{F1 Micro Scores: [84.01, 85.96, 84.05, 86.26, 83.53, 83.43]} & 500 real + 1000 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Micro Scores: [86.55, 88.01, 85.69, 88.28, 84.28, 86.77]}} & 0.952 & t-test & < .01 & < .01
500\textsubscript{F1 Micro Scores: [84.01, 85.96, 84.05, 86.26, 83.53, 83.43]} & 500 real + 1500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Micro Scores: [86.19, 87.47, 86.05, 87.45, 83.21, 85.27]}} & 0.115 & t-test & < .01 & < .01


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS25: `475, 975, 1,975` vs. real: `500`

In [851]:
eval_metric = "eval_f1_macro"

results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "25", "475", "GPT-3", "fixed"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "25", "975", "GPT-3", "fixed"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "25", "1975", "GPT-3", "fixed")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 25 real + 475 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [50.04, 57.77, 43.50, 54.08, 38.14, 52.19]}} & 0.147 & t-test & 0.989 & 0.989
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 25 real + 975 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [60.55, 68.68, 54.12, 64.35, 48.33, 67.56]}} & 0.173 & t-test & 0.159 & 0.318
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 25 real + 1975 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [60.71, 67.63, 61.37, 65.91, 50.59, 63.68]}} & 0.464 & t-test & 0.031 & 0.094


#### ACSA / F1 Macro: Llama-2-70B + LRS500: `500, 1000, 1,500` vs. real: `500`

In [852]:
eval_metric = "eval_f1_macro"
results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "500", "Llama70B", "random"),
           get_test_results("aspect_category_sentiment", eval_metric, "less", "500",
                            "0", "only_real", "random", "500", "1000", "Llama70B", "random"),
           get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "1500", "Llama70B", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 500 synth (generated by Llama70B)\textsubscript{\textsubscript{F1 Macro Scores: [59.20, 69.32, 59.94, 68.39, 65.00, 67.69]}} & 0.042 & Wilcoxon signed-rank test & 0.016 & 0.031
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 1000 synth (generated by Llama70B)\textsubscript{\textsubscript{F1 Macro Scores: [63.09, 68.27, 60.75, 68.45, 57.40, 63.20]}} & 0.977 & t-test & < .01 & 0.011
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 1500 synth (generated by Llama70B)\textsubscript{\textsubscript{F1 Macro Scores: [62.34, 65.21, 56.19, 65.67, 58.28, 62.84]}} & 0.020 & Wilcoxon signed-rank test & 0.031 & 0.031


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `500`

In [853]:
eval_metric = "eval_f1_macro"
results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "500", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment",eval_metric, "less", "500", "0", "only_real", "random", "500", "1000", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "500", "0", "only_real", "random", "500", "1500", "GPT-3", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [73.66, 85.70, 66.25, 87.14, 64.39, 81.68]}} & 0.608 & t-test & < .01 & < .01
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 1000 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [75.79, 84.99, 68.17, 81.40, 66.66, 77.84]}} & 0.626 & t-test & < .001 & < .001
500\textsubscript{F1 Macro Scores: [57.02, 59.83, 59.52, 57.59, 53.56, 57.27]} & 500 real + 1500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [72.24, 83.79, 73.18, 79.85, 66.32, 80.07]}} & 0.114 & t-test & < .001 & < .001


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `1000`

In [854]:
eval_metric = "eval_f1_macro"
results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "1000", "0", "only_real", "random", "500", "500", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "1000", "0", "only_real", "random", "500", "1000", "GPT-3", "random"), 
get_test_results("aspect_category_sentiment", eval_metric, "less", "1000", "0", "only_real", "random", "500", "1500", "GPT-3", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

1000\textsubscript{F1 Macro Scores: [68.30, 72.57, 67.98, 74.35, 70.28, 78.66]} & 500 real + 500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [73.66, 85.70, 66.25, 87.14, 64.39, 81.68]}} & 0.589 & t-test & 0.107 & 0.191
1000\textsubscript{F1 Macro Scores: [68.30, 72.57, 67.98, 74.35, 70.28, 78.66]} & 500 real + 1000 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [75.79, 84.99, 68.17, 81.40, 66.66, 77.84]}} & 0.616 & t-test & 0.095 & 0.191
1000\textsubscript{F1 Macro Scores: [68.30, 72.57, 67.98, 74.35, 70.28, 78.66]} & 500 real + 1500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [72.24, 83.79, 73.18, 79.85, 66.32, 80.07]}} & 0.857 & t-test & 0.058 & 0.174


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1000, 1,500` vs. real: `2000`

In [855]:
eval_metric = "eval_f1_macro"
results = [get_test_results("aspect_category_sentiment", eval_metric, "less", "2000", "0", "only_real", "random", "500", "500", "GPT-3", "random"),
           get_test_results("aspect_category_sentiment", eval_metric, "less",
                            "2000", "0", "only_real", "random", "500", "1000", "GPT-3", "random"),
           get_test_results("aspect_category_sentiment", eval_metric, "less", "2000", "0", "only_real", "random", "500", "1500", "GPT-3", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

2000\textsubscript{F1 Macro Scores: [71.78, 84.24, 69.97, 76.09, 72.13, 82.70]} & 500 real + 500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [73.66, 85.70, 66.25, 87.14, 64.39, 81.68]}} & 0.765 & t-test & 0.453 & 1.000
2000\textsubscript{F1 Macro Scores: [71.78, 84.24, 69.97, 76.09, 72.13, 82.70]} & 500 real + 1000 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [75.79, 84.99, 68.17, 81.40, 66.66, 77.84]}} & 0.523 & t-test & 0.570 & 1.000
2000\textsubscript{F1 Macro Scores: [71.78, 84.24, 69.97, 76.09, 72.13, 82.70]} & 500 real + 1500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [72.24, 83.79, 73.18, 79.85, 66.32, 80.07]}} & 0.746 & t-test & 0.563 & 1.000


In [857]:
eval_metric = "eval_f1_macro"
results = [get_test_results("end_2_end_absa", eval_metric, "less", "500", "0", "only_real", "random", "500", "500", "GPT-3", "random"), 
get_test_results("end_2_end_absa", eval_metric, "less", "500", "0", "only_real", "random", "500", "1000", "GPT-3", "random"), 
get_test_results("end_2_end_absa", eval_metric, "less", "500", "0", "only_real", "random", "500", "1500", "GPT-3", "random")]

p_values = [p[4] for p in results]
holm_corr_p_values = holm_corr(p_values)

for res_idx, res in enumerate(results):
    print(res[0] + format_itr_results(res[5], eval_metric), "&", res[1] + "\\textsubscript{"+format_itr_results(res[6], eval_metric)+"}", "&", round_number_p(
        res[2], 3), "&", res[3], "&", round_number_p(res[4], 3), "&", round_number_p(holm_corr_p_values[res_idx], 3))

500\textsubscript{F1 Macro Scores: [73.87, 68.64, 72.94, 60.53, 68.64, 75.80]} & 500 real + 500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [73.98, 69.49, 74.57, 66.22, 71.40, 77.99]}} & 0.479 & t-test & 0.020 & 0.059
500\textsubscript{F1 Macro Scores: [73.87, 68.64, 72.94, 60.53, 68.64, 75.80]} & 500 real + 1000 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [71.82, 73.16, 69.42, 73.49, 69.42, 77.98]}} & 0.438 & t-test & 0.175 & 0.350
500\textsubscript{F1 Macro Scores: [73.87, 68.64, 72.94, 60.53, 68.64, 75.80]} & 500 real + 1500 synth (generated by GPT-3)\textsubscript{\textsubscript{F1 Macro Scores: [67.89, 69.03, 74.98, 72.15, 69.92, 76.05]}} & 0.253 & t-test & 0.261 & 0.350
