# Notebook: Compare Model Performance

In [629]:
from scipy.stats import ttest_rel, wilcoxon
from statsmodels.stats import multitest
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import levene
import statsmodels.api as sm
from scipy import stats
import numpy as np
import json

In [630]:
def _load_split_scores(results_dir, model, n_real, n_synth, task_name, cond, metric):
    """Read one results JSON file and return the `metric` value of every split."""
    json_path = f"{results_dir}/results_{model}_real{n_real}_synth{n_synth}_{task_name}_{cond}.json"
    with open(json_path, 'r') as json_file:
        results = json.load(json_file)
    return [res[metric] for res in results["single_split_results"]]


def get_test_results(task_name, metric, side, n_real_a, n_synth_a, model_a, cond_a,
                     n_real_b, n_synth_b, model_b, cond_b,
                     alpha_normal=0.05, results_dir="../07 train models/results_json"):
    """Run a paired significance test between two training configurations.

    The per-split values of `metric` are loaded for configuration A and
    configuration B from
    ``{results_dir}/results_{model}_real{n_real}_synth{n_synth}_{task_name}_{cond}.json``.
    A Shapiro test on the paired differences (B - A) decides which test is
    used: a paired t-test when its p-value is >= `alpha_normal`, otherwise a
    Wilcoxon signed-rank test.

    Parameters
    ----------
    task_name, metric : str
        Task name used in the file name and the key read from every entry of
        ``"single_split_results"``.
    side : str
        Passed unchanged as `alternative` to the selected test, with the
        scores of A as the first sample and those of B as the second.
    n_real_a, n_synth_a, model_a, cond_a
        File-name components of configuration A (str or int).
    n_real_b, n_synth_b, model_b, cond_b
        File-name components of configuration B (str or int).
    alpha_normal : float, optional
        Threshold applied to the Shapiro p-value (default 0.05).
    results_dir : str, optional
        Directory that contains the results JSON files.

    Returns
    -------
    list
        ``[n_real_a, label_b, shapiro_p_value, test_name, p_value]`` where
        `label_b` is ``"<n_real_b> real + <n_synth_b> synth (generated by <model_b>)"``
        and `test_name` is ``"t-test"`` or ``"Wilcoxon signed-rank test"``.

    Raises
    ------
    ValueError
        If the two result files do not contain the same number of splits.
    """
    scores_a = _load_split_scores(
        results_dir, model_a, n_real_a, n_synth_a, task_name, cond_a, metric)
    scores_b = _load_split_scores(
        results_dir, model_b, n_real_b, n_synth_b, task_name, cond_b, metric)

    # Paired tests need one score per split on both sides.
    if len(scores_a) != len(scores_b):
        raise ValueError(
            f"Paired test needs the same number of splits, got {len(scores_a)} and {len(scores_b)}")

    # Normality check on the paired differences
    differences = np.array(scores_b) - np.array(scores_a)
    _, p_value_normal = shapiro(differences)

    if p_value_normal >= alpha_normal:
        # Paired t-test; the alternative is given by `side`
        test_name = "t-test"
        _, p_value = ttest_rel(scores_a, scores_b, alternative=side)
    else:
        # Wilcoxon signed-rank test; the alternative is given by `side`
        test_name = "Wilcoxon signed-rank test"
        _, p_value = wilcoxon(scores_a, scores_b, alternative=side)

    # An f-string keeps the label working when the sizes are passed as ints.
    label_b = f"{n_real_b} real + {n_synth_b} synth (generated by {model_b})"
    return [n_real_a, label_b, p_value_normal, test_name, p_value]

In [631]:
def holm_corr(p_values):
    """Apply the Holm correction to `p_values` and return the corrected values.

    The corrected p-values are taken from the second element returned by
    ``multitest.multipletests(..., method='holm')`` and handed back as a plain
    list with one entry per input p-value.
    """
    adjusted = multitest.multipletests(p_values, method='holm')[1]
    return list(adjusted)

In [632]:
# Table columns: real & real + synth & Shapiro-Wilk p-value & statistical test & p-value & Holm-corrected p-value

In [633]:
def round_number(number, n=3):
    """Format a p-value as a string for the results table.

    Values below 0.001 become "< .001", values below 0.01 become "< .01",
    and everything else is printed with two decimal places.

    `n` is accepted for compatibility with existing callers but is not used.
    """
    # Guard clauses, checked from the smallest threshold upwards.
    if number < 0.001:
        return "< .001"
    if number < 0.01:
        return "< .01"
    return f"{number:.2f}"

#### ACSA / F1 Macro: GPT-3.5-turbo + LRS25: `475, 975, 1,975` vs. real: `500`

In [634]:
# Baseline: 500 real examples. Compared against 25 real examples plus
# 475 / 975 / 1975 synthetic examples, on the macro F1 of the ACSA task.
results = [
    get_test_results(
        "aspect_category_sentiment", "eval_f1_macro", "less",
        "500", "0", "only_real", "random",
        "25", n_synth, "GPT-3", "fixed",
    )
    for n_synth in ("475", "975", "1975")
]

# Index 4 of every result row holds the uncorrected p-value of the chosen test.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

# One "&"-separated table row per comparison.
for res_idx, res in enumerate(results):
    print(
        res[0], res[1], round_number(res[2], 3), res[3],
        round_number(res[4], 3), round_number(holm_corr_p_values[res_idx], 3),
        sep=" & ",
    )

500 & 25 real + 475 synth (generated by GPT-3) & 0.15 & t-test & 0.99 & 0.99
500 & 25 real + 975 synth (generated by GPT-3) & 0.17 & t-test & 0.16 & 0.32
500 & 25 real + 1975 synth (generated by GPT-3) & 0.46 & t-test & 0.03 & 0.09


#### ACSA / F1 Micro: GPT-3.5-turbo + LRS500: `500, 1,000, 1,500` vs. real: `500`

In [635]:
# Baseline: 500 real examples. Compared against 500 real examples plus
# 500 / 1000 / 1500 synthetic examples, on the micro F1 of the ACSA task.
results = [
    get_test_results(
        "aspect_category_sentiment", "eval_f1_micro", "less",
        "500", "0", "only_real", "random",
        "500", n_synth, "GPT-3", "random",
    )
    for n_synth in ("500", "1000", "1500")
]

# Index 4 of every result row holds the uncorrected p-value of the chosen test.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

# One "&"-separated table row per comparison.
for res_idx, res in enumerate(results):
    print(
        res[0], res[1], round_number(res[2], 3), res[3],
        round_number(res[4], 3), round_number(holm_corr_p_values[res_idx], 3),
        sep=" & ",
    )

500 & 500 real + 500 synth (generated by GPT-3) & 0.56 & t-test & < .01 & < .01
500 & 500 real + 1000 synth (generated by GPT-3) & 0.95 & t-test & < .01 & < .01
500 & 500 real + 1500 synth (generated by GPT-3) & 0.12 & t-test & < .01 & < .01


#### ACSA / F1 Macro: GPT-3.5-turbo + LRS500: `500, 1,000, 1,500` vs. real: `500`

In [636]:
# Baseline: 500 real examples. Compared against 500 real examples plus
# 500 / 1000 / 1500 synthetic examples, on the macro F1 of the ACSA task.
results = [
    get_test_results(
        "aspect_category_sentiment", "eval_f1_macro", "less",
        "500", "0", "only_real", "random",
        "500", n_synth, "GPT-3", "random",
    )
    for n_synth in ("500", "1000", "1500")
]

# Index 4 of every result row holds the uncorrected p-value of the chosen test.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

# One "&"-separated table row per comparison.
for res_idx, res in enumerate(results):
    print(
        res[0], res[1], round_number(res[2], 3), res[3],
        round_number(res[4], 3), round_number(holm_corr_p_values[res_idx], 3),
        sep=" & ",
    )

500 & 500 real + 500 synth (generated by GPT-3) & 0.61 & t-test & < .01 & < .01
500 & 500 real + 1000 synth (generated by GPT-3) & 0.63 & t-test & < .001 & < .001
500 & 500 real + 1500 synth (generated by GPT-3) & 0.11 & t-test & < .001 & < .001
