# Notebook: Compare Model Performance

In [580]:
from scipy.stats import ttest_rel, wilcoxon
from statsmodels.stats import multitest
import matplotlib.pyplot as plt
from scipy.stats import shapiro
from scipy.stats import levene
import statsmodels.api as sm
from scipy import stats
import numpy as np
import json

In [581]:
def get_test_results(task_name, metric, side, n_real_a, n_synth_a, model_a, cond_a, n_real_b, n_synth_b, model_b, cond_b, alpha_normal=0.05):
    """Compare two training configurations with a paired significance test.

    Loads the per-split scores of configuration A and configuration B from
    their result JSON files, tests the per-split differences for normality
    (Shapiro-Wilk) and then runs a paired t-test when normality is not
    rejected, or a Wilcoxon signed-rank test otherwise.

    Args:
        task_name: Task identifier used in both result file names.
        metric: Key read from every entry of ``"single_split_results"``.
        side: Forwarded as ``alternative`` to the paired test, which is
            called with the scores in the order (A, B).
        n_real_a, n_synth_a, model_a, cond_a: File-name components of
            configuration A. Counts may be strings or integers.
        n_real_b, n_synth_b, model_b, cond_b: File-name components of
            configuration B. Counts may be strings or integers.
        alpha_normal: Significance level of the Shapiro-Wilk test; the
            t-test is used when the Shapiro-Wilk p-value is >= this level.

    Returns:
        list: ``[n_real_a, label_b, shapiro_p_value, test_name, p_value]``
        where ``label_b`` describes configuration B and ``test_name`` is
        ``"t-test"`` or ``"Wilcoxon signed-rank test"``.

    Raises:
        ValueError: If the two result files hold a different number of
            splits, because the tests require paired observations.
    """
    scores_a = _load_split_scores(
        f"../07 train models/results_json/results_{model_a}_real{n_real_a}_synth{n_synth_a}_{task_name}_{cond_a}.json",
        metric)
    scores_b = _load_split_scores(
        f"../07 train models/results_json/results_{model_b}_real{n_real_b}_synth{n_synth_b}_{task_name}_{cond_b}.json",
        metric)

    # Without this check a single-element list would silently broadcast in
    # the subtraction below and the normality test would run on unpaired data.
    if len(scores_a) != len(scores_b):
        raise ValueError(
            f"Paired test needs the same number of splits, got {len(scores_a)} and {len(scores_b)}")

    # Normality of the paired differences decides which test is appropriate.
    differences = np.array(scores_b) - np.array(scores_a)
    _, p_value_normal = shapiro(differences)

    if p_value_normal >= alpha_normal:
        # Paired t-test (sidedness given by `side`).
        test_name = "t-test"
        _, p_value = ttest_rel(scores_a, scores_b, alternative=side)
    else:
        # Wilcoxon signed-rank test (sidedness given by `side`).
        test_name = "Wilcoxon signed-rank test"
        _, p_value = wilcoxon(scores_a, scores_b, alternative=side)

    # f-string so that counts passed as integers do not break the label.
    label_b = f"{n_real_b} real + {n_synth_b} synth (generated by {model_b})"
    return [n_real_a, label_b, p_value_normal, test_name, p_value]


def _load_split_scores(json_path, metric):
    """Return the `metric` value of every split stored in a result JSON file."""
    with open(json_path, 'r') as json_file:
        stored_results = json.load(json_file)
    return [split[metric] for split in stored_results["single_split_results"]]

In [582]:
def holm_corr(p_values):
    """Return the p-values adjusted by `multipletests` with ``method='holm'``.

    The adjusted values are returned as a plain list in the same order as
    the input `p_values`.
    """
    # multipletests returns (reject, corrected p-values, ...); only the
    # corrected p-values at index 1 are needed here.
    adjusted = multitest.multipletests(p_values, method='holm')[1]
    return [adjusted[position] for position, _ in enumerate(p_values)]

In [583]:
# real & real + synth & shapiro p-value & statistical test & p-value & p-value-corrected-holm

#### ACSA: GPT-3.5-turbo + LRS25: `475, 975, 1,975` vs. real: `500`

In [585]:
# One comparison per synthetic-set size: 500 real examples versus
# 25 real examples plus GPT-3 generated examples.
results = [
    get_test_results("aspect_category_sentiment", "eval_f1_macro", "less",
                     "500", "0", "only_real", "random",
                     "25", synth_size, "GPT-3", "fixed")
    for synth_size in ("475", "975", "1975")
]

# Adjust the test p-values (column 4 of each row) for multiple comparisons.
p_values = [row[4] for row in results]
holm_corr_p_values = holm_corr(p_values)

# Emit one "&"-separated table row per comparison.
for res, corrected_p in zip(results, holm_corr_p_values):
    print(res[0], res[1], round(res[2], 3), res[3], round(res[4], 3), round(corrected_p, 3), sep=" & ")

500 & 25 real + 475 synth (generated by GPT-3) & 0.147 & t-test & 0.989 & 0.989
500 & 25 real + 975 synth (generated by GPT-3) & 0.173 & t-test & 0.159 & 0.318
500 & 25 real + 1975 synth (generated by GPT-3) & 0.464 & t-test & 0.031 & 0.094
