In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import glob
from scipy.stats import ttest_ind

from tqdm.notebook import tqdm

In [None]:
# Result files follow the naming scheme
#   ./data/result_<T>t_vs_<F>f[_kelex]_15attempts_bsbbert[_2].tsv
# where <T> / <F> are the numbers of true / false examples. The two optional
# suffixes select one of four subsets; each subset is loaded into its own dict.
suffixes_1 = ["", "_2"]
suffixes_2 = ["", "_kelex"]


def parse_example_counts(file_path):
    """Return ``(num_true, num_false)`` as encoded in a result file name.

    Only the file name is inspected (the directory part is dropped first), so
    underscores in directory names cannot shift the fields. The complete
    number in front of the ``t`` / ``f`` marker is read, so multi-digit counts
    such as ``10t`` are parsed as 10 rather than being truncated to 1.

    Raises:
        ValueError: if the expected fields do not contain an integer.
        IndexError: if the file name has fewer ``_``-separated fields than expected.
    """
    # Normalise Windows separators so the basename split works on any platform.
    file_name = file_path.replace("\\", "/").rsplit("/", 1)[-1]
    fields = file_name.split("_")  # ["result", "<T>t", "vs", "<F>f", ...]
    return int(fields[1].rstrip("t")), int(fields[3].rstrip("f"))


# dataframes[(suffix_1, suffix_2)][(num_true, num_false)] -> DataFrame of one file
dataframes = {}

for suffix_1 in suffixes_1:
    for suffix_2 in suffixes_2:
        subset_pattern = f"./data/result_*t_vs_*f{suffix_2}_15attempts_bsbbert{suffix_1}.tsv"
        # Get a list of files for the current subset pattern:
        subset_files = glob.glob(subset_pattern)
        subset_dataframes = {}
        for file_path in subset_files:
            num_true, num_false = parse_example_counts(file_path)

            # The files are tab-separated and use a decimal comma.
            df = pd.read_csv(file_path, sep='\t', decimal=",", header=0)
            subset_dataframes[(num_true, num_false)] = df
        dataframes[(suffix_1, suffix_2)] = subset_dataframes

In [None]:
# Spot check: display the table loaded for the (num_true=1, num_false=1) file of
# the subset where both suffixes are empty (suffix_1="" and suffix_2="").
dataframes[("", "")][(1, 1)]

In [None]:
# Colormaps and fonts shared by the heatmaps.
colors_accuracy = "Blues"
colors_f1 = "YlOrBr"
valuefont = {'fontname': 'Libertinus Serif', 'fontweight': 'heavy'}
captionfont = {'fontname': 'Libertinus Serif'}

metrics = [("accuracy", colors_accuracy), ("f1", colors_f1)]

# subset_matrices[suffix_1] is a list with one entry per suffix_2 (in the order
# of suffixes_2); each entry maps a metric name to the list of per-configuration
# value arrays, ordered by (num_true, num_false).
subset_matrices = {}

for suffix_1 in suffixes_1:
    subset_matrices[suffix_1] = []
    for suffix_2 in suffixes_2:
        subset_dataframes = dataframes[(suffix_1, suffix_2)]

        # Sorting the (num_true, num_false) keys yields row-major order with
        # num_true as the row index, which is what the reshape below relies on.
        sorted_keys = sorted(subset_dataframes)
        true_labels = sorted({num_true for num_true, _ in sorted_keys})
        false_labels = sorted({num_false for _, num_false in sorted_keys})
        grid_shape = (len(true_labels), len(false_labels))

        # When zero-example configurations are present, a placeholder is
        # inserted for the (0, 0) cell so the grid stays rectangular.
        with_zeros = (0, 1) in subset_dataframes or (1, 0) in subset_dataframes

        metric_matrices = {}
        for metric, cmap in metrics:
            matrices = [subset_dataframes[key][metric].values for key in tqdm(sorted_keys)]
            if with_zeros:
                matrices = [np.zeros_like(matrices[0])] + matrices
            metric_matrices[metric] = matrices

            # Fail with a readable message instead of an opaque reshape error
            # when some (num_true, num_false) combination is missing.
            if len(matrices) != grid_shape[0] * grid_shape[1]:
                raise ValueError(
                    f"Incomplete grid for suffixes ({suffix_1!r}, {suffix_2!r}): expected "
                    f"{grid_shape[0]}x{grid_shape[1]} configurations, got {len(matrices)}."
                )

            # One mean per configuration (NaNs ignored), arranged as true x false.
            average_matrix = np.array(
                [np.nanmean(values) for values in matrices]
            ).reshape(grid_shape)

            fig, ax = plt.subplots(1, 1, figsize=(5, 5))
            ax.ticklabel_format(useLocale=True)

            ax.imshow(average_matrix, cmap=cmap, vmin=0)
            ax.invert_yaxis()  # put the smallest num_true at the bottom

            for m in range(average_matrix.shape[0]):
                for n in range(average_matrix.shape[1]):
                    if with_zeros and m == n == 0:
                        # Placeholder cell: there is no data for zero examples of both kinds.
                        ax.text(0, 0, "—", ha='center', va='center', color='xkcd:almost black', **valuefont)
                    else:
                        value = average_matrix[m, n]
                        # Dark text on light cells, white text on dark cells.
                        # NOTE(review): the 0.4 cut-off is an absolute value while the
                        # colormap is scaled to the data maximum - confirm it still reads well.
                        ax.text(n, m, f"{value:.2%}", ha='center', va='center', color='xkcd:almost black' if value < 0.4 else 'white', **valuefont)

            ax.set_xticks(range(average_matrix.shape[1]))
            ax.set_xticklabels(false_labels, **captionfont)

            ax.set_yticks(range(average_matrix.shape[0]))
            ax.set_yticklabels(true_labels, **captionfont)
            ax.set_xlabel("Inkorrekte Beispiele", **captionfont)
            ax.set_ylabel("Korrekte Beispiele", **captionfont)

            # Save before showing, then release the figure: several figures are
            # created in this loop and none of them is needed afterwards.
            plt.tight_layout()
            fig.savefig(f"eval_bert{suffix_2}{suffix_1}_{metric}.pdf", bbox_inches='tight')
            plt.show()
            plt.close(fig)
        subset_matrices[suffix_1].append(metric_matrices)

In [None]:
# Significance level for the per-configuration tests.
alpha = 0.05
# valuefont / captionfont are the font settings defined in the heatmap cell above.

for suffix_1, metric_list in subset_matrices.items():
    # NOTE(review): hard-coded assumption that only the "_2" runs lack the
    # zero-example configurations; the heatmap cell derives this from the data.
    # Confirm both stay in sync, otherwise the (0, 0) cell and tick labels are wrong.
    with_zeros = suffix_1 != "_2"
    for metric in metric_list[0].keys():
        colors = colors_accuracy if metric == "accuracy" else colors_f1
        # One list of per-configuration value arrays per suffix_2 condition.
        matrix1, matrix2 = tuple(matrix[metric] for matrix in metric_list)

        # zip() would silently drop trailing configurations and misalign the grid.
        if len(matrix1) != len(matrix2):
            raise ValueError(
                f"Conditions differ in number of configurations for suffix_1={suffix_1!r}, "
                f"metric={metric!r}: {len(matrix1)} vs {len(matrix2)}."
            )

        t_matrix = []
        p_matrix = []
        sig_matrix = []
        for arr1, arr2 in zip(matrix1, matrix2):
            # Remove NaN values
            arr1_no_nan = arr1[~np.isnan(arr1)]
            arr2_no_nan = arr2[~np.isnan(arr2)]

            # Welch's t-test: two independent samples, unequal variances allowed.
            # (This is NOT a paired test; the samples may differ in length.)
            t_statistic, p_value = ttest_ind(arr1_no_nan, arr2_no_nan, equal_var=False)

            t_matrix.append(t_statistic)
            p_matrix.append(p_value)
            # 1 = significant, 0 = not significant (a NaN p-value compares False -> 0).
            sig_matrix.append(int(p_value < alpha))

            print(f"T-test results for suffix_1='{suffix_1}', metric='{metric}':")
            print(f"T-statistic: {t_statistic}")
            print(f"P-value: {p_value}")

            # Check the significance based on the p-value
            if p_value < alpha:
                print("The difference is statistically significant.")
            else:
                print("The difference is not statistically significant.")

            print("\n")

        # The configurations are laid out on a square true x false grid.
        side = int(round(np.sqrt(len(t_matrix))))
        if side * side != len(t_matrix):
            raise ValueError(
                f"Cannot arrange {len(t_matrix)} configurations on a square grid "
                f"(suffix_1={suffix_1!r}, metric={metric!r})."
            )
        t_matrix = np.array(t_matrix).reshape((side, side))
        p_matrix = np.array(p_matrix).reshape((side, side))
        sig_matrix = np.array(sig_matrix).reshape((side, side))

        fig, ax = plt.subplots(1, 1, figsize=(5, 5))

        # vmax=1.5 keeps significant cells below the darkest end of the colormap.
        ax.imshow(sig_matrix, cmap=colors, vmin=0, vmax=1.5)
        ax.invert_yaxis()

        for m in range(side):
            for n in range(side):
                if with_zeros and m == n == 0:
                    # Placeholder cell: no data for zero examples of both kinds.
                    ax.text(0, 0, "—", ha='center', va='center', color='xkcd:almost black', **valuefont)
                else:
                    t = t_matrix[m, n]
                    p = p_matrix[m, n]
                    if np.isnan(t) or np.isnan(p):
                        # The test produced no usable statistic for this cell.
                        ax.text(n, m, "—*", ha='center', va='center', color='xkcd:almost black', **valuefont)
                    else:
                        # White text on the dark (significant) cells, dark text otherwise.
                        text_color = 'white' if sig_matrix[m, n] else 'xkcd:almost black'
                        ax.text(n, m, f"{t:.2f}\n({p:.1e})", ha='center', va='center', color=text_color, **valuefont)

        # Tick labels are example counts: they start at 0 when the
        # zero-example configurations are part of the grid, else at 1.
        first_label = 0 if with_zeros else 1
        tick_labels = range(first_label, first_label + side)
        ax.set_xticks(range(side))
        ax.set_yticks(range(side))
        ax.set_xticklabels(tick_labels, **captionfont)
        ax.set_yticklabels(tick_labels, **captionfont)
        ax.set_xlabel("Inkorrekte Beispiele", **captionfont)
        ax.set_ylabel("Korrekte Beispiele", **captionfont)

        # Save before showing, then release the figure created in this iteration.
        plt.tight_layout()
        fig.savefig(f"eval_bert{suffix_1}_significance_{metric}.pdf", bbox_inches='tight')
        plt.show()
        plt.close(fig)
        print()