In [1]:
import scipy
from pathlib import Path
from utils.retrieve_stats import *

def get_best_stats(path, avoid_premature = 10):
    folders = list((Path.cwd() / "results" / path).glob("*"))
    results = [getstats(i) for i in folders]
    bestepochs = [find_bestepoch(i, avoid_premature=avoid_premature) for i in results]
    collective = {}

    for d, bestepoch in zip(results, bestepochs):
        for idx in ["epochwisestats", "aucstats"]:
            for key in d[idx]:
                if key not in collective.keys():
                    collective[key] = []
                collective[key].append(d[idx][key][bestepoch])        
    return collective

subsymbolic = {"sceptr": None, "tcr-bert": None}
subsymbolic = {key: get_best_stats(key) for key in subsymbolic.keys()}
symbolic = {"kidera": None, "atchley": None, "aaprop": None, "random": None}
symbolic = {key: get_best_stats(f"symbolic/{key}") for key in symbolic.keys()}
alltables = subsymbolic.copy()
alltables.update(symbolic)
keys = ["train-loss.csv", "train-acc.csv", "train-preds.csv", "test-loss.csv", "test-acc.csv", "test-preds.csv"]

In [60]:
from IPython.display import display
import pandas as pd
import numpy as np

def dataframe_to_latex(df, label):
    # Start building the LaTeX table format
    latex_table = "\\begin{table}[!h]\n    \\centering\n    \\begin{tabular}{"
    
    # Add column alignment specifiers based on the number of columns in the DataFrame
    latex_table += "l|"+ "c" * len(df.columns) + "}\n"    
    latex_table += "          & " + " & ".join(df.columns.astype(str)) + " \\\\\\hline\n"
    # Add rows from the DataFrame
    for index, row in df.iterrows():
        rw = np.round(row, 3).astype(str).replace("0.0", "$\\approx 0$").tolist()
        rw = [index] + rw
        latex_table += "         " + " & ".join(rw) + " \\\\\n"
    name = " ".join(key.replace(".csv", "").split("-")).title()
    # Finish the LaTeX table format
    latex_table += "    \\end{tabular}\n"
    latex_table += f"    \\caption{{T-Test $p$-values for {name}}}\n"
    latex_table += f"    \\label{{tab:ttest-{label}}}\n"
    latex_table += "\\end{table}"
    return latex_table

for key in keys:
    tstatistic = {}
    for ssmethod, ssvals in alltables.items():
        tstatistic[ssmethod] = {}
        for sym, symvals in alltables.items():
            if "loss" in key:
                t = scipy.stats.ttest_ind(symvals[key], ssvals[key])
            else:
                t = scipy.stats.ttest_ind(ssvals[key], symvals[key])
            if t.statistic > 0:
                tstatistic[ssmethod][sym] = t.pvalue
            else:
                tstatistic[ssmethod][sym] = 1 - t.pvalue

    df = pd.DataFrame(tstatistic)
    df.columns = ["SCEPTR", "TCR-BERT", "Kidera", "Atchley", "AAProp", "Random"]
    df.index = ["SCEPTR", "TCR-BERT", "Kidera", "Atchley", "AAProp", "Random"]
    print (dataframe_to_latex(df, key))

\begin{table}[!h]
    \centering
    \begin{tabular}{l|cccccc}
          & SCEPTR & TCR-BERT & Kidera & Atchley & AAProp & Random \\\hline
         SCEPTR & $\approx 0$ & 0.223 & 1.0 & 1.0 & 1.0 & 1.0 \\
         TCR-BERT & 0.777 & $\approx 0$ & 1.0 & 1.0 & 1.0 & 1.0 \\
         Kidera & $\approx 0$ & $\approx 0$ & $\approx 0$ & 0.857 & 0.314 & 0.89 \\
         Atchley & $\approx 0$ & $\approx 0$ & 0.143 & $\approx 0$ & 0.222 & 0.331 \\
         AAProp & $\approx 0$ & $\approx 0$ & 0.686 & 0.778 & $\approx 0$ & 0.787 \\
         Random & $\approx 0$ & $\approx 0$ & 0.11 & 0.669 & 0.213 & $\approx 0$ \\
    \end{tabular}
    \caption{T-Test $p$-values for Train Loss}
    \label{tab:ttest-train-loss.csv}
\end{table}
\begin{table}[!h]
    \centering
    \begin{tabular}{l|cccccc}
          & SCEPTR & TCR-BERT & Kidera & Atchley & AAProp & Random \\\hline
         SCEPTR & $\approx 0$ & 0.802 & 1.0 & 1.0 & 1.0 & 1.0 \\
         TCR-BERT & 0.198 & $\approx 0$ & 0.998 & 0.97 & 0.989 & 0.999 \