In [1]:
from pathlib import Path

import scipy
import scipy.stats

from utils.retrieve_stats import *

def get_best_stats(path, avoid_premature=10):
    """Gather every tracked statistic at each run's selected epoch.

    Every entry found directly under ``<cwd>/results/<path>`` is handed to
    ``getstats``; ``find_bestepoch`` then picks the epoch to sample for that
    run.

    Args:
        path: Location of the results to read, relative to ``<cwd>/results``.
        avoid_premature: Forwarded unchanged to ``find_bestepoch``.

    Returns:
        dict: Maps each key found in a run's ``"epochwisestats"`` and
        ``"aucstats"`` sections to a list holding one value per run (the
        value stored at that run's selected epoch).
    """
    run_entries = list((Path.cwd() / "results" / path).glob("*"))

    # Load every run first, then choose the epoch for each of them.
    per_run_stats = [getstats(entry) for entry in run_entries]
    chosen_epochs = [
        find_bestepoch(stats, avoid_premature=avoid_premature)
        for stats in per_run_stats
    ]

    collective = {}
    for stats, epoch in zip(per_run_stats, chosen_epochs):
        for section_name in ("epochwisestats", "aucstats"):
            section = stats[section_name]
            for key in section:
                # One list per statistic, extended by one value per run.
                collective.setdefault(key, []).append(section[key][epoch])
    return collective

# Best-epoch statistics per embedding method. Each method name is also the
# name of its results directory: sub-symbolic methods are read from
# results/<name>, symbolic ones from results/symbolic/<name>.
subsymbolic = {method: get_best_stats(method) for method in ("sceptr", "tcr-bert")}

# NOTE(review): "kidera" and "atchley" are further symbolic candidates that are
# not loaded here; add them to the tuple to include them in the comparison.
symbolic = {
    method: get_best_stats(f"symbolic/{method}") for method in ("aaprop", "random")
}

# Every method in one lookup table (sub-symbolic entries first).
alltables = {**subsymbolic, **symbolic}

# Statistic names to look up in each method's table.
keys = ["train-loss.csv", "train-acc.csv", "train-preds.csv", "test-loss.csv", "test-acc.csv", "test-preds.csv"]

In [16]:
from IPython.display import display
import pandas as pd
import numpy as np

def dataframe_to_latex(df, label):
    """Render a DataFrame as the source of a LaTeX ``table`` environment.

    Args:
        df: Table to render. Column names form the header row, each index
            label starts its row, and every value is rounded to three
            decimals. A value that rounds to ``0.0`` is written as
            ``$\\approx 0$``.
        label: Identifier of the table, e.g. ``"train-loss.csv"``. It is
            appended verbatim to ``tab:ttest-`` for the LaTeX label; with
            ``.csv`` removed, hyphens replaced by spaces and title-casing
            applied it also names the statistic in the caption.

    Returns:
        str: The LaTeX source, without a trailing newline.
    """
    # One left-aligned column for the index, one centred column per data column.
    column_spec = "l|" + "c" * len(df.columns)
    lines = [
        "\\begin{table}[!h]",
        "    \\centering",
        "    \\begin{tabular}{" + column_spec + "}",
        "          & " + " & ".join(df.columns.astype(str)) + " \\\\\\hline",
    ]

    for index, row in df.iterrows():
        cells = np.round(row, 3).astype(str).replace("0.0", "$\\approx 0$").tolist()
        # str() so that non-string index labels (e.g. a RangeIndex) can be joined.
        lines.append("         " + " & ".join([str(index)] + cells) + " \\\\")

    # The caption is derived from the argument, not from any notebook-level
    # variable, so the function gives the same result wherever it is called.
    name = " ".join(label.replace(".csv", "").split("-")).title()

    lines += [
        "    \\end{tabular}",
        f"    \\caption{{T-Test $p$-values for {name}}}",
        f"    \\label{{tab:ttest-{label}}}",
        "\\end{table}",
    ]
    return "\n".join(lines)

# For every statistic, run a one-sided two-sample t-test of each sub-symbolic
# method against each symbolic method, then show the p-values both as a
# DataFrame and as LaTeX source.
for key in keys:
    # For a loss, the alternative hypothesis is that the symbolic mean is the
    # greater one; for every other statistic, that the sub-symbolic mean is.
    symbolic_expected_greater = "loss" in key

    # Holds p-values (not t statistics): tstatistic[sub-symbolic][symbolic].
    tstatistic = {}
    for ssmethod, ssvals in subsymbolic.items():
        tstatistic[ssmethod] = {}
        for sym, symvals in symbolic.items():
            if symbolic_expected_greater:
                greater, lesser = symvals[key], ssvals[key]
            else:
                greater, lesser = ssvals[key], symvals[key]

            # alternative="greater": the mean of the first sample exceeds
            # the mean of the second.
            t = scipy.stats.ttest_ind(greater, lesser, alternative="greater")
            tstatistic[ssmethod][sym] = t.pvalue

    # Columns are the sub-symbolic methods, rows the symbolic ones.
    df = pd.DataFrame(tstatistic)
    display(df)
    print(dataframe_to_latex(df, key))

Unnamed: 0,sceptr,tcr-bert
aaprop,0.000139,6.27636e-06
random,4e-05,5.34469e-08


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & $\approx 0$ & $\approx 0$ \\
         random & $\approx 0$ & $\approx 0$ \\
    \end{tabular}
    \caption{T-Test $p$-values for Train Loss}
    \label{tab:ttest-train-loss.csv}
\end{table}


Unnamed: 0,sceptr,tcr-bert
aaprop,0.000465,0.027363
random,6.6e-05,0.009256


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & $\approx 0$ & 0.027 \\
         random & $\approx 0$ & 0.009 \\
    \end{tabular}
    \caption{T-Test $p$-values for Train Acc}
    \label{tab:ttest-train-acc.csv}
\end{table}


Unnamed: 0,sceptr,tcr-bert
aaprop,0.000384,0.01824
random,5.1e-05,0.002675


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & $\approx 0$ & 0.018 \\
         random & $\approx 0$ & 0.003 \\
    \end{tabular}
    \caption{T-Test $p$-values for Train Preds}
    \label{tab:ttest-train-preds.csv}
\end{table}


Unnamed: 0,sceptr,tcr-bert
aaprop,0.000306,9.5e-05
random,0.000137,1.3e-05


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & $\approx 0$ & $\approx 0$ \\
         random & $\approx 0$ & $\approx 0$ \\
    \end{tabular}
    \caption{T-Test $p$-values for Test Loss}
    \label{tab:ttest-test-loss.csv}
\end{table}


Unnamed: 0,sceptr,tcr-bert
aaprop,0.004912,0.072787
random,0.010998,0.107959


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & 0.005 & 0.073 \\
         random & 0.011 & 0.108 \\
    \end{tabular}
    \caption{T-Test $p$-values for Test Acc}
    \label{tab:ttest-test-acc.csv}
\end{table}


Unnamed: 0,sceptr,tcr-bert
aaprop,0.000794,0.18936
random,0.143241,0.93364


\begin{table}[!h]
    \centering
    \begin{tabular}{l|cc}
          & sceptr & tcr-bert \\\hline
         aaprop & 0.001 & 0.189 \\
         random & 0.143 & 0.934 \\
    \end{tabular}
    \caption{T-Test $p$-values for Test Preds}
    \label{tab:ttest-test-preds.csv}
\end{table}
