In [1]:
import os
import glob
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
def visualize(results_df: pd.DataFrame, save_path: str = "./losses.png"):
    """Plot the mean of four loss measures per array, split by hemisphere.

    One point-plot panel is drawn for each of: cost, dice loss (1 - total_dice),
    yield loss (1 - array_yield) and total_HD (labelled "Hellinger distance").
    The figure is saved to ``save_path`` and then shown.

    Parameters
    ----------
    results_df : pd.DataFrame
        Must contain the columns "array", "hemisphere", "cost", "total_dice",
        "array_yield" and "total_HD". The frame is copied, not modified.
    save_path : str, optional
        Where the figure is written. Defaults to "./losses.png"; pass a
        different path per call to avoid overwriting an earlier figure.
    """
    res_df = results_df.copy()
    # Dice and yield are converted to losses so that "lower is better" holds in every panel.
    res_df["total_dice_loss"] = 1 - res_df["total_dice"]
    res_df["array_yield_loss"] = 1 - res_df["array_yield"]

    # (column to plot, axis label) for each panel, left to right.
    panels = [
        ("cost", "Cost"),
        ("total_dice_loss", "Dice loss (1-dice)"),
        ("array_yield_loss", "Yield loss (1-yield)"),
        ("total_HD", "Hellinger distance"),
    ]

    fig, axes = plt.subplots(nrows=1, ncols=len(panels), figsize=(16, 6), sharex=True)
    for ax, (dv, dv_name) in zip(axes, panels):
        sns.pointplot(x="array", y=dv, hue="hemisphere", data=res_df, ax=ax)
        ax.set_title(f"Cumulative {dv_name.lower()}", fontsize=16)
        # The per-panel x label is blanked; a single shared label is set on the figure below.
        ax.set_xlabel("", fontsize=16)
        ax.set_ylabel(dv_name, fontsize=16)
        ax.legend(title="Hemisphere")

    fig.suptitle("Mean cumulative losses", fontsize=24)
    fig.supxlabel("Array", fontsize=16)
    fig.tight_layout()
    fig.savefig(save_path)
    plt.show()

In [3]:
def get_data_hem(
    max_arrays: int,
    results_path: str = "/home/odysseas/Desktop/UU/thesis/BayesianOpt/5_arrays_10x10x10/results/",
) -> pd.DataFrame:
    """Collect per-hemisphere results for arrays 1..max_arrays.

    Every sub-directory of ``results_path`` (except "exp" and "fsaverage") is
    treated as a subject. For each subject and each hemisphere ("LH", "RH") the
    CSV whose file name contains "best" is read; the hemisphere is kept only if
    that file has a row for array ``max_arrays``. Hemispheres are judged
    independently, so a subject may contribute one hemisphere only.

    Parameters
    ----------
    max_arrays : int
        Highest array number required; rows for arrays 1..max_arrays are returned.
    results_path : str, optional
        Directory holding one folder per subject. Defaults to the original
        hard-coded location.

    Returns
    -------
    out_df : pd.DataFrame
        A dataframe to be put in the AnovaRM function with columns:
        ["subject", "hemisphere", "array", "total_dice", "prop_total_dice",
        "array_yield", "total_HD", "prop_total_hd", "cost", "prop_cost"].
        Empty (but with these columns) when nothing qualifies.
    """
    column_order = ["subject", "hemisphere", "array", "total_dice", "prop_total_dice",
                    "array_yield", "total_HD", "prop_total_hd", "cost", "prop_cost"]
    value_columns = column_order[2:]  # the columns that come from the CSV itself
    excluded_entries = {"exp", "fsaverage"}
    arrays = range(1, max_arrays + 1)

    frames = []
    # Sorted for a reproducible row order; entries are filtered without mutating
    # the list being iterated (removing during iteration skips subjects).
    for sub in sorted(os.listdir(results_path)):
        if sub in excluded_entries:
            continue
        for hem in ["LH", "RH"]:
            hem_dir = os.path.join(results_path, sub, hem)
            # Match "best" against the file name only, not the whole path.
            best_files = sorted(
                path for path in glob.glob(os.path.join(hem_dir, "*.csv"))
                if "best" in os.path.basename(path)
            )
            if not best_files:
                # Missing hemisphere folder / no results yet: skip instead of crashing.
                print(f"No 'best' results file found in {hem_dir}")
                continue
            # Assuming there's only one such file in the directory, take the first one.
            filename = best_files[0]
            try:
                res_df = pd.read_csv(filename)
            except FileNotFoundError:
                print(f"File {filename} not found")
                continue

            if max_arrays not in res_df["array"].tolist():
                continue  # this hemisphere has not reached the requested array count

            for array in arrays:
                rows = res_df.loc[res_df["array"] == array, value_columns].copy()
                rows.insert(0, "hemisphere", hem)
                rows.insert(0, "subject", sub)
                frames.append(rows)

    if not frames:
        return pd.DataFrame(columns=column_order)
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [4]:
def get_data(
    max_arrays: int,
    results_path: str = "/home/odysseas/Desktop/UU/thesis/BayesianOpt/5_arrays_10x10x10/results/",
) -> pd.DataFrame:
    """Collect results for subjects that reached ``max_arrays`` in BOTH hemispheres.

    Every sub-directory of ``results_path`` (except "exp" and "fsaverage") is
    treated as a subject. For each hemisphere ("LH", "RH") the CSV whose file
    name contains "best" is read. A subject is included only when both
    hemispheres have such a file and both contain a row for array
    ``max_arrays``; otherwise none of the subject's rows are returned.

    Parameters
    ----------
    max_arrays : int
        Highest array number required; rows for arrays 1..max_arrays are returned.
    results_path : str, optional
        Directory holding one folder per subject. Defaults to the original
        hard-coded location.

    Returns
    -------
    out_df : pd.DataFrame
        A dataframe to be put in the AnovaRM function with columns:
        ["subject", "hemisphere", "array", "total_dice", "prop_total_dice",
        "array_yield", "total_HD", "prop_total_hd", "cost", "prop_cost"].
        Empty (but with these columns) when nothing qualifies.
    """
    column_order = ["subject", "hemisphere", "array", "total_dice", "prop_total_dice",
                    "array_yield", "total_HD", "prop_total_hd", "cost", "prop_cost"]
    value_columns = column_order[2:]  # the columns that come from the CSV itself
    excluded_entries = {"exp", "fsaverage"}
    hemispheres = ["LH", "RH"]
    arrays = range(1, max_arrays + 1)

    frames = []
    # Sorted for a reproducible row order; entries are filtered without mutating
    # the list being iterated (removing during iteration skips subjects).
    for sub in sorted(os.listdir(results_path)):
        if sub in excluded_entries:
            continue

        subject_frames = []
        complete_hems = 0  # hemispheres of this subject that reached max_arrays
        for hem in hemispheres:
            hem_dir = os.path.join(results_path, sub, hem)
            # Match "best" against the file name only, not the whole path.
            best_files = sorted(
                path for path in glob.glob(os.path.join(hem_dir, "*.csv"))
                if "best" in os.path.basename(path)
            )
            if not best_files:
                # Missing hemisphere folder / no results yet: skip instead of crashing.
                print(f"No 'best' results file found in {hem_dir}")
                continue
            # Assuming there's only one such file in the directory, take the first one.
            filename = best_files[0]
            try:
                res_df = pd.read_csv(filename)
            except FileNotFoundError:
                print(f"File {filename} not found")
                continue

            if max_arrays not in res_df["array"].tolist():
                continue

            complete_hems += 1
            for array in arrays:
                rows = res_df.loc[res_df["array"] == array, value_columns].copy()
                rows.insert(0, "hemisphere", hem)
                rows.insert(0, "subject", sub)
                subject_frames.append(rows)

        # Keep the subject only when every hemisphere qualified.
        if complete_hems == len(hemispheres):
            frames.extend(subject_frames)

    if not frames:
        return pd.DataFrame(columns=column_order)
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [5]:
# Datasets restricted to subjects that completed the max array in BOTH hemispheres,
# one per maximum array count (1 through 5), in ascending order.
list_all_array_dfs = [get_data(max_arrays=n_arrays) for n_arrays in range(1, 6)]
array1_data, array2_data, array3_data, array4_data, all_arrays_data = list_all_array_dfs

In [6]:
# Per-hemisphere datasets: the number of subjects may differ between hemispheres
# for each maximum array count (1 through 5), in ascending order.
list_all_hem_array_dfs = [get_data_hem(max_arrays=n_arrays) for n_arrays in range(1, 6)]
(
    hem_array1_data,
    hem_array2_data,
    hem_array3_data,
    hem_array4_data,
    all_hem_arrays_data,
) = list_all_hem_array_dfs

In [7]:
def get_descriptive_stats(all_arrays_df: list, hem: str) -> pd.DataFrame:
    """Summarise each dataset at its highest array for one hemisphere.

    For every dataframe in ``all_arrays_df`` the rows of hemisphere ``hem`` are
    selected, the highest array number present is found, and the metric columns
    are averaged over the rows of that array. Dice and yield are reported as
    losses (1 - mean).

    Parameters
    ----------
    all_arrays_df : list of pd.DataFrame
        Frames with the columns "subject", "hemisphere", "array", "total_dice",
        "prop_total_dice", "array_yield", "total_HD", "prop_total_hd", "cost"
        and "prop_cost". Empty frames are skipped.
    hem : str
        Hemisphere label to select (the notebook uses "LH" and "RH").

    Returns
    -------
    pd.DataFrame
        One row per input frame that has data for ``hem``, with columns
        ["array", "total_subjects", "hemisphere", "total_dice_loss",
        "prop_total_dice", "array_yield_loss", "total_HD", "prop_total_hd",
        "cost", "prop_cost"]. ``total_subjects`` counts the distinct subjects
        of that hemisphere in the frame.
    """
    metric_columns = ["total_dice", "prop_total_dice", "array_yield", "total_HD",
                      "prop_total_hd", "cost", "prop_cost"]
    column_order = ["array", "total_subjects", "hemisphere", "total_dice_loss", "prop_total_dice",
                    "array_yield_loss", "total_HD", "prop_total_hd", "cost", "prop_cost"]

    summaries = []
    for df in all_arrays_df:
        if df.empty:
            continue  # also covers a column-less frame, where df["hemisphere"] would raise
        hem_df = df[df["hemisphere"] == hem]
        if hem_df.empty:
            continue

        max_array = hem_df["array"].max()
        total_subs = len(hem_df["subject"].unique())
        # Only the max array is reported, so restrict before averaging.
        at_max_array = hem_df[hem_df["array"] == max_array]
        stats = at_max_array.groupby(["hemisphere", "array"])[metric_columns].mean().reset_index()
        stats["total_subjects"] = total_subs
        stats["total_dice_loss"] = 1 - stats["total_dice"]
        stats["array_yield_loss"] = 1 - stats["array_yield"]
        summaries.append(stats[column_order])

    if not summaries:
        return pd.DataFrame(columns=column_order)
    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat(summaries, ignore_index=True)

In [8]:
# Descriptive statistics per hemisphere for the both-hemispheres datasets.
descriptives_LH, descriptives_RH = (
    get_descriptive_stats(list_all_array_dfs, hemisphere) for hemisphere in ("LH", "RH")
)

In [57]:
# Display the left-hemisphere descriptive statistics (datasets from get_data,
# i.e. subjects that completed the max array in both hemispheres).
descriptives_LH

In [58]:
# Display the right-hemisphere descriptive statistics (datasets from get_data,
# i.e. subjects that completed the max array in both hemispheres).
descriptives_RH

In [59]:
# Descriptive statistics per hemisphere for the per-hemisphere datasets
# (subject counts may differ between LH and RH).
descriptives_hem_LH, descriptives_hem_RH = (
    get_descriptive_stats(list_all_hem_array_dfs, hemisphere) for hemisphere in ("LH", "RH")
)

In [71]:
# Left-hemisphere descriptives, rounded to two decimals for display.
descriptives_hem_LH.round(2)

In [70]:
# Right-hemisphere descriptives, rounded to two decimals for display.
descriptives_hem_RH.round(2)

In [9]:
# Plot the mean losses per array for the per-hemisphere 5-array dataset
# (the figure is also written to disk by visualize).
visualize(all_hem_arrays_data)

In [38]:
# Standardise (z-score) the cost of array 2 in the left hemisphere.
arr_df = all_arrays_data.loc[
    (all_arrays_data["array"] == 2) & (all_arrays_data["hemisphere"] == "LH")
].reset_index(drop=True)
cost_mean = arr_df["cost"].mean()
cost_std = arr_df["cost"].std()
arr_df["std_cost"] = (arr_df["cost"] - cost_mean) / cost_std
arr_df

In [73]:
# Split the per-hemisphere 5-array dataset into one frame per hemisphere.
all_hem_arrays_data_LH = all_hem_arrays_data.loc[all_hem_arrays_data["hemisphere"] == "LH"]
all_hem_arrays_data_RH = all_hem_arrays_data.loc[all_hem_arrays_data["hemisphere"] == "RH"]

In [75]:
def create_histograms(all_arrays_data_LH, all_arrays_data_RH):
    """Draw a 2 x 5 grid of cost histograms and save it to ./histograms.png.

    The top row uses ``all_arrays_data_LH`` and the bottom row
    ``all_arrays_data_RH``; each column shows the "cost" values of one array
    (1 to 5). Both frames need the columns "array" and "cost".
    """
    palette = ["green", "skyblue", "olive", "gold", "teal"]
    fig, axes = plt.subplots(2, 5, figsize=(12, 8))

    hemisphere_frames = [all_arrays_data_LH, all_arrays_data_RH]
    for row_idx, hem_frame in enumerate(hemisphere_frames):
        for col_idx, arr in enumerate(range(1, 6)):
            panel = axes[row_idx, col_idx]
            arr_costs = hem_frame[hem_frame["array"] == arr].reset_index(drop=True).copy()
            sns.histplot(data=arr_costs, x="cost", kde=False, color=palette[col_idx], ax=panel)
            # Only the bottom row carries an x label, to avoid repeating it twice per column.
            if row_idx == 1:
                panel.set_xlabel(f"Cost for array {arr}", fontsize=14)
            else:
                panel.set_xlabel("")
            panel.set_ylabel("")

    # A single y label per row, placed on the left-most panel.
    row_labels = ["Count for left hemisphere", "Count for right hemisphere"]
    for label, axes_row in zip(row_labels, axes):
        axes_row[0].set_ylabel(label, fontsize=14)

    fig.suptitle(f"Distribution of cost per array and hemisphere", fontsize=24)
    plt.tight_layout()
    plt.savefig(f"./histograms.png")
    plt.show()

In [76]:
# Cost histograms per array for the per-hemisphere 5-array dataset (LH top row, RH bottom row).
create_histograms(all_hem_arrays_data_LH, all_hem_arrays_data_RH)

In [68]:
# Mean and standard deviation of the pairwise cost differences between arrays
# (left hemisphere). The cost series of each array is extracted once up front
# instead of being re-filtered for every pair.
costs_by_array_LH = {
    arr: all_hem_arrays_data_LH.loc[all_hem_arrays_data_LH["array"] == arr, "cost"].reset_index(drop=True)
    for arr in range(1, 6)
}
for arr in range(1, 6):
    for arr_next in range(arr + 1, 6):
        # NOTE(review): values are paired by row position after reset_index, which
        # assumes every array lists the subjects in the same order - confirm.
        dif = costs_by_array_LH[arr] - costs_by_array_LH[arr_next]

        print("Array pair:", arr, arr_next)
        print("mean dif and std:", round(dif.mean(), 4), round(dif.std(), 4))

In [69]:
# Mean and standard deviation of the pairwise cost differences between arrays
# (right hemisphere). The cost series of each array is extracted once up front
# instead of being re-filtered for every pair.
costs_by_array_RH = {
    arr: all_hem_arrays_data_RH.loc[all_hem_arrays_data_RH["array"] == arr, "cost"].reset_index(drop=True)
    for arr in range(1, 6)
}
for arr in range(1, 6):
    for arr_next in range(arr + 1, 6):
        # NOTE(review): values are paired by row position after reset_index, which
        # assumes every array lists the subjects in the same order - confirm.
        dif = costs_by_array_RH[arr] - costs_by_array_RH[arr_next]

        print("Array pair:", arr, arr_next)
        print("mean dif and std:", round(dif.mean(), 4), round(dif.std(), 4))

In [66]:
def post_hoc_comparisons(array_hem_data: pd.DataFrame):
    """Runs all post-hoc comparisons with Tukey's pairwise test.
    This inherently corrects for multiple comparisons, thus
    keeping the family-wise error rate at the specified alpha (0.05).

    The comparisons are done on whatever rows are passed in; the caller is
    expected to have filtered the data to a single hemisphere beforehand.

    NOTE(review): Tukey's HSD treats the array groups as independent samples,
    whereas the same subjects appear in every array here - confirm this is the
    intended post-hoc test for the repeated-measures design.

    Parameters
    ----------
    array_hem_data : pd.DataFrame
        The dataframe with the results data for one hemisphere.
        Only the columns "cost" (dependent variable) and "array" (group label)
        are used.

    Returns
    -------
    results : A TukeyHSDResults instance
    """
    # Pass 1-D Series (single brackets): the statsmodels API documents endog and
    # groups as 1-D arrays, not single-column DataFrames.
    results = pairwise_tukeyhsd(endog=array_hem_data["cost"],
                                groups=array_hem_data["array"], alpha=0.05)

    return results

# Run the post-hoc comparisons separately for each hemisphere of the 5-array dataset.
separator = "*" * 52
for hem in ("LH", "RH"):
    hem_df = all_hem_arrays_data.loc[all_hem_arrays_data["hemisphere"] == hem]
    total_observations = len(pd.unique(hem_df["subject"]))
    tukey_results = post_hoc_comparisons(hem_df)
    print(f"COMPARISONS FOR {hem}, AND A TOTAL OF 5 ARRAYS WITH {total_observations} SUBJECTS:")
    print(tukey_results.summary())
    print(separator, "\n")

In [72]:
import pingouin as pg

# Sphericity of cost across arrays, tested separately per hemisphere.
s_LH = pg.sphericity(all_hem_arrays_data_LH, dv="cost", within="array", subject="subject")
s_RH = pg.sphericity(all_hem_arrays_data_RH, dv="cost", within="array", subject="subject")

# The assumption is considered met when the p-value is above 0.05.
for sphericity_result in (s_LH, s_RH):
    print(sphericity_result)

# Homogeneity of variances of cost between arrays, per hemisphere.
homogeneity_test_LH = pg.homoscedasticity(data=all_hem_arrays_data_LH, dv="cost", group="array")
homogeneity_test_RH = pg.homoscedasticity(data=all_hem_arrays_data_RH, dv="cost", group="array")
print("Homogeneity of variances LH:\n", homogeneity_test_LH)
print("Homogeneity of variances RH:\n", homogeneity_test_RH)

In [33]:
# Plot the mean losses per array for subjects with all 5 arrays in both hemispheres.
# NOTE: visualize saves to "./losses.png" by default, so this call replaces the
# file written by the earlier visualize(all_hem_arrays_data) call.
visualize(all_arrays_data)

In [29]:
# Number of distinct subjects in the left-hemisphere 5-array dataset.
len(pd.unique(all_hem_arrays_data_LH["subject"]))

In [30]:
# Number of distinct subjects in the right-hemisphere 5-array dataset.
len(pd.unique(all_hem_arrays_data_RH["subject"]))

In [31]:
# Display the left-hemisphere rows of the per-hemisphere 5-array dataset.
all_hem_arrays_data_LH

In [32]:
# One-way repeated-measures ANOVA of cost over arrays, fitted separately per
# hemisphere on the per-hemisphere 5-array dataset.
model_LH = AnovaRM(all_hem_arrays_data_LH, depvar="cost", subject="subject", within=["array"]).fit()
model_RH = AnovaRM(all_hem_arrays_data_RH, depvar="cost", subject="subject", within=["array"]).fit()

for fitted_model in (model_LH, model_RH):
    print(fitted_model.summary())

In [25]:
# `all_arrays_data_LH` / `all_arrays_data_RH` are not created by any earlier cell,
# so on a fresh kernel this cell raised NameError. Derive them from the
# both-hemispheres dataset before displaying the left-hemisphere part.
all_arrays_data_LH = all_arrays_data[all_arrays_data["hemisphere"] == "LH"]
all_arrays_data_RH = all_arrays_data[all_arrays_data["hemisphere"] == "RH"]
all_arrays_data_LH

In [24]:
# One-way repeated-measures ANOVA of cost over arrays, per hemisphere, restricted
# to subjects that completed all 5 arrays in both hemispheres.
# The per-hemisphere frames are derived here because no earlier cell defines
# them, which made this cell fail with NameError on a fresh kernel.
all_arrays_data_LH = all_arrays_data[all_arrays_data["hemisphere"] == "LH"]
all_arrays_data_RH = all_arrays_data[all_arrays_data["hemisphere"] == "RH"]

model_LH = AnovaRM(data=all_arrays_data_LH, depvar="cost", subject="subject",
                    within=["array"]).fit()

model_RH = AnovaRM(data=all_arrays_data_RH, depvar="cost", subject="subject", within=["array"]).fit()

print(model_LH.summary())
print(model_RH.summary())

In [105]:
# Two-way repeated-measures ANOVA of cost with array and hemisphere as within-subject factors.
within_factors = ["array", "hemisphere"]
model = AnovaRM(all_arrays_data, depvar="cost", subject="subject", within=within_factors).fit()

model.summary()

In [86]:
from bioinfokit.analys import stat

"""p needs to be > 0.05 to reject the null, and to infer equal variances"""
res = stat()
res.levene(df=all_arrays_data, res_var="cost", xfac_var="array")
res.levene_summary

In [90]:
# Two-way repeated-measures ANOVA (array x hemisphere) on the both-hemispheres dataset.
model = AnovaRM(
    all_arrays_data, "cost", "subject", within=["array", "hemisphere"]
).fit()
model.summary()

In [93]:
def run_rm_anova(array_data: pd.DataFrame):
    """Fit a two-way repeated-measures ANOVA on cost.

    Parameters
    ----------
    array_data : pd.DataFrame
        Results data. The columns used are "cost" (dependent variable),
        "subject" (subject identifier) and the two within-subject factors
        "hemisphere" and "array".

    Returns
    -------
    model : the object returned by ``AnovaRM(...).fit()``
    """
    within_factors = ["hemisphere", "array"]
    anova = AnovaRM(array_data, depvar="cost", subject="subject", within=within_factors)
    return anova.fit()

# Repeat the two-way RM ANOVA for the datasets with 2, 3, 4 and 5 arrays.
rm_anova_datasets = [array2_data, array3_data, array4_data, all_arrays_data]
for n_arrays, array_data in enumerate(rm_anova_datasets, start=2):
    total_observations = len(pd.unique(array_data["subject"]))
    anova_model = run_rm_anova(array_data)
    print(f"RM ANOVA FOR A TOTAL OF {n_arrays} ARRAYS WITH {total_observations} SUBJECTS:")
    print(anova_model.summary())
    print("*" * 52, "\n")

In [16]:
# Bar chart of cost per array, split by hemisphere.
ax = sns.barplot(data=all_arrays_data, x="array", y="cost", hue="hemisphere")

# Labels, title and axis range; the y axis is restricted to 2.2-2.6, so the
# bars are not drawn from zero.
ax.set_xlabel("Arrays")
ax.set_ylabel("Cost")
ax.set_ylim(2.2, 2.6)
ax.set_title("Cost per array and hemisphere")
plt.show()

In [None]:
"""A significant interaction between hemisphere:array means that the impact of hemisphere changes depending on the array.

A non-significant interaction between hemisphere:array suggests that the impact of the hemisphere remains the same across arrays. The effect of one factor (hemisphere) is consistent across different levels of the other factor (array). This suggests that the factors are independent in terms of their effect on the outcome variable (the cost)!"""