In [None]:
import os
import sys

import pandas as pd

# The project source directory must be on sys.path *before* any project
# module is imported; otherwise the import below fails on a fresh kernel.
# (Assumes `utils` lives under ../../src -- confirm.)
# The membership check keeps sys.path from growing when the cell is re-run.
if "../../src" not in sys.path:
    sys.path.append("../../src")

from utils import load_best_features  # noqa: E402  (must follow the sys.path setup)

DATA_FOLDER = "../datasets.nosync/"

# Clinical training table and the feature list read from the
# feature-selection results file.
clinical_df = pd.read_csv(os.path.join(DATA_FOLDER, "clinical_train.csv"))
features = load_best_features("../results_fs/clinical_best_features.txt")

In [None]:
import os
import sys

import pandas as pd

# The project source directory must be on sys.path *before* any project
# module is imported; otherwise the imports below fail on a fresh kernel.
# (Assumes `utils` and `visualization` live under ../../src -- confirm.)
# The membership check keeps sys.path from growing when the cell is re-run.
if "../../src" not in sys.path:
    sys.path.append("../../src")

from utils import load_best_features  # noqa: E402  (must follow the sys.path setup)
from visualization import plot_difference, plot_multiple_spectra  # noqa: E402

DATA_FOLDER = "../datasets.nosync/"

# NMR training table and the feature list read from the feature-selection
# results file. NOTE: this rebinds the notebook-level name `features`.
nmr_df = pd.read_csv(os.path.join(DATA_FOLDER, "nmr_train.csv"))
features = load_best_features("../results_fs/nmr_best_features.txt")

# Spectra overview, then the difference plot without and with the
# selected-features argument.
plot_multiple_spectra(nmr_df)
plot_difference(nmr_df)
plot_difference(nmr_df, features=features)

In [None]:
# NOTE(review): this cell does not touch sys.path itself, so the import only
# resolves if an earlier cell has already made `visualization` importable
# (presumably via the "../../src" path entry -- confirm).
from visualization import plot_cumulative_variance

# Called without arguments: whatever data the plot uses is chosen inside the
# helper, not in this notebook.
plot_cumulative_variance()

In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Folders holding the per-model result CSVs. Each is defined once so the
# listing and the reads below can never point at different directories.
results_folder = "../results_models/baselines/"
deepsurv_folder = "../results_models/deepsurv/"
xgb_folder = "../results_models/xgb-aft/bagging2024-06-26_11-54-15/"

# Load data: one DataFrame per dataset, keyed by the title-cased dataset name
# parsed from the file name (e.g. "nmr_baseline_results.csv" -> "Nmr").
results_by_dataset = {}

for f in os.listdir(results_folder):
    if f.endswith(".csv"):
        dataset = f.split(".csv")[0].split("_baseline_results")[0].title()

        baseline = pd.read_csv(os.path.join(results_folder, f)).drop(columns=["Unnamed: 0"])
        baseline["Dataset"] = dataset
        results_by_dataset[dataset] = baseline

# Append the DeepSurv column to the matching baseline table (column-wise,
# aligned on the row index). A dataset without a baseline table raises KeyError.
for f in os.listdir(deepsurv_folder):
    if f.endswith(".csv"):
        dataset = f.split(".csv")[0].split("_deepsurv_results")[0].title()
        deepsurv = pd.read_csv(os.path.join(deepsurv_folder, f)).drop(columns=["Unnamed: 0"])[["DeepSurv"]]
        results_by_dataset[dataset] = pd.concat([results_by_dataset[dataset], deepsurv], axis=1)

# Same for the XGBoost test results, whose "C-index" column is renamed
# to the model name.
for f in os.listdir(xgb_folder):
    if f.endswith(".csv") and "test_results" in f:
        dataset = f.split(".csv")[0].split("test_results_")[-1].title()
        xgb = (
            pd.read_csv(os.path.join(xgb_folder, f))
            .drop(columns=["Unnamed: 0"])[["C-index"]]
            .rename(columns={"C-index": "XGBoost"})
        )
        results_by_dataset[dataset] = pd.concat([results_by_dataset[dataset], xgb], axis=1)

# Concatenate all results
all_results = pd.concat(list(results_by_dataset.values()))
new = all_results.set_index(["Dataset"])

# Per-dataset mean and standard deviation, each computed once
# (the std does not depend on the column being formatted below).
grouped = new.groupby(["Dataset"])
result = grouped.mean().round(3).reset_index()
std_by_dataset = grouped.std().round(3).reset_index()

# add standard deviation per result to the table ("mean±std" strings)
for i in result.columns:
    if i != "Dataset":
        result[i] = result[i].astype(str) + "\u00B1" + std_by_dataset[i].astype(str)
result = result.set_index(["Dataset"])
# rename indices
result = result.rename(index={"Nmr": "NMR", "Full": "Clinical + NMR"})
print(result)
# NOTE(review): every cell of `result` is a string at this point, so
# `float_format` has nothing numeric to format; the precision in the table is
# the round(3) applied above.
result.to_latex(
    os.path.join(results_folder, "baselines_table.tex"),
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    bold_rows=True,
    caption="Baseline models' performances on each dataset.",
    label="tab:baselines_res",
    float_format="%.2f",
)
# Missing model/dataset combinations become 0 in the frame kept for later cells.
all_results = all_results.fillna(0)

In [None]:
# Long format for seaborn: one row per (Dataset, Model, Value).
df_melted = all_results.melt(id_vars="Dataset", var_name="Model", value_name="Value")

# The theme only affects axes created afterwards, so it must be set before
# plt.subplots(); otherwise the first run of this cell draws on unthemed axes
# and only re-runs pick the theme up.
sns.set_theme(style="whitegrid")

fig, axes = plt.subplots(1, 3, sharey=True, figsize=(12, 6))

# get palette colors (one fixed colour per dataset)
set2_colors = sns.color_palette("Set2", 3)
palette = {
    "Nmr": set2_colors[2],
    "Clinical": set2_colors[1],
    "Full": set2_colors[0],
}
# Display names used as the x-axis label of each panel.
switch_name = {
    "Nmr": "NMR",
    "Clinical": "Clinical",
    "Full": "Clinical + NMR",
}

# One panel per dataset, one box per model.
for ax, dataset in zip(axes, ["Nmr", "Clinical", "Full"]):
    sns.boxplot(
        data=df_melted[df_melted["Dataset"] == dataset],
        x="Model",
        y="Value",
        hue="Dataset",
        palette=palette,
        ax=ax,
        width=0.5,
    )
    ax.set_xlabel(switch_name[dataset])
    ax.set_ylabel("C-index")
    ax.set_ylim(0.5, 0.91)

    # rotate x-tick labels
    ax.xaxis.set_tick_params(rotation=45)

    # remove the legend on every axis (the x-label already names the dataset)
    ax.legend([], [], frameon=False)

# despine all axes; additionally drop the left spine on all but the first axis
sns.despine(ax=axes[0], trim=True, offset=10)
axes[0].grid(axis="y", linestyle="--", color="grey", alpha=0.7)
for ax in axes[1:]:
    sns.despine(ax=ax, left=True, trim=True, offset=10)
    ax.grid(axis="y", linestyle="--", color="grey", alpha=0.7)

plt.tight_layout()
plt.savefig(os.path.join(results_folder, "boxplot_baselines_separate.pdf"))
plt.show()

In [None]:
import os

import numpy as np

from survival_dataset import SurvivalDataset
from visualization import (
    plot_boxplots,
    plot_survival_calibration,
    plot_survival_sd,
    plot_survival_time,
)

plot_path = "../results_models/xgb-aft/bagging2024-06-26_11-54-15"

for f in os.listdir(plot_path):
    # Only .npy files whose name contains both "test" and "full" are processed.
    if not (f.endswith(".npy") and "test" in f and "full" in f):
        continue

    ys = np.load(os.path.join(plot_path, f))
    # Dataset name: third "_"-separated token of the file name, minus ".npy".
    ds = f.split("_")[2].split(".npy")[0]

    dataset_test = SurvivalDataset()
    dataset_test.load_data(f"{ds}_test.csv")

    # The plot helpers receive the output folder only for the "full" dataset;
    # otherwise they are given None.
    save_to = plot_path if ds == "full" else None

    plot_survival_time(dataset_test, ys, interval=True, plot_path=save_to)

    # Mean and standard deviation along axis 0 of the loaded array.
    std_devs = ys.std(axis=0)
    ypred = ys.mean(axis=0)
    plot_survival_sd(dataset_test, ypred, std_devs, plot_path=save_to)

    plot_survival_calibration(dataset_test, ys, plot_path=save_to)

    # Called once per processed file, inside the loop.
    plot_boxplots(plot_path)