In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Summarise the data

In [129]:
def summarise(df: pd.DataFrame, data_name: str,
              outcome_col: str = "death_2years",
              n_non_feature_cols: int = 2) -> pd.DataFrame:
    """Build a one-row summary table for a dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to summarise. Must contain the column named by ``outcome_col``.
    data_name : str
        Label written to the "Dataset" column of the result.
    outcome_col : str, default "death_2years"
        Name of the outcome column. Missing values are dropped before
        counting and averaging. The reported percentage is 100 * mean of the
        remaining values, so it is only a true percentage when the column is
        coded 0/1 (assumed here -- confirm against the cleaning step).
    n_non_feature_cols : int, default 2
        Number of columns subtracted from the column count to obtain the
        number of features (presumably the identifier and the outcome --
        verify for frames that lack one of them).

    Returns
    -------
    pd.DataFrame
        A single row with the columns "Dataset", "Observations" (count of
        non-missing outcome values), "Number of features" and
        "Percent dead within 2 years" (a string such as "59.8%").

    Raises
    ------
    KeyError
        If ``outcome_col`` is not a column of ``df``.
    """
    # Bracket indexing works for any column label, unlike attribute access.
    outcome_series = df[outcome_col].dropna()
    n_outcome = len(outcome_series)
    percent_outcome = round(100 * outcome_series.mean(), 1)

    n_features = len(df.columns) - n_non_feature_cols

    return pd.DataFrame({
        "Dataset": [data_name],
        "Observations": [n_outcome],
        "Number of features": [n_features],
        "Percent dead within 2 years": [f"{percent_outcome}%"]
    })

In [130]:
# Read the three cleaned clinical CSV files.
clinical1 = pd.read_csv("../data/clean/clinical1.csv")
clinical2 = pd.read_csv("../data/clean/clinical2.csv")
clinical_joined = pd.read_csv("../data/clean/clinical_joined.csv")

# Image features are split across 12 CSV files (numbered 1-12); each one is
# kept under a "batch_<n>" key.
image_dict = {
    f"batch_{batch}": pd.read_csv(f"../data/clean/image_features_{batch}.csv")
    for batch in range(1, 13)
}

# Stack every batch row-wise; the per-file row indices are kept as they are.
image_features = pd.concat(list(image_dict.values()))

# Left-join the image features onto the joined clinical table by patient_id.
# The image table's own death_2years column is dropped first, so the merged
# frame only carries the outcome column coming from clinical_joined.
full_data = pd.merge(
    clinical_joined,
    image_features.drop(columns=["death_2years"]),
    on="patient_id",
    how="left",
)

In [131]:
# One single-row summary per dataset, in the order the rows should appear.
data = [
    summarise(frame, label)
    for frame, label in (
        (clinical1, "Clinical1"),
        (clinical2, "Clinical2"),
        (clinical_joined, "Clinical Joined"),
        (image_features, "Image Features"),
        (full_data, "Full Data"),
    )
]

# Stack the rows (row-wise is the concat default) and index by dataset name.
summary = pd.concat(data).set_index("Dataset")

In [132]:
# Show the per-dataset summary table (indexed by "Dataset") as the cell output.
summary

Unnamed: 0_level_0,Observations,Number of features,Percent dead within 2 years
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clinical1,420,7,59.8%
Clinical2,180,26,20.0%
Clinical Joined,600,7,47.8%
Image Features,421,19,59.9%
Full Data,600,26,47.8%


# Format results

In [133]:
# Load the model-metric tables, one CSV per dataset. header=[0, 1] tells
# pandas to use the first two rows of each file as two-level column labels.
(
    clinical1_results,
    clinical2_results,
    clinical_joined_results,
    image_features_results,
    full_data_results,
) = [
    pd.read_csv(path, header=[0, 1])
    for path in (
        "../results/clinical1_model_metrics.csv",
        "../results/clinical2_model_metrics.csv",
        "../results/clinical_joined_model_metrics.csv",
        "../results/image_features_model_metrics.csv",
        "../results/full_data_model_metrics.csv",
    )
]

In [134]:
# Per-dataset metric tables, listed in the row order of the combined table.
results = [
    clinical1_results,
    clinical2_results,
    clinical_joined_results,
    image_features_results,
    full_data_results,
]

# Stack the tables row-wise and renumber the rows 0..n-1.
metrics = pd.concat(results, axis=0).reset_index(drop=True)

# Relabel only the first column as ("", "Dataset"); every other two-level
# column label is carried over unchanged.
column_names = [("", "Dataset"), *metrics.columns[1:]]
columns = pd.MultiIndex.from_tuples(column_names)
metrics.columns = columns

In [135]:
# Show the combined metrics table (one row per dataset) as the cell output.
metrics

Unnamed: 0_level_0,Unnamed: 1_level_0,LASSO,LASSO,LASSO,MLP,MLP,MLP
Unnamed: 0_level_1,Dataset,Weighted Accuracy,AUC,F1 Score,Weighted Accuracy,AUC,F1 Score
0,Clinical1,0.5,0.48,0.7,0.52,0.49,0.67
1,Clinical2,0.88,0.9,0.75,0.74,0.87,0.53
2,Clinical Joined,0.66,0.69,0.63,0.66,0.69,0.64
3,Image Features,0.51,0.53,0.64,0.51,0.53,0.58
4,Full Data,0.66,0.71,0.62,0.64,0.69,0.57


# Create subplot figures

In [None]:
# Paths of the two pre-rendered plot images that the next cell stacks into a
# single figure (file names suggest model coefficients and SHAP values for
# the clinical2 models -- confirm against the plotting script).
images = [
    "../results/feature_plots/lreg_clinical2_model_coefs.png",
    "../results/feature_plots/mlp_clinical2_model_shap.png",
]

In [222]:
# Two vertically stacked panels keyed "a)" and "b)" on a 16x16 inch canvas,
# spaced by matplotlib's constrained layout.
fig, ax = plt.subplot_mosaic(
    [["a)"], ["b)"]],
    layout="constrained",
    figsize=(16, 16),
)

# Read both source images up front (first path -> panel a, second -> panel b).
lreg_img, mlp_img = plt.imread(images[0]), plt.imread(images[1])

# Panel a). No transform is passed to text(), so (0.0, 1.0) is interpreted in
# the axes' data coordinates.
ax["a)"].text(0.0, 1.0, "a)", fontsize=18, verticalalignment="bottom", family="serif")
ax["a)"].imshow(lreg_img)
ax["a)"].set_axis_off()

# Panel b).
# NOTE(review): this label is top-aligned while panel a) is bottom-aligned --
# confirm the difference is intentional.
ax["b)"].text(0.0, 1.0, "b)", fontsize=18, verticalalignment="top", family="serif")
ax["b)"].imshow(mlp_img)
ax["b)"].set_axis_off()

# Write the combined figure to disk, then close it so it is not left open.
fig.savefig("../results/clinical2_feature_importance.png")
plt.close(fig)