In [None]:
#!pip install statannotations==0.5.0
#!pip install seaborn==0.11.2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, re
# Global matplotlib settings for every figure saved by this notebook:
# PDF font type 42 and the Arial font family.
plt.rcParams.update({
    'pdf.fonttype': 42,
    'font.family': 'Arial',
})

In [None]:
from statannotations.Annotator import Annotator

In [None]:
from scipy import stats

# Import model prediction

In [None]:
# Directories holding the model predictions (`pred_vs_true.csv`), given
# relative to this notebook so that the repository layout is enough to run it.
ipt_DIR_male = "../../../2_model_construction/lasso/out/MAPE/0.99/Lasso/male/tsfresh/"
ipt_DIR_female = "../../../2_model_construction/lightGBM/out/MAPE/0.99/LGBM/female/both/"

# Author-local absolute locations. They used to overwrite the relative paths
# unconditionally, which made the notebook fail on every other machine; they
# are now used only when the directory actually exists.
# NOTE(review): presumably these hold the same prediction files as the
# relative paths above -- confirm, then consider deleting the override.
_local_DIR_male = "/Users/s_magi/research/AI_experiment/220705/MAPE_bagging/0.99/Lasso/male/tsfresh/"
_local_DIR_female = "/Users/s_magi/research/AI_experiment/220705/MAPE/0.99/LGBM/female/both/"
if os.path.isdir(_local_DIR_male):
    ipt_DIR_male = _local_DIR_male
if os.path.isdir(_local_DIR_female):
    ipt_DIR_female = _local_DIR_female

# Output locations: figures and tabular results.
outDIR_figure = "../figure/"
outDIR = "../out/"
os.makedirs(outDIR, exist_ok=True)


In [None]:
# Both prediction files are read the same way: keep file columns 1, 2 and 5
# and use the first kept column as the index.
pred_csv_name = "pred_vs_true.csv"
pred_csv_options = dict(usecols=[1, 2, 5], index_col=0)

df_pred_male = pd.read_csv(os.path.join(ipt_DIR_male, pred_csv_name), **pred_csv_options)
df_pred_female = pd.read_csv(os.path.join(ipt_DIR_female, pred_csv_name), **pred_csv_options)

# Stack the male rows on top of the female rows.
df_pred = pd.concat([df_pred_male, df_pred_female])

# Import systemic parameters

In [None]:
# Systemic-parameter table, indexed by its `group.cmp` column.
df = pd.read_csv("../data/systemic_params.csv",index_col="group.cmp")

# Import matching results

In [None]:
# Matching results; the classification cell below reads its `pred_real` and
# `group.cmp` columns to tag the matched samples.
df_match = pd.read_csv("../out/pred_vs_true_matched_male_and_female.csv")

### Figure 4A Colour coding by propensity score matching

In [None]:
# Names of the two label columns and of the labels written into them.
new_col = "Class"
new_col_2 = "Class_2"
excluded_class = "Excluded"
younger_class_name = "Model-predicted younger"
older_class_name = "Model-predicted older"

# Threshold-based labelling (stored in `new_col_2`): a sample is "younger" when
# its predicted age is below 90% of its age, "older" when it is above 110%,
# and "Excluded" otherwise.
predicted_well_below_age = df["Age"] * 0.9 > df["Predicted_age"]
predicted_well_above_age = df["Age"] * 1.1 < df["Predicted_age"]

df[new_col_2] = excluded_class
df.loc[predicted_well_below_age, new_col_2] = younger_class_name
df.loc[predicted_well_above_age, new_col_2] = older_class_name

# `new_col` starts as a copy of the threshold labels; samples listed in the
# matching results are then relabelled with a " (matched)" suffix
# (pred_real == 0 -> younger, pred_real == 1 -> older).
df[new_col] = df[new_col_2]
matched_suffix = " (matched)"
for pred_real_flag, class_name in ((0, younger_class_name), (1, older_class_name)):
    matched_ids = df_match.loc[df_match["pred_real"] == pred_real_flag, "group.cmp"]
    df.loc[df.index.isin(matched_ids), new_col] = class_name + matched_suffix

# Comparison in components

In [None]:
# Body-mass index: weight in kg divided by the squared height in metres.
height_m = df["Height (cm)"] / 100
df["BMI ($kg/m^{2}$)"] = df["Weight (kg)"] / height_m**2

In [None]:
# Drug-treatment columns that are removed from the table before the
# group comparisons below.
useobj_after = [
    "drug treatment for elevated blood pressure",
    "drug treatment for dyslipidemia",
    "drug treatment for elevated blood glucose",
]
df = df.drop(columns=useobj_after)

# Comparison between matched samples

In [None]:
# Output folder for the figures/tables of the matched-sample comparison.
outDIR_th = outDIR_figure+ "/matched"
os.makedirs(outDIR_th, exist_ok=True)

# Labels of the samples that were NOT picked by the matching step.
# NOTE(review): `transp_class` was used here without ever being defined in
# this notebook (NameError on a fresh kernel). It is reconstructed as the
# three un-suffixed labels, so that only the "... (matched)" rows remain,
# which is what the relabelling below expects -- confirm.
transp_class = [excluded_class, younger_class_name, older_class_name]

# Keep the matched rows only and drop the threshold-based label column.
df_tmp1 = df.loc[~df[new_col].isin(transp_class)].copy().drop([new_col_2], axis=1)

# Strip the " (matched)" suffix so the groups carry the plain class names.
df_tmp1[new_col] = df_tmp1[new_col].replace({
    older_class_name + " (matched)": older_class_name,
    younger_class_name + " (matched)": younger_class_name,
})

In [None]:
# Per-class mean and standard deviation of every numeric column.
# The aggregation is restricted to numeric columns explicitly: since
# pandas 2.0 `groupby(...).mean()` / `.std()` raise a TypeError when the frame
# still holds string columns (older versions silently dropped them). This also
# keeps the cell re-runnable after the string label column is added below.
numeric_cols = [c for c in df_tmp1.columns if pd.api.types.is_numeric_dtype(df_tmp1[c])]
grouped = df_tmp1.groupby(new_col)
score_all_mean = grouped[numeric_cols].mean().T
score_all_sd = grouped[numeric_cols].std().T
score_all_mean_str = score_all_mean.round(2).astype(str)
score_all_sd_str = score_all_sd.round(2).astype(str)
# "mean±sd" strings (rounded to 2 decimals): one row per variable, one column per class
score_all_str = score_all_mean_str + "±" + score_all_sd_str

# Shorten labels for plotting: "Younger" / "Older"; anything else keeps the
# excluded label.
short_label = "Model-predicted"
df_tmp1[short_label] = excluded_class
df_tmp1.loc[df_tmp1[new_col] == younger_class_name, short_label] = "Younger"
df_tmp1.loc[df_tmp1[new_col] == older_class_name, short_label] = "Older"

# Category order on the x-axis of the plots
order = ["Younger", "Older"]

In [None]:
# p-value of the Younger-vs-Older comparison, keyed by column name
tstats = {}

# Characters stripped from a column name before it is used in a file name
filename_unsafe = r'[\\/:*?"<>|\^\$\{\}\(\) ]+'

# Row masks of the two classes (the frame is not modified inside the loop)
is_younger = df_tmp1[new_col] == younger_class_name
is_older = df_tmp1[new_col] == older_class_name

for col in df_tmp1.columns:
    # Only numeric columns are plotted and tested
    if not pd.api.types.is_numeric_dtype(df_tmp1[col]):
        continue

    col_filename = re.sub(filename_unsafe, '', col)
    fig, ax = plt.subplots(figsize=(2, 3))

    # Individual samples as a jittered strip plot
    sns.stripplot(x=short_label, y=col, data=df_tmp1, jitter=0.1, size=2,
                  alpha=0.5, linewidth=.1, order=order, ax=ax)

    # Point plot drawn on top of the strip plot
    sns.pointplot(x=short_label, y=col, data=df_tmp1, join=False, capsize=0.1,
                  color='black', scale=0.5, order=order, ax=ax)

    # Set the width of every line on the axes to 1
    for drawn_line in ax.lines:
        drawn_line.set_linewidth(1)

    # Annotate the Mann-Whitney result (stars), skipped when a group has zero variance
    group_variances = df_tmp1.groupby(short_label)[col].var()
    if group_variances.ne(0).all():
        pairs = [("Younger", "Older")]
        annotator = Annotator(ax, pairs, data=df_tmp1, x=short_label, y=col, order=order)
        annotator.configure(test='Mann-Whitney', text_format='star', loc='inside',
                            comparisons_correction=None)
        annotator.apply_and_annotate()

    # Horizontal padding, then write the figure to disk and release it
    ax.margins(x=0.25)
    fig.savefig(f"{outDIR_th}/{col_filename}_vs_predAge.pdf", bbox_inches="tight")
    plt.close(fig)

    # p-value for the summary table: chi-square test on the class-by-value
    # contingency table when the column only holds 0/1, otherwise a
    # two-sided Mann-Whitney U test
    observed_values = df_tmp1[col].dropna().unique()
    if np.isin(observed_values, [0, 1]).all():
        cleaned_df = df_tmp1[[new_col, col]].dropna()
        crossed = pd.crosstab(cleaned_df[new_col], cleaned_df[col])
        chi2_result = stats.chi2_contingency(crossed.dropna())
        tstats[col] = chi2_result[1]
    else:
        younger_values = df_tmp1.loc[is_younger, col].dropna()
        older_values = df_tmp1.loc[is_older, col].dropna()
        mwu_result = stats.mannwhitneyu(younger_values, older_values, alternative="two-sided")
        tstats[col] = mwu_result[1]

# Summary table: "mean±sd" per class plus the p-value; saved, and the rows
# with p < 0.05 are printed
p_values = pd.Series(tstats, name="p_values")
df_summary = pd.concat([score_all_str, p_values], axis=1)
df_summary.to_csv(f"{outDIR_th}/score_all_str.csv")
print(df_summary[df_summary["p_values"] < 0.05])

# Comparison between all samples

In [None]:
# Output folder for the figures/tables of the all-sample comparison.
outDIR_th = outDIR_figure+ "/all"
os.makedirs(outDIR_th, exist_ok=True)

# Keep every sample whose threshold-based label is "older" or "younger",
# copy that label into `new_col`, then drop the now-redundant column.
has_threshold_class = df[new_col_2].isin([older_class_name, younger_class_name])
df_tmp1 = (
    df.loc[has_threshold_class]
    .copy()
    .assign(**{new_col: lambda frame: frame[new_col_2]})
    .drop(columns=[new_col_2])
)

In [None]:
# Per-class mean and standard deviation of every numeric column.
# The aggregation is restricted to numeric columns explicitly: since
# pandas 2.0 `groupby(...).mean()` / `.std()` raise a TypeError when the frame
# still holds string columns (older versions silently dropped them). This also
# keeps the cell re-runnable after the string label column is added below.
numeric_cols = [c for c in df_tmp1.columns if pd.api.types.is_numeric_dtype(df_tmp1[c])]
grouped = df_tmp1.groupby(new_col)
score_all_mean = grouped[numeric_cols].mean().T
score_all_sd = grouped[numeric_cols].std().T
score_all_mean_str = score_all_mean.round(2).astype(str)
score_all_sd_str = score_all_sd.round(2).astype(str)
# "mean±sd" strings (rounded to 2 decimals): one row per variable, one column per class
score_all_str = score_all_mean_str + "±" + score_all_sd_str

# Shorten labels for plotting: "Younger" / "Older"; anything else keeps the
# excluded label.
short_label = "Model-predicted"
df_tmp1[short_label] = excluded_class
df_tmp1.loc[df_tmp1[new_col] == younger_class_name, short_label] = "Younger"
df_tmp1.loc[df_tmp1[new_col] == older_class_name, short_label] = "Older"

# Category order on the x-axis of the plots
order = ["Younger", "Older"]

In [None]:
# p-value of the Younger-vs-Older comparison, keyed by column name
tstats = {}

# Characters stripped from a column name before it is used in a file name
filename_unsafe = r'[\\/:*?"<>|\^\$\{\}\(\) ]+'

# Row masks of the two classes (the frame is not modified inside the loop)
is_younger = df_tmp1[new_col] == younger_class_name
is_older = df_tmp1[new_col] == older_class_name

for col in df_tmp1.columns:
    # Only numeric columns are plotted and tested
    if not pd.api.types.is_numeric_dtype(df_tmp1[col]):
        continue

    col_filename = re.sub(filename_unsafe, '', col)
    fig, ax = plt.subplots(figsize=(2, 3))

    # Individual samples as a jittered strip plot
    sns.stripplot(x=short_label, y=col, data=df_tmp1, jitter=0.1, size=2,
                  alpha=0.5, linewidth=.1, order=order, ax=ax)

    # Point plot drawn on top of the strip plot
    sns.pointplot(x=short_label, y=col, data=df_tmp1, join=False, capsize=0.1,
                  color='black', scale=0.5, order=order, ax=ax)

    # Set the width of every line on the axes to 1
    for drawn_line in ax.lines:
        drawn_line.set_linewidth(1)

    # Annotate the Mann-Whitney result (stars), skipped when a group has zero variance
    group_variances = df_tmp1.groupby(short_label)[col].var()
    if group_variances.ne(0).all():
        pairs = [("Younger", "Older")]
        annotator = Annotator(ax, pairs, data=df_tmp1, x=short_label, y=col, order=order)
        annotator.configure(test='Mann-Whitney', text_format='star', loc='inside',
                            comparisons_correction=None)
        annotator.apply_and_annotate()

    # Horizontal padding, then write the figure to disk and release it
    ax.margins(x=0.25)
    fig.savefig(f"{outDIR_th}/{col_filename}_vs_predAge.pdf", bbox_inches="tight")
    plt.close(fig)

    # p-value for the summary table: chi-square test on the class-by-value
    # contingency table when the column only holds 0/1, otherwise a
    # two-sided Mann-Whitney U test
    observed_values = df_tmp1[col].dropna().unique()
    if np.isin(observed_values, [0, 1]).all():
        cleaned_df = df_tmp1[[new_col, col]].dropna()
        crossed = pd.crosstab(cleaned_df[new_col], cleaned_df[col])
        chi2_result = stats.chi2_contingency(crossed.dropna())
        tstats[col] = chi2_result[1]
    else:
        younger_values = df_tmp1.loc[is_younger, col].dropna()
        older_values = df_tmp1.loc[is_older, col].dropna()
        mwu_result = stats.mannwhitneyu(younger_values, older_values, alternative="two-sided")
        tstats[col] = mwu_result[1]

# Summary table: "mean±sd" per class plus the p-value; saved, and the rows
# with p < 0.05 are printed
p_values = pd.Series(tstats, name="p_values")
df_summary = pd.concat([score_all_str, p_values], axis=1)
df_summary.to_csv(f"{outDIR_th}/score_all_str.csv")
print(df_summary[df_summary["p_values"] < 0.05])