In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import numpy as np
import os

In [None]:
# Input directory holding the result CSV files. NOTE: the next line indexes the
# third-from-last '/'-separated component of `path`; with the empty placeholder
# below, `path.split('/')` has a single element and `[-3]` raises IndexError.
path = ''
# Figures are written under results/<third-from-last path component>/plots/forestplot/.
save_path = 'results/' + path.split('/')[-3] + '/plots/forestplot/'
# exist_ok=True creates the directory only when missing, without a separate
# (race-prone) existence check; same idiom as the plotting functions in this notebook.
os.makedirs(save_path, exist_ok=True)
# Names of the entries found in the input directory.
to_plots = os.listdir(path)

In [None]:
def foreset_plot(df, title_txt="Forest Plot of Biomarkers Associated with Age"):
    """Draw a forest plot of biomarker effect sizes, save it as a PNG and show it.

    Rows are sorted by ascending "Beta"; row ``i`` is drawn at ``y = i`` as a
    circle at Beta with a horizontal error bar from "CI Lower" to "CI Upper".
    Marker colour and fill encode the "P-value" bracket listed in the legend,
    and the y tick labels read "<Biomarker> (n=<N>)".

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Beta", "CI Lower", "CI Upper", "P-value",
        "Biomarker" and "N".
    title_txt : str
        Figure title. With spaces replaced by underscores it is also used as
        the PNG file name.

    Notes
    -----
    The PNG is written into the notebook-level ``save_path`` directory, which
    must be defined before this function is called.
    """
    def get_color_and_fill(p):
        # Brackets are tested from most to least significant with `<`, so a
        # missing (NaN) p-value fails every comparison and ends up in the
        # non-significant style instead of being shown as "p < 0.0001".
        if p < 0.0001:
            return "darkred", True
        if p < 0.001:
            return "purple", True
        if p < 0.01:
            return "blue", True
        if p < 0.05:
            return "black", True
        return "black", False

    df_sorted = df.sort_values("Beta").reset_index(drop=True)
    y_pos = np.arange(len(df_sorted))

    # 0.4 inch per row, with a floor so an empty or very short table never
    # requests a zero/near-zero figure height.
    fig, ax = plt.subplots(figsize=(8, max(len(df_sorted) * 0.4, 1.0)))

    columns = (df_sorted["Beta"], df_sorted["CI Lower"],
               df_sorted["CI Upper"], df_sorted["P-value"])
    for i, (beta, ci_low, ci_up, p) in enumerate(zip(*columns)):
        color, filled = get_color_and_fill(p)
        # error bar: asymmetric distances from the point estimate to each CI bound
        ax.errorbar(beta, i, xerr=[[beta - ci_low], [ci_up - beta]], fmt="none",
                    ecolor="gray", elinewidth=1, capsize=3)
        # marker: hollow circle for the non-significant bracket
        ax.plot(beta, i, "o", markerfacecolor=color if filled else "none",
                markeredgecolor=color, markersize=6)

    # zero reference line (no effect)
    ax.axvline(x=0, color="gray", linestyle="--", linewidth=1)

    # y-tick labels including N
    labels_with_n = [f"{bio} (n={n})"
                     for bio, n in zip(df_sorted["Biomarker"], df_sorted["N"])]
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels_with_n, fontsize=10, va="center")

    # labels, title, legend
    ax.set_xlabel("Beta (Effect Size)", fontsize=12)
    ax.set_title(title_txt, fontsize=14, fontweight="bold")
    legend_handles = [
        mpatches.Patch(color="darkred", label="p < 0.0001"),
        mpatches.Patch(color="purple", label="0.0001 ≤ p < 0.001"),
        mpatches.Patch(color="blue", label="0.001 ≤ p < 0.01"),
        mpatches.Patch(color="black", label="0.01 ≤ p < 0.05"),
        mpatches.Patch(facecolor="none", edgecolor="black", label="p ≥ 0.05"),
    ]
    plt.tight_layout()
    # Single legend call: the earlier duplicate was immediately replaced by
    # this one, so only this configuration ever reached the saved figure.
    ax.legend(handles=legend_handles, title='P-value Significance', loc='lower right')

    # os.path.join yields the same path as plain concatenation when save_path
    # ends with a separator, and still works when it does not.
    fig.savefig(os.path.join(save_path, title_txt.replace(" ", "_") + ".png"), dpi=300)
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def foreset_plot2(
    df,
    title_txt="ClinicalAge Gap Association with Biomarkers",
    dataset_color="steelblue",
    dataset_name="UK Biobank",
    save_path=None
):
    """Draw a single-cohort forest plot where marker shape encodes the p-value.

    Rows are sorted by ascending "Beta"; row ``i`` is drawn at ``y = i`` with a
    horizontal error bar from "CI Lower" to "CI Upper". All markers share
    ``dataset_color``; their shape follows the "P-value" bracket listed in the
    legend. The y tick labels read "<Biomarker> (n=<N>)".

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Beta", "CI Lower", "CI Upper", "P-value",
        "Biomarker" and "N".
    title_txt : str
        Figure title; with spaces replaced by underscores it is also the PNG
        file name.
    dataset_color : str
        Matplotlib colour used for every marker and for the cohort legend patch.
    dataset_name : str
        Legend label of the cohort patch.
    save_path : str or None
        Directory for the PNG (created if missing). When falsy, the figure is
        only shown, not saved.
    """
    def shape_for_p(p):
        # Tested from most to least significant with `<`, so a missing (NaN)
        # p-value fails every comparison and gets the non-significant circle
        # rather than the "p < 0.0001" triangle.
        if p < 0.0001:
            return "v"
        if p < 0.001:
            return "D"
        if p < 0.01:
            return "^"
        if p < 0.05:
            return "s"
        return "o"

    df_sorted = df.sort_values("Beta").reset_index(drop=True)
    n_rows = len(df_sorted)
    y_pos = np.arange(n_rows)

    fig, ax = plt.subplots(figsize=(8, n_rows * 0.4 + 1))

    columns = (df_sorted["Beta"], df_sorted["CI Lower"],
               df_sorted["CI Upper"], df_sorted["P-value"])
    for i, (beta, ci_low, ci_up, pval) in enumerate(zip(*columns)):
        # error bar: asymmetric distances from the estimate to each CI bound
        ax.errorbar(beta, i, xerr=[[beta - ci_low], [ci_up - beta]],
                    fmt="none", ecolor="gray", elinewidth=1.2, capsize=4)
        # marker whose shape reflects the p-value bracket
        ax.plot(beta, i, shape_for_p(pval),
                color=dataset_color,
                markerfacecolor=dataset_color,
                markeredgecolor='black',
                markersize=7)

    # Reference line at 0 (no effect)
    ax.axvline(x=0, color="gray", linestyle="--", linewidth=1)

    # Y-axis: biomarker names with sample sizes
    labels_with_n = [
        f"{bio} (n={n})"
        for bio, n in zip(df_sorted["Biomarker"], df_sorted["N"])
    ]
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels_with_n, fontsize=10, va="center")

    # Labels, title; one unit of padding below the first and above the last row
    ax.set_xlabel("Beta (Effect Size)", fontsize=12)
    ax.set_title(title_txt, fontsize=14, fontweight="bold")
    ax.set_ylim(-1, n_rows)

    # Legend: one colour patch for the cohort, one black marker per p-value bracket
    cohort_patch = mpatches.Patch(color=dataset_color, label=dataset_name)
    sig_handles = [
        plt.Line2D([], [], marker='o', color='black', label='p ≥ 0.05', linestyle=''),
        plt.Line2D([], [], marker='s', color='black', label='0.01 ≤ p < 0.05', linestyle=''),
        plt.Line2D([], [], marker='^', color='black', label='0.001 ≤ p < 0.01', linestyle=''),
        plt.Line2D([], [], marker='D', color='black', label='0.0001 ≤ p < 0.001', linestyle=''),
        plt.Line2D([], [], marker='v', color='black', label='p < 0.0001', linestyle=''),
    ]
    ax.legend(handles=[cohort_patch] + sig_handles,
              title='Cohort (color) / p-value (shape)',
              loc='lower right', fontsize=9)

    plt.tight_layout()

    # Save if a directory was provided
    if save_path:
        import os
        os.makedirs(save_path, exist_ok=True)
        fname = title_txt.replace(" ", "_") + ".png"
        fig.savefig(os.path.join(save_path, fname), dpi=300)

    plt.show()

In [None]:
def cox_quantile_plot(df, title_txt="Forest Plot of Biomarkers Associated with Age"):
    """Plot hazard ratios per quartile label, save the figure as a PNG and show it.

    Rows are ordered by descending "Quartile Label" and each label is passed
    directly as the y coordinate of its row. Every row is a circle at "HR" with
    a horizontal error bar from "HR_lower" to "HR_upper"; marker colour and
    fill encode the bracket of "p" listed in the legend. A dashed vertical
    line marks HR = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Quartile Label", "HR", "HR_lower",
        "HR_upper" and "p".
    title_txt : str
        Shown as the title with underscores replaced by spaces; with spaces
        replaced by underscores it is the PNG file name.

    Notes
    -----
    The PNG is written into the notebook-level ``save_path`` directory, which
    must be defined before this function is called.
    """
    def get_color_and_fill(p):
        # Tested from most to least significant with `<`, so a missing (NaN)
        # p-value fails every comparison and is drawn hollow (non-significant)
        # instead of being shown as "p < 0.0001".
        if p < 0.0001:
            return "darkred", True
        if p < 0.001:
            return "purple", True
        if p < 0.01:
            return "blue", True
        if p < 0.05:
            return "black", True
        return "black", False

    # Order rows by quartile label (descending), not by effect size
    df_sorted = df.sort_values(by="Quartile Label", ascending=False)

    fig, ax = plt.subplots(figsize=(7, 9))

    rows = zip(df_sorted["HR"], df_sorted["Quartile Label"],
               df_sorted["HR_lower"], df_sorted["HR_upper"], df_sorted["p"])
    for hr, label, hr_low, hr_up, p in rows:
        color, filled = get_color_and_fill(p)

        # Error bar: asymmetric distances from the HR to each interval bound
        ax.errorbar(
            hr, label,
            xerr=[[hr - hr_low], [hr_up - hr]],
            fmt='none', ecolor='gray', elinewidth=1, capsize=3)

        # Marker (filled or hollow circle)
        ax.plot(
            hr, label,
            'o',
            markerfacecolor=color if filled else 'none',
            markeredgecolor=color,
            markersize=6)

    # Reference line at HR = 1 (no association)
    ax.axvline(x=1, color='grey', linestyle='--')

    # Labels and layout
    ax.set_xlabel("Hazard Ratio (HR)")
    ax.set_title(title_txt.replace("_", " "))
    plt.tight_layout()

    # Custom legend
    legend_handles = [
        mpatches.Patch(color='darkred', label='p < 0.0001'),
        mpatches.Patch(color='purple', label='0.0001 ≤ p < 0.001'),
        mpatches.Patch(color='blue', label='0.001 ≤ p < 0.01'),
        mpatches.Patch(color='black', label='0.01 ≤ p < 0.05'),
        mpatches.Patch(facecolor='none', edgecolor='black', label='p ≥ 0.05'),
    ]
    ax.legend(handles=legend_handles, title='P-value Significance', loc='lower right')

    # os.path.join matches plain concatenation when save_path ends with a
    # separator, and still works when it does not.
    fig.savefig(os.path.join(save_path, title_txt.replace(" ", "_") + ".png"), dpi=300)
    plt.show()

In [None]:
# Read the UKB age/biomarker correlation table from the input directory `path`.
plot_file = 'UKB_correlation_age_biomarker_norm.csv'
print(f"Loading data from {plot_file}")
df = pd.read_csv(path+plot_file)
# Biomarkers selected into `age_df`; every other biomarker goes to `id_df`.
bi = [
    'Alanine aminotransferase',
    'Albumin',
    'Aspartate aminotransferase',
    'High sensitivity C-reactive protein',
    'Creatinine',
    'Cystatin C',
    'Total bilirubin',
    'Gamma glutamyltransferase',
    'Insulin-like growth factor 1 (IGF-1)',
    'Leukocyte telomere length',
]
# Split the table into rows whose Biomarker is listed in `bi` and the rest.
age_df = df.loc[df["Biomarker"].isin(bi)]
id_df = df.loc[~df["Biomarker"].isin(bi)]

In [None]:
# Covariate-adjusted correlation table, read relative to the working directory.
# The file name is kept in `plot_file` (as in the other loading cells) instead of
# rebinding `path`: `path` is the input *directory* used by cells that read
# `path + plot_file`, and overwriting it with a file name broke re-running them.
plot_file = 'correlation_results_biomarkers_adjusted_covariates_Standard_beta.csv'
df = pd.read_csv(plot_file)
# Biomarkers selected into `age_df`; every other biomarker goes to `id_df`.
bi = ['Alanine aminotransferase', 'Albumin', 'Aspartate aminotransferase',
      'High sensitivity C-reactive protein', 'Creatinine', 'Cystatin C',
      'Total bilirubin', 'Gamma glutamyltransferase',
      'Insulin-like growth factor 1 (IGF-1)', 'Leukocyte telomere length']
in_bi = df["Biomarker"].isin(bi)
age_df = df[in_bi]
id_df = df[~in_bi]

In [None]:
# Forest plot for the biomarkers listed in `bi`. The title (also used for the
# PNG file name) had two typos: "ClinicaAge" and "Assotiated".
foreset_plot2(
    age_df,
    title_txt="UKB ClinicalAge Gap Associated with Aging Biomarkers",
    save_path=save_path,
)

In [None]:
# Forest plot for the biomarkers not listed in `bi`. The title (also used for
# the PNG file name) had two typos: "ClinicaAge" and "Assotiated".
foreset_plot2(
    id_df,
    title_txt="UKB ClinicalAge Gap Associated with Aging Functional Biomarkers",
    save_path=save_path,
)


Cox Results for Disease

In [None]:
def cox_plot(df, title_txt="Forest Plot of Biomarkers Associated with Age"):
    """Plot Cox hazard ratios per disease, save the figure as a PNG and show it.

    Rows are sorted by ascending "HR". Each disease name (underscores replaced
    by spaces) is passed directly as the y coordinate of its row, so rows that
    share a name are drawn at the same position. Every row is a circle at "HR"
    with a horizontal error bar from "HR_lower" to "HR_upper"; marker colour
    and fill encode the bracket of "p" listed in the legend. A dashed vertical
    line marks HR = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Disease", "HR", "HR_lower", "HR_upper" and
        "p". The frame itself is not modified.
    title_txt : str
        Shown as the title with underscores replaced by spaces; with spaces
        replaced by underscores it is the PNG file name.

    Notes
    -----
    The PNG is written into the notebook-level ``save_path`` directory, which
    must be defined before this function is called.
    """
    def get_color_and_fill(p):
        # Tested from most to least significant with `<`, so a missing (NaN)
        # p-value fails every comparison and is drawn hollow (non-significant)
        # instead of being shown as "p < 0.0001".
        if p < 0.0001:
            return "darkred", True
        if p < 0.001:
            return "purple", True
        if p < 0.01:
            return "blue", True
        if p < 0.05:
            return "black", True
        return "black", False

    # Sort by hazard ratio and make the disease names readable
    df_sorted = df.sort_values(by="HR")
    df_sorted = df_sorted.assign(Disease=df_sorted["Disease"].str.replace("_", " "))

    # 0.4 inch per row, with a floor so an empty or very short table never
    # requests a zero/near-zero figure height.
    fig, ax = plt.subplots(figsize=(7, max(len(df_sorted) * 0.4, 1.0)))

    rows = zip(df_sorted["HR"], df_sorted["Disease"],
               df_sorted["HR_lower"], df_sorted["HR_upper"], df_sorted["p"])
    for hr, disease, hr_low, hr_up, p in rows:
        color, filled = get_color_and_fill(p)
        # Error bar: asymmetric distances from the HR to each interval bound
        ax.errorbar(hr, disease,
                    xerr=[[hr - hr_low], [hr_up - hr]],
                    fmt='none', ecolor='gray', elinewidth=1, capsize=4)
        # Marker (filled or hollow circle)
        ax.plot(hr, disease, 'o',
                markerfacecolor=color if filled else 'none',
                markeredgecolor=color, markersize=6)

    # Reference line at HR = 1 (no association)
    ax.axvline(x=1, color='grey', linestyle='--')

    # Labels and layout
    ax.set_xlabel("Hazard Ratio (HR)")
    ax.set_title(title_txt.replace("_", " "))
    plt.tight_layout()

    # Custom legend
    legend_handles = [
        mpatches.Patch(color='darkred', label='p < 0.0001'),
        mpatches.Patch(color='purple', label='0.0001 ≤ p < 0.001'),
        mpatches.Patch(color='blue', label='0.001 ≤ p < 0.01'),
        mpatches.Patch(color='black', label='0.01 ≤ p < 0.05'),
        mpatches.Patch(facecolor='none', edgecolor='black', label='p ≥ 0.05'),
    ]
    ax.legend(handles=legend_handles, title='P-value Significance', loc='lower right')

    # os.path.join matches plain concatenation when save_path ends with a
    # separator, and still works when it does not.
    fig.savefig(os.path.join(save_path, title_txt.replace(" ", "_") + ".png"), dpi=300)

    plt.show()

In [None]:
def cox_plot2(
    df,
    title_txt="ClinicalAge Association with Biomarkers",
    dataset_color="steelblue",
    dataset_name="UK Biobank",
    save_path=None
):
    """Draw a single-cohort Cox forest plot where marker shape encodes the p-value.

    Rows are sorted by ascending "HR"; row ``i`` is drawn at ``y = i`` with a
    horizontal error bar from "HR_lower" to "HR_upper" and labelled with its
    disease name (underscores replaced by spaces). All markers share
    ``dataset_color``; their shape follows the bracket of "p" listed in the
    legend. A dashed vertical line marks HR = 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns "Disease", "HR", "HR_lower", "HR_upper" and
        "p". The frame itself is not modified.
    title_txt : str
        Figure title; with spaces replaced by underscores it is also the PNG
        file name.
    dataset_color : str
        Matplotlib colour used for every marker and for the cohort legend patch.
    dataset_name : str
        Legend label of the cohort patch.
    save_path : str or None
        Directory for the PNG (created if missing). When falsy, the figure is
        only shown, not saved.
    """
    def shape_for_p(p):
        # Tested from most to least significant with `<`, so a missing (NaN)
        # p-value fails every comparison and gets the non-significant circle
        # rather than the "p < 0.0001" triangle.
        if p < 0.0001:
            return "v"   # triangle down
        if p < 0.001:
            return "D"   # diamond
        if p < 0.01:
            return "^"   # triangle up
        if p < 0.05:
            return "s"   # square
        return "o"       # circle

    # Sort by HR and make the disease names readable
    df_sorted = df.sort_values(by="HR")
    df_sorted = df_sorted.assign(Disease=df_sorted["Disease"].str.replace("_", " "))

    # Set up plot: 0.4 inch per row plus one inch for title and axis
    N = len(df_sorted)
    fig, ax = plt.subplots(figsize=(7, N * 0.4 + 1))

    rows = zip(df_sorted["HR"], df_sorted["HR_lower"],
               df_sorted["HR_upper"], df_sorted["p"])
    for i, (hr, lo, hi, pval) in enumerate(rows):
        # Error bar: asymmetric distances from the HR to each interval bound
        ax.errorbar(hr, i,
                    xerr=[[hr - lo], [hi - hr]],
                    fmt='none', ecolor='gray', elinewidth=1.2, capsize=4)
        ax.plot(hr, i, shape_for_p(pval),
                color=dataset_color,
                markerfacecolor=dataset_color,
                markeredgecolor='black',
                markersize=7)

    # Reference line at HR = 1 (no association)
    ax.axvline(x=1, color='grey', linestyle='--')

    # Labels; one unit of padding below the first and above the last row
    ax.set_yticks(range(N))
    ax.set_yticklabels(df_sorted["Disease"], fontsize=10)
    ax.set_xlabel("Hazard Ratio (HR)", fontsize=12)
    ax.set_title(title_txt, fontsize=14, fontweight='bold')
    ax.set_ylim(-1, N)

    # Legend: one colour patch for the cohort, one black marker per p-value bracket
    sig_handles = [
        plt.Line2D([], [], marker='o', color='black', label='p ≥ 0.05', linestyle=''),
        plt.Line2D([], [], marker='s', color='black', label='0.01 ≤ p < 0.05', linestyle=''),
        plt.Line2D([], [], marker='^', color='black', label='0.001 ≤ p < 0.01', linestyle=''),
        plt.Line2D([], [], marker='D', color='black', label='0.0001 ≤ p < 0.001', linestyle=''),
        plt.Line2D([], [], marker='v', color='black', label='p < 0.0001', linestyle=''),
    ]
    dataset_patch = mpatches.Patch(color=dataset_color, label=dataset_name)
    ax.legend(handles=[dataset_patch] + sig_handles,
              title="Cohort (color) / p-value (shape)",
              loc="lower right", fontsize=9)

    plt.tight_layout()

    # Save if a directory was provided
    if save_path:
        import os
        os.makedirs(save_path, exist_ok=True)
        fname = title_txt.replace(" ", "_") + ".png"
        fig.savefig(os.path.join(save_path, fname), dpi=300)

    plt.show()

In [None]:
# Load the disease regression tables ("allcov" files) for both cohorts:
# `df` is read from the CPRD file, `df2` from the UKB file.
path = ''
plot_file = 'cprd_disease_regression_allcov.csv'
plot_file2 = 'UKB_disease_regression_allcov.csv'
print(f"Loading data from {plot_file}")
df, df2 = (pd.read_csv(path + fname) for fname in (plot_file, plot_file2))

# Disease labels treated as cancers ('Esophageal_cancer' is listed twice,
# which makes no difference to the membership tests below).
cancer_cols = ['Lung_cancer', 'Non-Hodgkin_lymphoma', 'Esophageal_cancer','Leukemia',
               'Colorectal_cancer',  'Brain_cancer', 'Breast_cancer', 'Liver_cancer',
               'Esophageal_cancer', 'Prostate_cancer', 'Ovarian_cancer', 'Pancreatic_cancer']
# The non-cancer label set is derived from `df` only and then applied to both tables.
non_c_cols = set(df["Disease"]).difference(cancer_cols)

cancer_df = df.loc[df["Disease"].isin(cancer_cols)]
non_cancer_df = df.loc[df["Disease"].isin(non_c_cols)]
cancer_df2 = df2.loc[df2["Disease"].isin(cancer_cols)]
non_cancer_df2 = df2.loc[df2["Disease"].isin(non_c_cols)]

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

def cox_plot_two(
    df1,
    df2,
    dataset1_name="Cohort 1",
    dataset2_name="Cohort 2",
    title_txt="Forest Plot Comparison",
    save_path=None
):
    """Overlay the Cox forest plots of two cohorts for the diseases they share.

    Disease names have underscores replaced by spaces. Only diseases present
    in both frames are drawn, ordered by ascending HR in ``df1``; when a
    disease occurs on several rows of a frame, its first row is used. For each
    disease, the ``df1`` point is drawn slightly above the tick and the ``df2``
    point slightly below it. Marker colour identifies the cohort and marker
    shape the bracket of "p" listed in the legend. A dashed vertical line marks
    HR = 1.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Must provide the columns "Disease", "HR", "HR_lower", "HR_upper" and
        "p". Neither frame is modified.
    dataset1_name, dataset2_name : str
        Legend labels for ``df1`` (steelblue) and ``df2`` (indianred).
    title_txt : str
        Figure title; with spaces replaced by underscores it is also the PNG
        file name.
    save_path : str or None
        Directory for the PNG (created if missing). When falsy, the figure is
        only shown, not saved.

    Notes
    -----
    The x-axis limits are derived from the "HR_lower"/"HR_upper" columns of
    *all* rows of both frames, including diseases that are not shared and
    therefore not drawn.
    """
    def shape_for_p(p):
        # Tested from most to least significant with `<`, so a missing (NaN)
        # p-value fails every comparison and gets the non-significant circle
        # rather than the "p < 0.0001" triangle.
        if p < 0.0001:
            return "v"   # triangle down
        if p < 0.001:
            return "D"   # diamond
        if p < 0.01:
            return "^"   # triangle up
        if p < 0.05:
            return "s"   # square
        return "o"       # circle

    # 1) Readable disease names on private copies: `assign` returns new frames,
    #    so the callers' DataFrames (often slices of a larger table) stay untouched.
    df1 = df1.assign(Disease=df1["Disease"].str.replace("_", " "))
    df2 = df2.assign(Disease=df2["Disease"].str.replace("_", " "))

    # One row per disease (the first occurrence), indexed by name. Built once,
    # so neither the sort key nor the plotting loops rescan the frames.
    rows1 = df1.dropna(subset=["Disease"]).drop_duplicates("Disease").set_index("Disease")
    rows2 = df2.dropna(subset=["Disease"]).drop_duplicates("Disease").set_index("Disease")

    # Shared diseases, sorted by df1's HR; ties keep df1's row order, so the
    # layout is reproducible between runs.
    common = sorted(
        (d for d in rows1.index if d in rows2.index),
        key=lambda d: rows1.at[d, "HR"],
    )
    N = len(common)
    y = np.arange(N)
    offset = 0.2  # vertical shift separating the two cohorts around each tick

    # 2) Dataset colours
    color1 = "steelblue"
    color2 = "indianred"

    # 3) x-limits across both frames, padded by 5 % on each side
    all_lo = min(df1["HR_lower"].min(), df2["HR_lower"].min())
    all_hi = max(df1["HR_upper"].max(), df2["HR_upper"].max())
    x_min, x_max = all_lo * 0.95, all_hi * 1.05

    # 4) Create figure: 0.4 inch per disease plus one inch for title and axis
    fig, ax = plt.subplots(figsize=(8, N*0.4 + 1))

    # 5) Plot cohort 1 at y+offset, then cohort 2 at y-offset
    for rows, color, shift in ((rows1, color1, offset), (rows2, color2, -offset)):
        for i, disease in enumerate(common):
            rec = rows.loc[disease]
            hr, lo, hi, p = rec["HR"], rec["HR_lower"], rec["HR_upper"], rec["p"]
            yi = y[i] + shift
            # Error bar: asymmetric distances from the HR to each interval bound
            ax.errorbar(hr, yi,
                        xerr=[[hr-lo], [hi-hr]],
                        fmt='none', ecolor='gray', capsize=4, elinewidth=1.5, zorder=1)
            ax.plot(hr, yi, shape_for_p(p),
                    color=color,
                    markerfacecolor=color,
                    markeredgecolor='black',
                    markersize=7,
                    label='_nolegend_')

    # 6) Reference line at HR = 1 (no association)
    ax.axvline(1, color='gray', linestyle='--', linewidth=1)

    # 7) Y-axis
    ax.set_yticks(y)
    ax.set_yticklabels(common, fontsize=10)
    ax.set_ylim(-1, N)

    # 8) Labels & title
    ax.set_xlabel("Hazard Ratio (HR)", fontsize=12)
    ax.set_title(title_txt, fontsize=14, fontweight="bold")
    # Empty or all-missing interval columns give NaN limits, which set_xlim
    # rejects; keep matplotlib's automatic limits in that case.
    if np.isfinite(x_min) and np.isfinite(x_max):
        ax.set_xlim(x_min, x_max)

    # 9) Legend: dataset colours + shape mapping
    cohort_handles = [
        mpatches.Patch(color=color1, label=dataset1_name),
        mpatches.Patch(color=color2, label=dataset2_name),
    ]
    sig_handles = [
        plt.Line2D([], [], marker='o', color='black', label='p ≥ 0.05', linestyle=''),
        plt.Line2D([], [], marker='s', color='black', label='0.01 ≤ p < 0.05', linestyle=''),
        plt.Line2D([], [], marker='^', color='black', label='0.001 ≤ p < 0.01', linestyle=''),
        plt.Line2D([], [], marker='D', color='black', label='0.0001 ≤ p < 0.001', linestyle=''),
        plt.Line2D([], [], marker='v', color='black', label='p < 0.0001', linestyle=''),
    ]
    ax.legend(
        handles=cohort_handles + sig_handles,
        title="Cohort (color) / p-value (shape)",
        loc="lower right",
        fontsize=9
    )

    plt.tight_layout()

    # 10) Save if a directory was provided
    if save_path:
        os.makedirs(save_path, exist_ok=True)
        fname = title_txt.replace(" ", "_") + ".png"
        fig.savefig(os.path.join(save_path, fname), dpi=300)

    plt.show()

In [None]:
# Cancer outcomes: `cancer_df` is passed as the first cohort (labelled CPRD),
# `cancer_df2` as the second (labelled UKB). The figure is only written to
# disk when `path` is a non-empty string.
cox_plot_two(
    df1=cancer_df,
    df2=cancer_df2,
    dataset1_name="CPRD",
    dataset2_name="UKB",
    title_txt="UKB and CPRD Cancer Hazard Ratio Comparison",
    save_path=path,
)

In [None]:
# Non-cancer outcomes: `non_cancer_df` is passed as the first cohort (labelled
# CPRD), `non_cancer_df2` as the second (labelled UKB). The figure is only
# written to disk when `path` is a non-empty string.
cox_plot_two(
    df1=non_cancer_df,
    df2=non_cancer_df2,
    dataset1_name="CPRD",
    dataset2_name="UKB",
    title_txt="UKB and CPRD Non-Cancer Hazard Ratio Comparison",
    save_path=path,
)

In [None]:
# Load the disease regression tables ("agesex" files) for both cohorts from
# the current `path`: `df` is read from the CPRD file, `df2` from the UKB file.
plot_file = 'cprd_disease_regression_agesex.csv'
plot_file2 = 'UKB_disease_regression_agesex.csv'
print(f"Loading data from {plot_file}")
df, df2 = (pd.read_csv(path + fname) for fname in (plot_file, plot_file2))

# Disease labels treated as cancers ('Esophageal_cancer' is listed twice,
# which makes no difference to the membership tests below).
cancer_cols = ['Lung_cancer', 'Non-Hodgkin_lymphoma', 'Esophageal_cancer','Leukemia',
               'Colorectal_cancer',  'Brain_cancer', 'Breast_cancer', 'Liver_cancer',
               'Esophageal_cancer', 'Prostate_cancer', 'Ovarian_cancer', 'Pancreatic_cancer']
# The non-cancer label set is derived from `df` only and then applied to both tables.
non_c_cols = set(df["Disease"]).difference(cancer_cols)

cancer_df = df.loc[df["Disease"].isin(cancer_cols)]
non_cancer_df = df.loc[df["Disease"].isin(non_c_cols)]
cancer_df2 = df2.loc[df2["Disease"].isin(cancer_cols)]
non_cancer_df2 = df2.loc[df2["Disease"].isin(non_c_cols)]

In [None]:
# Non-cancer outcomes: here `non_cancer_df2` is passed as the first cohort
# (labelled UKB) and `non_cancer_df` as the second (labelled CPRD), so the
# rows are ordered by the first frame's HR. The figure is only written to
# disk when `path` is a non-empty string.
cox_plot_two(
    df1=non_cancer_df2,
    df2=non_cancer_df,
    dataset1_name="UKB",
    dataset2_name="CPRD",
    title_txt="UKB and CPRD Non-Cancer Hazard Ratio Comparison Age Sex",
    save_path=path,
)

In [None]:
# Cancer outcomes: here `cancer_df2` is passed as the first cohort (labelled
# UKB) and `cancer_df` as the second (labelled CPRD), so the rows are ordered
# by the first frame's HR. The figure is only written to disk when `path` is
# a non-empty string.
cox_plot_two(
    df1=cancer_df2,
    df2=cancer_df,
    dataset1_name="UKB",
    dataset2_name="CPRD",
    title_txt="UKB and CPRD Cancer Hazard Ratio Comparison Age Sex",
    save_path=path,
)