In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import os

# Folder that holds the raw CSV and receives every artefact written by this cell.
base_dir = r"F:\Code\2025\ICLR_Flow_Disentangle\Diabetes_classification"
csv_path = os.path.join(base_dir, "diabetes.csv")

df = pd.read_csv(csv_path)

# Every column except the "Outcome" label is treated as a feature.
feature_cols = [name for name in df.columns if name != "Outcome"]
X = df[feature_cols].to_numpy(dtype=float)
y = df["Outcome"].to_numpy()

# Standardise the features with statistics fitted on the whole table.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

# Persist the standardised features and the untouched labels as .npy files.
X_npy_path = os.path.join(base_dir, "X_Diabetes_std.npy")
y_npy_path = os.path.join(base_dir, "y_Diabetes.npy")
for npy_target, npy_payload in ((X_npy_path, X_std), (y_npy_path, y)):
    np.save(npy_target, npy_payload)

# Also write a CSV copy: standardised features followed by the label column.
csv_out_path = os.path.join(base_dir, "diabetes_std.csv")
df_std = pd.DataFrame(X_std, columns=feature_cols).assign(Outcome=y)
df_std.to_csv(csv_out_path, index=False)




In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

x_npy_path = "Diabetes_classification/X_Diabetes_std.npy"
y_npy_path = "Diabetes_classification/y_Diabetes.npy"

# Load each array exactly once.
X_full = np.load(x_npy_path)
y = np.load(y_npy_path)
print(X_full.shape)
print(y.shape)

# Fail fast with a readable message if the two files do not describe the same samples.
if X_full.shape[0] != y.shape[0]:
    raise ValueError(
        f"Row mismatch: X has {X_full.shape[0]} rows but y has {y.shape[0]} entries"
    )

D = X_full.shape[1]  # number of feature columns; reused by the later cells

seed = 42
n_jobs = 20

# Baseline: stratified 2-fold cross-validated accuracy of a random forest.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_full, y), start=1):
    X_tr, X_te = X_full[tr_idx], X_full[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy: {acc:.4f}")

# ddof=1 gives the sample standard deviation across the folds.
print(f"Mean accuracy (2-fold): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")


## LOCO

In [None]:
import os

from Inference.estimators_cls_cross import LOCOEstimator_cls
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier

# LOCO importance with a random-forest base learner. Relies on `np`, `seed`,
# `n_jobs`, `X_full`, `y` and `D` from the data-loading cell.
estimator1 = LOCOEstimator_cls(
    regressor=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# Importance estimates and their standard errors
# (presumably one entry per feature — confirm against LOCOEstimator_cls).
phi_0_loco, se_0_loco = estimator1.importance(X_full, y)

# Aliases kept so anything that reads the *_test names keeps working.
phi_0_loco_test = phi_0_loco
se_0_loco_test = se_0_loco

# Upper-tail z-test. norm.sf is the survival function (1 - cdf) evaluated directly,
# which avoids the cancellation that `1 - norm.cdf(z)` suffers for large z.
z_score_0_loco = phi_0_loco_test / se_0_loco_test
p_value_0_loco = norm.sf(z_score_0_loco)
rounded_p_value_0_loco = np.round(p_value_0_loco, 3)

print(rounded_p_value_0_loco)

# 0.05 split across the D features, so the threshold follows the data instead of
# a hard-coded feature count.
alpha = 0.05 / D

mask = (p_value_0_loco <= alpha).astype(int)

print(mask)

save_path = "Real_data/Diabetes/loco_results_cross.txt"
# Create the output folder on first use so open() cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    f.write("Feature\tLOCO_0 φ\tStdError\tp_value\tSignificant\n")
    for j, (phi_j, se_j, pval, sig) in enumerate(zip(phi_0_loco, se_0_loco, rounded_p_value_0_loco, mask)):
        f.write(f"{j:>3d}\t{phi_j: .4f}\t{se_j: .4f}\t{pval:.3f}\t{sig}\n")
    f.write(f"\n Sum of LOCO_0: {D * np.mean(phi_0_loco):.6f}\n")

print(f"Results have been saved at {save_path}")


## CPI

In [None]:
import os

from Inference.estimators_cls_cross import CPIEstimator_cls_normal
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier

# CPI importance with a random-forest base learner. Relies on `np`, `seed`,
# `n_jobs`, `X_full`, `y` and `D` from the data-loading cell.
estimator1 = CPIEstimator_cls_normal(
    regressor=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# Importance estimates and their standard errors
# (presumably one entry per feature — confirm against CPIEstimator_cls_normal).
phi_0_cpi, se_0_cpi = estimator1.importance(X_full, y)

# Aliases kept so anything that reads the *_test names keeps working.
phi_0_cpi_test = phi_0_cpi
se_0_cpi_test = se_0_cpi

# Upper-tail z-test. norm.sf is the survival function (1 - cdf) evaluated directly,
# which avoids the cancellation that `1 - norm.cdf(z)` suffers for large z.
z_score_0_cpi = phi_0_cpi_test / se_0_cpi_test
p_value_0_cpi = norm.sf(z_score_0_cpi)
rounded_p_value_0_cpi = np.round(p_value_0_cpi, 3)

print(rounded_p_value_0_cpi)

# 0.05 split across the D features, so the threshold follows the data instead of
# a hard-coded feature count.
alpha = 0.05 / D

mask = (p_value_0_cpi <= alpha).astype(int)

print(mask)

save_path = "Real_data/Diabetes/cpi_results_cross.txt"
# Create the output folder on first use so open() cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    f.write("Feature\tCPI_0 φ\tStdError\tp_value\tSignificant\n")
    for j, (phi_j, se_j, pval, sig) in enumerate(zip(phi_0_cpi, se_0_cpi, rounded_p_value_0_cpi, mask)):
        f.write(f"{j:>3d}\t{phi_j: .4f}\t{se_j: .4f}\t{pval:.3f}\t{sig}\n")
    f.write(f"\n Sum of CPI_0: {D * np.mean(phi_0_cpi):.6f}\n")

print(f"Results have been saved at {save_path}")



## FDFI

In [None]:
import sys

import numpy as np
import torch

# Make the project packages importable. The membership check keeps sys.path free
# of duplicate entries when this cell is re-run.
_project_root = 'ICLR_Flow_Disentangle'
if _project_root not in sys.path:
    sys.path.append(_project_root)

from Flow_Matching.flow_matching import FlowMatchingModel

# Prefer GPU 7. On machines that expose fewer GPUs fall back to GPU 0 rather than
# naming a device that does not exist; without CUDA, run on the CPU.
_preferred_gpu = 7
if torch.cuda.is_available():
    _gpu_index = _preferred_gpu if torch.cuda.device_count() > _preferred_gpu else 0
    device = torch.device(f"cuda:{_gpu_index}")
else:
    device = torch.device("cpu")

# NOTE(review): no torch seed is set in this cell, so training may differ between
# runs — confirm whether FlowMatchingModel seeds internally.
# `X_full` and `D` come from the data-loading cell.
model = FlowMatchingModel(
    X=X_full,
    dim=D,
    device=device,
    hidden_dim=64,
    time_embed_dim=64,
    num_blocks=1,
    use_bn=False
)
model.fit(num_steps=5000, batch_size=256, lr=1e-3, show_plot=True)

In [None]:
import os

from Inference.estimators_cls_cross import CPI_Flow_Model_Estimator_cls_normal
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier

# FDFI importance: random-forest base learner combined with the trained flow
# `model`. Relies on `np`, `seed`, `n_jobs`, `X_full`, `y`, `D` and `model` from
# the earlier cells.
estimator8 = CPI_Flow_Model_Estimator_cls_normal(
    regressor=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        n_jobs=n_jobs
    ),
    flow_model=model
)

# Importance estimates and their standard errors (presumably one entry per
# feature — confirm against CPI_Flow_Model_Estimator_cls_normal).
phi_x_cpi, se_x_cpi = estimator8.importance(X_full, y)

# Aliases kept so anything that reads the *_test names keeps working.
phi_x_cpi_test = phi_x_cpi
se_x_cpi_test = se_x_cpi

# Upper-tail z-test. norm.sf is the survival function (1 - cdf) evaluated directly,
# which avoids the cancellation that `1 - norm.cdf(z)` suffers for large z.
z_score_x_cpi = phi_x_cpi_test / se_x_cpi_test
p_value_x_cpi = norm.sf(z_score_x_cpi)
rounded_p_value_x_cpi = np.round(p_value_x_cpi, 3)

print(rounded_p_value_x_cpi)

# 0.05 split across the D features, so the threshold follows the data instead of
# a hard-coded feature count.
alpha = 0.05 / D

mask = (p_value_x_cpi <= alpha).astype(int)

print(mask)

save_path = "Real_data/Diabetes/fdfi_results_cross.txt"
# Create the output folder on first use so open() cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    f.write("Feature\tFDFI φ\tStdError\tp_value\tSignificant\n")
    for j, (phi_j, se_j, pval, sig) in enumerate(zip(phi_x_cpi, se_x_cpi, rounded_p_value_x_cpi, mask)):
        f.write(f"{j:>3d}\t{phi_j: .4f}\t{se_j: .4f}\t{pval:.3f}\t{sig}\n")
    f.write(f"\n Sum of FDFI: {D * np.mean(phi_x_cpi):.6f}\n")

print(f"Results have been saved at {save_path}")



## DFI

In [None]:
import os

from Inference.estimators_cls_cross import  DFIEstimator_cls
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier

# DFI importance with a random-forest base learner. Relies on `np`, `seed`,
# `n_jobs`, `X_full`, `y` and `D` from the data-loading cell.
estimator4 = DFIEstimator_cls(
    regressor=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# Importance estimates and their standard errors
# (presumably one entry per feature — confirm against DFIEstimator_cls).
phi_x_dfi, se_x_dfi = estimator4.importance(X_full, y)

# Aliases kept so anything that reads the *_test names keeps working.
phi_x_dfi_test = phi_x_dfi
se_x_dfi_test = se_x_dfi

# Upper-tail z-test. norm.sf is the survival function (1 - cdf) evaluated directly,
# which avoids the cancellation that `1 - norm.cdf(z)` suffers for large z.
z_score_x_dfi = phi_x_dfi_test / se_x_dfi_test
p_value_x_dfi = norm.sf(z_score_x_dfi)
rounded_p_value_x_dfi = np.round(p_value_x_dfi, 3)

print(rounded_p_value_x_dfi)

# 0.05 split across the D features, so the threshold follows the data instead of
# a hard-coded feature count.
alpha = 0.05 / D

mask = (p_value_x_dfi <= alpha).astype(int)

print(mask)

save_path = "Real_data/Diabetes/dfi_results_cross.txt"
# Create the output folder on first use so open() cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    f.write("Feature\tDFI φ\tStdError\tp_value\tSignificant\n")
    for j, (phi_j, se_j, pval, sig) in enumerate(zip(phi_x_dfi, se_x_dfi, rounded_p_value_x_dfi, mask)):
        f.write(f"{j:>3d}\t{phi_j: .4f}\t{se_j: .4f}\t{pval:.3f}\t{sig}\n")
    f.write(f"\n Sum of DFI: {D * np.mean(phi_x_dfi):.6f}\n")

print(f"Results have been saved at {save_path}")


## Shapley Value

In [None]:
import os

from Inference.estimators_cls_cross import ShapleyEstimator_cls
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import norm

# Shapley-value importance (n_mc=100, exact=False) with a random-forest base
# learner. Relies on `np`, `seed`, `n_jobs`, `X_full`, `y` and `D` from the
# data-loading cell.
estimator1 = ShapleyEstimator_cls(
    n_mc=100,
    exact=False,
    regressor=RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_leaf=2,
        random_state=seed,
        # Use the shared worker count like every other estimator cell; the former
        # literal 42 duplicated the seed value and requested 42 parallel jobs.
        n_jobs=n_jobs
    )
)

# Importance estimates and their standard errors
# (presumably one entry per feature — confirm against ShapleyEstimator_cls).
phi0, se0 = estimator1.importance(X_full, y)

# Upper-tail z-test. norm.sf is the survival function (1 - cdf) evaluated directly,
# which avoids the cancellation that `1 - norm.cdf(z)` suffers for large z.
z_score_0 = phi0 / se0
p_value_0 = norm.sf(z_score_0)

# 0.05 split across the D features, so the threshold follows the data instead of
# a hard-coded feature count.
alpha = 0.05 / D

mask = (p_value_0 <= alpha).astype(int)

print(mask)

save_path = "Real_data/Diabetes/shap_results_cross.txt"
# Create the output folder on first use so open() cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    f.write("Feature\tShapley Value φ\tStdError\tp_value\tSignificant\n")
    for j, (phi_j, se_j, pval, sig) in enumerate(zip(phi0, se0, p_value_0 , mask)):
        f.write(f"{j:>3d}\t{phi_j: .4f}\t{se_j: .4f}\t{pval:.3f}\t{sig}\n")
    f.write(f"\n Sum of Shapley Value: {D * np.mean(phi0):.6f}\n")

print(f"Results have been saved at {save_path}")


# Plot

In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import numpy as np
import matplotlib.patches as mpatches
from pathlib import Path


# Result tables written by the estimator cells. Insertion order matters: it fixes
# the left-to-right order of the bars inside each feature group and of the legend.
_results_dir = r"F:\Code\2025\ICLR_Flow_Disentangle\Real_data\Diabetes"
_result_stems = (
    ("LOCO", "loco"),
    ("DFI", "dfi"),
    ("CPI", "cpi"),
    ("Shapley value", "shap"),
    ("FDFI", "fdfi"),
)
files = {
    label: _results_dir + "\\" + stem + "_results_cross.txt"
    for label, stem in _result_stems
}

# Bar colour per method.
color_map = dict([
    ("LOCO", "#4C78A8"),
    ("CPI", "#F58518"),
    ("FDFI", "#E45756"),
    ("DFI", "#B279A2"),
    ("Shapley value", "#72B7B2"),
])

# Hand-maintained annotation sets keyed by method: raw feature indices that get a
# "*" marker, and those that get "**" (the plotting code checks "**" first).
single_star_by_idx = dict([
    ("LOCO", {1, 5}),
    ("DFI", {0, 1, 5, 6, 7}),
    ("CPI", {1, 5, 7}),
    ("Shapley value", {0, 2}),
    ("FDFI", {0, 1, 4, 5, 6, 7}),
])
double_star_by_idx = dict([
    ("LOCO", {1, 5}),
    ("DFI", {1, 5}),
    ("CPI", {1, 5}),
    ("Shapley value", {0, 2}),
    ("FDFI", {0, 1, 5, 6, 7}),
])


def _read_one_result(path: str, method_name: str) -> pd.DataFrame:
    """Parse one tab-separated result file into a three-column frame.

    Blank lines and summary footers are discarded, the remaining text is read as
    a tab-separated table, and the feature, importance (φ) and significance
    columns are located by name.

    Parameters
    ----------
    path : str
        UTF-8 text file holding a header row followed by tab-separated rows.
    method_name : str
        Label used to name the output columns and, as a fallback, to locate the
        importance column.

    Returns
    -------
    pd.DataFrame
        Columns ``Feature``, ``<method_name>_phi`` (numeric; entries that cannot
        be parsed become NaN) and ``<method_name>_sig`` (0/1 integers).

    Raises
    ------
    ValueError
        If the file holds no table content, or no importance column can be
        identified.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()

    cleaned = []
    for ln in raw_lines:
        s = ln.strip()
        if not s:
            continue
        sl = s.lower()
        # A footer either reads "Sum of ..." or starts with "sum" and has no tab.
        # A table row whose feature label merely begins with "sum" contains tabs
        # and must be kept.
        is_footer = sl.startswith("sum of") or (sl.startswith("sum") and "\t" not in s)
        if is_footer:
            continue
        cleaned.append(s)

    if not cleaned:
        raise ValueError(f"No table content found in {path}")

    df = pd.read_csv(StringIO("\n".join(cleaned)), sep="\t", engine="python")
    df.columns = [c.strip() for c in df.columns]

    # Feature column: a header literally named "feature" (any case), else the first column.
    feature_col = next((c for c in df.columns if c.lower() == "feature"), None)
    if feature_col is None:
        feature_col = df.columns[0]

    # Importance column, tried in decreasing order of specificity:
    # a "φ" in the header, the word "phi", the method name, or the only other column.
    cand = [c for c in df.columns if "φ" in c]
    if not cand:
        cand = [c for c in df.columns if re.search(r"\bphi\b", c, flags=re.IGNORECASE)]
    if not cand:
        key = method_name.lower()
        cand = [c for c in df.columns if key in c.lower()]
    if not cand:
        numeric_cols = [c for c in df.columns if c != feature_col]
        if len(numeric_cols) == 1:
            cand = numeric_cols
    if not cand:
        raise ValueError(f"Cannot find φ/phi column for method '{method_name}' in {path}")
    phi_col = cand[0]

    # Significance column: header "sig" or anything containing "signific";
    # when none exists, every row is flagged 0.
    sig_col = next(
        (c for c in df.columns if c.lower() == "sig" or "signific" in c.lower()),
        None,
    )
    if sig_col is None:
        df["Significant"] = 0
        sig_col = "Significant"

    out = df[[feature_col, phi_col, sig_col]].copy()
    out.columns = ["Feature", f"{method_name}_phi", f"{method_name}_sig"]

    out[f"{method_name}_phi"] = pd.to_numeric(out[f"{method_name}_phi"], errors="coerce")

    def _to01(v):
        """Map a raw significance entry (number, flag text, or missing) to 0 or 1."""
        if pd.isna(v):
            return 0
        if isinstance(v, (int, float, np.number)):
            return int(float(v) != 0.0)
        s = str(v).strip().lower()
        return 1 if s in ("1", "true", "t", "yes", "y") else 0

    out[f"{method_name}_sig"] = out[f"{method_name}_sig"].map(_to01).astype(int)
    return out

# Load one frame per method, then outer-join them all on the feature key.
dfs = [_read_one_result(path, method_name=name) for name, path in files.items()]

df_all = dfs[0]
for df_next in dfs[1:]:
    df_all = df_all.merge(df_next, on="Feature", how="outer")

# Keep the raw feature key: the star lookup below works on it, while "Feature"
# is overwritten with readable labels.
df_all["Feature_raw"] = df_all["Feature"]

feature_map = {
    0: "Pregnancies",
    1: "Glucose",
    2: "BloodPressure",
    3: "SkinThickness",
    4: "Insulin",
    5: "BMI",
    6: "DPF",
    7: "Age",
}
df_all["Feature"] = df_all["Feature"].replace(feature_map)

# Sort by the FDFI importance when available, otherwise by the mean over all methods.
phi_cols = [c for c in df_all.columns if c.endswith("_phi")]
if "FDFI_phi" in df_all.columns:
    sort_key = "FDFI_phi"
else:
    df_all["phi_mean_for_sort"] = df_all[phi_cols].mean(axis=1)
    sort_key = "phi_mean_for_sort"

df_top20 = (
    df_all
    .sort_values(sort_key, ascending=False)
    .head(20)
    .reset_index(drop=True)
)

# Grouped-bar geometry: one bar per method, centred on each feature tick.
methods = list(files.keys())
n_methods = len(methods)

x = np.arange(len(df_top20))
bar_width = 0.14
group_width = bar_width * n_methods
offsets = np.linspace(-group_width/2 + bar_width/2, group_width/2 - bar_width/2, n_methods)

fig, ax = plt.subplots(1, 1, figsize=(11, 3.5))

# Head-room above the tallest bar so the star markers fit inside the axes.
ymax = float(df_top20[phi_cols].max().max())
txt_off = 0.03 * ymax
plot_max = ymax + txt_off + 0.05 * ymax
ax.set_ylim(0, plot_max)

sym_xoff = 0.16 * bar_width

# The raw feature index of each row does not depend on the method, so parse it once.
raw_indices = []
for raw_val in df_top20["Feature_raw"]:
    try:
        raw_indices.append(int(raw_val))
    except Exception:
        raw_indices.append(None)

for m, offset in zip(methods, offsets):
    vals = df_top20.get(f"{m}_phi", pd.Series([0]*len(df_top20))).to_numpy()
    x_pos = x + offset

    bars = ax.bar(
        x_pos, vals, width=bar_width,
        edgecolor="black", color=color_map.get(m, "#999999"),
        label=m, linewidth=0.6
    )

    double_marked = double_star_by_idx.get(m, set())
    single_marked = single_star_by_idx.get(m, set())

    for bar, raw_idx in zip(bars, raw_indices):
        if raw_idx is None:
            continue

        # "**" takes precedence over "*" when an index is in both sets.
        if raw_idx in double_marked:
            symbol = "**"
        elif raw_idx in single_marked:
            symbol = "*"
        else:
            continue

        height = bar.get_height()
        if not np.isfinite(height):
            continue

        x_mid = bar.get_x() + bar.get_width()/2.0
        ax.text(
            x_mid + sym_xoff, height + txt_off, symbol,
            va="center", ha="center", fontsize=14, fontweight="bold"
        )

ax.grid(axis="y", linestyle="--", alpha=0.3)
ax.set_ylabel("Feature Importance", fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(df_top20["Feature"], fontsize=16, rotation=25, ha="right")
ax.tick_params(axis="y", labelsize=16)

# Legend placed to the right of the axes, one colour patch per method.
method_handles = [
    mpatches.Patch(color=color_map.get(m, "#999999"), label=m)
    for m in methods
]
leg = fig.legend(
    handles=method_handles,
    loc="center left",
    bbox_to_anchor=(1.0, 0.63),
    frameon=False,
    ncol=1,
    handlelength=1.4,
    handletextpad=0.6,
    labelspacing=1.4,
    borderaxespad=0.0,
    fontsize=16
)

fig.tight_layout()

# The PDF is written next to the LOCO result file.
out_dir = Path(files["LOCO"]).parent
out_pdf = out_dir / "feature_importance_stars_single_panel_sorted_FDFI_right_legend_shifted_symbols.pdf"
plt.savefig(out_pdf, format="pdf", bbox_inches='tight', pad_inches=0., dpi=300)
plt.show()
print(f"PDF has been saved at {out_pdf}")
