# Preprocess

In [None]:

import pandas as pd
import numpy as np
from pathlib import Path

# Raw CTG table to read; a relative path, so it resolves against the working directory.
INPUT_CSV = Path("CTG.csv") 
# Sibling path of INPUT_CSV; main() only calls .with_name() on it to place outputs next to the input.
OUT_PREFIX = INPUT_CSV.with_name("Cardiotocography") 


# Feature columns kept by main(). The order of this list is the column order
# of the feature matrices that main() writes out.
REQUIRED_FEATS = [
    "LB", "AC", "FM", "UC",
    "ASTV", "mSTV", "ALTV", "mLTV",
    "DL", "DS", "DP",
    "Width", "Min", "Max",
    "Nmax", "Nzeros",
    "Mode", "Mean", "Median", "Variance", "Tendency",
]


# Alternative spellings tried by resolve_col() when a normalized name has no
# direct match. Keys are names in the form returned by norm().
ALIASES = {
    "mstv": ["mstv", "m_stv", "m stv"],
    "mltv": ["mltv", "m_ltv", "m ltv"],
    "nmax": ["nmax", "n_max", "n max"],
    "nzeros": ["nzeros", "n_zeros", "n zeros", "nz", "nzero"],
    "astv": ["astv"],
    "altv": ["altv"], 
}

def norm(s: str) -> str:
    """Return a lookup key for a column name.

    The value is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, and has every underscore and space character removed.
    """
    key = str(s).strip().lower()
    for ch in ("_", " "):
        key = key.replace(ch, "")
    return key

def resolve_col(name: str, cols) -> str:
    """Return the label in ``cols`` that matches ``name``.

    Matching is done on the keys produced by ``norm`` (case, surrounding
    whitespace, underscores and spaces are ignored). When the normalized name
    has no direct match, the spellings listed for it in ``ALIASES`` are tried
    in order.

    Args:
        name: Requested column name.
        cols: Iterable of available column labels.

    Returns:
        The matching label exactly as it appears in ``cols``.

    Raises:
        KeyError: If neither the name nor any of its aliases is present.
    """
    target = norm(name)
    col_map = {norm(c): c for c in cols}

    # The normalized name itself is the first candidate, then its aliases.
    candidates = [target]
    candidates.extend(norm(alias) for alias in ALIASES.get(target, ()))
    for candidate in candidates:
        if candidate in col_map:
            return col_map[candidate]

    # Falling through used to return None implicitly, which only surfaced
    # later as an opaque failure when the result was used to index a frame.
    raise KeyError(
        f"No column matching {name!r} found among: {list(col_map.values())}"
    )


def main():
    """Build the standardized, binary-labelled CTG dataset and write it to disk.

    Steps:
      1. Read ``INPUT_CSV``, treating common null spellings and blank cells as
         missing.
      2. Keep the ``REQUIRED_FEATS`` columns plus the ``NSP`` label column.
      3. Binarize the label: numeric code 3 (or letter "P") becomes 1, other
         recognised values become 0, anything missing stays missing.
      4. Drop rows containing any missing value.
      5. Z-score every feature (population std; constant columns are divided
         by 1 instead of 0).
      6. Write the merged CSV, separate X / y CSVs and X / y ``.npy`` files
         next to the input file, then print their resolved paths.
    """
    df = pd.read_csv(
        INPUT_CSV,
        na_values=["", "NA", "N/A", "null", "Null", "NULL", "NaN"]
    ).replace(r"^\s*$", pd.NA, regex=True)

    feat_cols = [resolve_col(name, df.columns) for name in REQUIRED_FEATS]
    nsp_col = resolve_col("NSP", df.columns)

    df = df[feat_cols + [nsp_col]].copy()

    nsp_num = pd.to_numeric(df[nsp_col], errors="coerce")
    if nsp_num.notna().any():
        # Numeric labels. A bare `nsp_num == 3` is False for NaN, which would
        # silently relabel rows with a missing label as class 0; keep them as
        # <NA> so the dropna() below removes those rows instead.
        df[nsp_col] = (nsp_num == 3).astype("Int64").where(nsp_num.notna())
    else:
        # Letter labels; values outside the mapping become <NA>.
        nsp_str = df[nsp_col].astype(str).str.strip().str.upper()
        df[nsp_col] = nsp_str.map({"P": 1, "N": 0, "S": 0}).astype("Int64")

    df = df.dropna(axis=0).reset_index(drop=True)

    X = df[feat_cols].copy()
    y = df[nsp_col].astype(int).copy()

    mu = X.mean(axis=0)
    # Replace a zero std by 1 so constant columns do not divide by zero.
    sd = X.std(axis=0, ddof=0).replace(0, 1.0)
    X_std = (X - mu) / sd

    merged_out = OUT_PREFIX.with_name("CTG_21feats_binary_Cardiotocography.csv")
    merged_df = X_std.copy()
    merged_df[nsp_col] = y.values
    merged_df.to_csv(merged_out, index=False)

    X_csv = OUT_PREFIX.with_name("X_Cardiotocography.csv")
    y_csv = OUT_PREFIX.with_name("y_Cardiotocography.csv")
    X_npy = OUT_PREFIX.with_name("X_Cardiotocography.npy")
    y_npy = OUT_PREFIX.with_name("y_Cardiotocography.npy")

    X_std.to_csv(X_csv, index=False)
    y.to_csv(y_csv, index=False, header=[nsp_col])
    np.save(X_npy, X_std.to_numpy())
    np.save(y_npy, y.to_numpy())

    print("CSV：", merged_out.resolve())
    print("X CSV：  ", X_csv.resolve())
    print("y CSV：  ", y_csv.resolve())
    print("X NPY：  ", X_npy.resolve())
    print("y NPY：  ", y_npy.resolve())

# Run the preprocessing pipeline only when this code is executed directly.
if __name__ == "__main__":
    main()


In [None]:
import numpy as np
# Baseline: random forest on all features, evaluated with 2-fold stratified CV.
x_npy_path = "ICLR_Flow_Disentangle/X_Cardiotocography.npy"
y_npy_path = "ICLR_Flow_Disentangle/y_Cardiotocography.npy"

# Each array is loaded exactly once (y used to be read from disk twice).
X_full = np.load(x_npy_path)
y = np.load(y_npy_path)
print(X_full.shape)
print(y.shape)
D = X_full.shape[1]


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# `seed` and `n_jobs` are also read by the estimator cells further down.
seed = 42
n_jobs = 20
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_full, y), start=1):
    X_tr, X_te = X_full[tr_idx], X_full[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy: {acc:.4f}")

print(f"Mean accuracy (2-fold): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")


# Cluster

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from pathlib import Path


# Hierarchical clustering of the features on Spearman correlation, plotted as
# a dendrogram next to the re-ordered correlation heatmap.
x_npy_path = "ICLR_Flow_Disentangle/X_Cardiotocography.npy"
y_npy_path = "ICLR_Flow_Disentangle/y_Cardiotocography.npy"

X_full = np.load(x_npy_path)
y = np.load(y_npy_path)

n, D = X_full.shape
print("X shape:", X_full.shape, " y shape:", y.shape)

# Column labels must follow the order in which the preprocessing step writes
# the columns (REQUIRED_FEATS: ASTV/MSTV/ALTV/MLTV come before DL/DS/DP).
# The previous list put DL/DS/DP first, which mislabelled columns 4-10.
ctg_names = [
    "LB", "AC", "FM", "UC",
    "ASTV", "MSTV", "ALTV", "MLTV",
    "DL", "DS", "DP",
    "Width", "Min", "Max", "Nmax", "Nzeros",
    "Mode", "Mean", "Median", "Variance", "Tendency",
]
if D == 21:
    feature_names = ctg_names
else:
    # Unknown layout: fall back to positional names.
    feature_names = [f"f{i}" for i in range(D)]

X = pd.DataFrame(X_full, columns=feature_names)

corr = spearmanr(X).correlation

# Force exact symmetry and a unit diagonal, and neutralise NaN/inf entries
# (written as 0 correlation) before converting to distances.
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1.0)
corr = np.nan_to_num(corr, nan=0.0, posinf=0.0, neginf=0.0)

# Distance = 1 - |rho|; squareform converts the square matrix to the condensed
# form that hierarchy.ward expects.
distance_matrix = 1.0 - np.abs(corr)
dist_linkage = hierarchy.ward(squareform(distance_matrix))

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

dendro = hierarchy.dendrogram(
    dist_linkage,
    labels=X.columns.to_list(),
    ax=ax1,
    leaf_rotation=90,
)

ax1.set_title("Hierarchical Clustering", fontsize=17)
ax1.set_ylabel("Linkage height", fontsize=15)

# Leaf order of the dendrogram, reused to order heatmap rows and columns.
order = dendro["leaves"]

im = ax2.imshow(corr[order, :][:, order], aspect="auto", cmap="RdBu_r")

# Minor ticks on the cell borders draw a thin white grid between cells.
ax2.set_xticks(np.arange(len(order) + 1) - 0.5, minor=True)
ax2.set_yticks(np.arange(len(order) + 1) - 0.5, minor=True)
ax2.grid(which="minor", color="white", linestyle="-", linewidth=0.5)
ax2.tick_params(which="minor", bottom=False, left=False)

ax2.set_title("Spearman correlation", fontsize=17)
ax2.set_xticks(np.arange(len(order)))
ax2.set_yticks([])

ax2.set_xticklabels([X.columns[i] for i in order], rotation=90, fontsize=15)

cbar = fig.colorbar(im, ax=ax2, fraction=0.046, pad=0.04)
cbar.ax.tick_params(labelsize=12)

for ax in [ax1, ax2]:
    ax.tick_params(axis="x", labelsize=15)
    ax.tick_params(axis="y", labelsize=12)

fig.tight_layout()

out_pdf = Path("ICLR_Flow_Disentangle/CTG_spearman_hclust.pdf")
fig.savefig(out_pdf, format="pdf", bbox_inches="tight", pad_inches=0., dpi=300)

plt.show()
print(f"PDF has been saved at {out_pdf}")


# LOCO

In [None]:
from Inference.estimators_cls_cross import LOCOEstimator_cls
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from pathlib import Path


# LOCO importance estimates with a random-forest base learner.
# `seed`, `n_jobs`, `X_full` and `y` are defined by earlier cells.
estimator1 = LOCOEstimator_cls(
    regressor=RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# importance() yields one estimate and one standard error per feature.
phi_0_loco, se_0_loco = estimator1.importance(X_full, y)

phi_0_loco_test = phi_0_loco
se_0_loco_test = se_0_loco

z_score_0_loco = phi_0_loco_test / se_0_loco_test

# One-sided (upper-tail) p-value. norm.sf is the survival function; unlike
# `1 - norm.cdf(z)` it does not cancel to exactly 0.0 for large z.
p_value_0_loco = norm.sf(z_score_0_loco)
rounded_p_value_0_loco = np.round(p_value_0_loco, 3)

print(rounded_p_value_0_loco)

# Bonferroni-corrected level for 21 features.
alpha = 0.05 / 21

mask = (p_value_0_loco <= alpha).astype(int)

print(mask)

# "feature" holds the positional column index of X_full.
df_out = pd.DataFrame({
    "feature": np.arange(len(phi_0_loco)),
    "phi": phi_0_loco,
    "std_error": se_0_loco,
    "z_score": z_score_0_loco,
    "p_value": p_value_0_loco,
    "p_value_rounded": rounded_p_value_0_loco,
    "sig_bonferroni_alpha_0.05/21": mask
})

output_csv = "ICLR_Flow_Disentangle/Real_data/Cardiotocography/LOCO_0_results.csv"

Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

df_out.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"Saved LOCO_0 results to: {output_csv}")



# CPI

In [None]:
from Inference.estimators_cls_cross import CPIEstimator_cls_normal
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from pathlib import Path


# CPI importance estimates with a random-forest base learner.
# `seed`, `n_jobs`, `X_full` and `y` are defined by earlier cells.
estimator1 = CPIEstimator_cls_normal(
    regressor=RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# importance() yields one estimate and one standard error per feature.
phi_0_cpi, se_0_cpi = estimator1.importance(X_full, y)

phi_0_cpi_test = phi_0_cpi
se_0_cpi_test = se_0_cpi

z_score_0_cpi = phi_0_cpi_test / se_0_cpi_test

# One-sided (upper-tail) p-value. norm.sf is the survival function; unlike
# `1 - norm.cdf(z)` it does not cancel to exactly 0.0 for large z.
p_value_0_cpi = norm.sf(z_score_0_cpi)
rounded_p_value_0_cpi = np.round(p_value_0_cpi, 3)

print(rounded_p_value_0_cpi)

# Bonferroni-corrected level for 21 features.
alpha = 0.05 / 21

mask = (p_value_0_cpi <= alpha).astype(int)

print(mask)

# "feature" holds the positional column index of X_full.
df_out = pd.DataFrame({
    "feature": np.arange(len(phi_0_cpi)),
    "phi": phi_0_cpi,
    "std_error": se_0_cpi,
    "z_score": z_score_0_cpi,
    "p_value": p_value_0_cpi,
    "p_value_rounded": rounded_p_value_0_cpi,
    "sig_bonferroni_alpha_0.05/21": mask
})

output_csv = "ICLR_Flow_Disentangle/Real_data/Cardiotocography/CPI_0_results.csv"

Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

df_out.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"Saved CPI_0 results to: {output_csv}")




# FDFI

In [None]:
import sys
# Make the project packages importable from the repository folder.
sys.path.append('ICLR_Flow_Disentangle')

import numpy as np
import torch

# Preferred GPU index. The device string used to be a fixed "cuda:7", which
# fails on CUDA machines exposing fewer than 8 devices; fall back to the
# first GPU in that case, and to the CPU when CUDA is unavailable.
preferred_gpu = 7
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > preferred_gpu:
    device = torch.device(f"cuda:{preferred_gpu}")
else:
    device = torch.device("cuda:0")

from Flow_Matching.flow_matching import FlowMatchingModel

# `X_full` and `D` (number of columns of X_full) come from earlier cells.
model = FlowMatchingModel(
    X=X_full,
    dim=D,
    device=device,
    hidden_dim=64,
    time_embed_dim=64,
    num_blocks=1,
    use_bn=False
)
model.fit(num_steps=10000, batch_size=256, lr=1e-3, show_plot=True)

In [None]:
from Inference.estimators_cls_cross import CPI_Flow_Model_Estimator_cls_normal
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from pathlib import Path
# FDFI importance estimates: the estimator is given the flow model trained
# in the previous cell (`model`) and a random-forest base learner.
estimator8 = CPI_Flow_Model_Estimator_cls_normal(
    regressor=RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=42,
        n_jobs=20
    ),
    flow_model=model
)

# importance() yields one estimate and one standard error per feature.
phi_x_cpi, se_x_cpi = estimator8.importance(X_full, y)

phi_x_cpi_test = phi_x_cpi
se_x_cpi_test = se_x_cpi

z_score_x_cpi = phi_x_cpi_test / se_x_cpi_test

# One-sided (upper-tail) p-value. norm.sf is the survival function; unlike
# `1 - norm.cdf(z)` it does not cancel to exactly 0.0 for large z.
p_value_x_cpi = norm.sf(z_score_x_cpi)
rounded_p_value_x_cpi = np.round(p_value_x_cpi, 3)

print(rounded_p_value_x_cpi)

# Bonferroni-corrected level for 21 features.
alpha = 0.05 / 21

mask = (p_value_x_cpi <= alpha).astype(int)

print(mask)

# Export the estimator's H plot / table for the encoded data.
# NOTE(review): _encode_to_Z is a private method of the estimator — confirm
# there is no public accessor before relying on it.
Z = estimator8._encode_to_Z(X_full)
estimator8.plot_H(Z=Z, export_txt_path="H.tsv",
            savepath="H.png")

# "feature" holds the positional column index of X_full.
df_out = pd.DataFrame({
    "feature": np.arange(len(phi_x_cpi)),
    "phi": phi_x_cpi,
    "std_error": se_x_cpi,
    "z_score": z_score_x_cpi,
    "p_value": p_value_x_cpi,
    "p_value_rounded": rounded_p_value_x_cpi,
    "sig_bonferroni_alpha_0.05/21": mask
})

output_csv = "ICLR_Flow_Disentangle/Real_data/Cardiotocography/FDFI_X_results.csv"

Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

df_out.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"Saved FDFI_X results to: {output_csv}")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Re-evaluate the random forest using only the top_k features ranked by the
# FDFI estimates (phi_x_cpi) from the previous cell.
top_k = 5
D = X_full.shape[1]


# Map NaN / +-inf estimates to -inf so that argsort ranks them last.
phi_safe = np.nan_to_num(phi_x_cpi, nan=-np.inf, posinf=-np.inf, neginf=-np.inf)


top_k = min(top_k, D)
# argsort is ascending: take the last top_k positions, then reverse them so
# the list is in descending order of importance.
selected_feature_idx = np.argsort(phi_safe)[-top_k:][::-1].tolist()

print(f"Top-{top_k} feature indices by FDFI:")
for rank, j in enumerate(selected_feature_idx, start=1):
    print(f"  #{rank}: idx={j:4d}, FDFI={phi_x_cpi[j]:.6f}, SE={se_x_cpi[j]:.6f}")


# Columns of X_full restricted to the selected features.
X_subset = X_full[:, selected_feature_idx]


seed = 42

n_jobs = 20
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

# Same classifier settings and fold seed as the full-feature baseline cell.
fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_subset, y), start=1):
    X_tr, X_te = X_subset[tr_idx], X_subset[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy (selected features): {acc:.4f}")

# ddof=1: sample standard deviation across the two folds.
print(f"Mean accuracy (2-fold, selected features): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")


print("Selected indices:", selected_feature_idx)


# DFI

In [None]:
from Inference.estimators_cls_cross import  DFIEstimator_cls
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from pathlib import Path




# DFI importance estimates with a random-forest base learner.
# `seed`, `n_jobs`, `X_full` and `y` are defined by earlier cells.
estimator4 = DFIEstimator_cls(
    regressor=RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# importance() yields one estimate and one standard error per feature.
phi_x_dfi, se_x_dfi = estimator4.importance(X_full, y)

phi_x_dfi_test = phi_x_dfi
se_x_dfi_test = se_x_dfi

z_score_x_dfi = phi_x_dfi_test / se_x_dfi_test

# One-sided (upper-tail) p-value. norm.sf is the survival function; unlike
# `1 - norm.cdf(z)` it does not cancel to exactly 0.0 for large z.
p_value_x_dfi = norm.sf(z_score_x_dfi)
rounded_p_value_x_dfi = np.round(p_value_x_dfi, 3)

print(rounded_p_value_x_dfi)

# Bonferroni-corrected level for 21 features.
alpha = 0.05 / 21

mask = (p_value_x_dfi <= alpha).astype(int)

print(mask)

# "feature" holds the positional column index of X_full.
df_out = pd.DataFrame({
    "feature": np.arange(len(phi_x_dfi)),
    "phi": phi_x_dfi,
    "std_error": se_x_dfi,
    "z_score": z_score_x_dfi,
    "p_value": p_value_x_dfi,
    "p_value_rounded": rounded_p_value_x_dfi,
    "sig_bonferroni_alpha_0.05/21": mask
})

# Forward slash before the file name: the previous "...Cardiotography\DFI..."
# style separator was an invalid "\D" escape and, outside Windows, became part
# of the file name, so the CSV landed in Real_data/ instead of this folder.
output_csv = "ICLR_Flow_Disentangle/Real_data/Cardiotocography/DFI_X_results.csv"

Path(output_csv).parent.mkdir(parents=True, exist_ok=True)

df_out.to_csv(output_csv, index=False, encoding="utf-8-sig")

print(f"Saved DFI_X results to: {output_csv}")


In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


# Re-evaluate the random forest using only the top_k features ranked by the
# DFI estimates (phi_x_dfi) from the previous cell.
top_k = 1
D = X_full.shape[1]


# Map NaN / +-inf estimates to -inf so that argsort ranks them last.
phi_safe = np.nan_to_num(phi_x_dfi, nan=-np.inf, posinf=-np.inf, neginf=-np.inf)

top_k = min(top_k, D)
# argsort is ascending: take the last top_k positions, then reverse them so
# the list is in descending order of importance.
selected_feature_idx = np.argsort(phi_safe)[-top_k:][::-1].tolist()

print(f"Top-{top_k} feature indices by DFI:")
for rank, j in enumerate(selected_feature_idx, start=1):
    print(f"  #{rank}: idx={j:4d}, DFI={phi_x_dfi[j]:.6f}, SE={se_x_dfi[j]:.6f}")


# Columns of X_full restricted to the selected features.
X_subset = X_full[:, selected_feature_idx]


seed = 42

n_jobs = 20
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

# Same classifier settings and fold seed as the full-feature baseline cell.
fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_subset, y), start=1):
    X_tr, X_te = X_subset[tr_idx], X_subset[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy (selected features): {acc:.4f}")

# ddof=1: sample standard deviation across the two folds.
print(f"Mean accuracy (2-fold, selected features): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")

print("Selected indices:", selected_feature_idx)


# Ad hoc clustering + CPI

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Cluster the features on Spearman correlation and group column indices by
# cluster id.
x_npy_path = "ICLR_Flow_Disentangle/X_Cardiotocography.npy"
y_npy_path = "ICLR_Flow_Disentangle/y_Cardiotocography.npy"

X_full = np.load(x_npy_path)
y = np.load(y_npy_path)

print("X shape:", X_full.shape)
print("y shape:", y.shape)

n, D = X_full.shape

# Column labels must follow the order in which the preprocessing step writes
# the columns (REQUIRED_FEATS: ASTV/MSTV/ALTV/MLTV come before DL/DS/DP).
# The previous list put DL/DS/DP first, which mislabelled columns 4-10.
ctg_names = [
    "LB", "AC", "FM", "UC",
    "ASTV", "MSTV", "ALTV", "MLTV",
    "DL", "DS", "DP",
    "Width", "Min", "Max", "Nmax", "Nzeros",
    "Mode", "Mean", "Median", "Variance", "Tendency",
]
feature_names = ctg_names if D == 21 else [f"f{i}" for i in range(D)]
X_df = pd.DataFrame(X_full, columns=feature_names)

# Symmetrise, force a unit diagonal and neutralise NaN/inf entries before
# converting correlations to distances.
corr = spearmanr(X_df).correlation
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1.0)
corr = np.nan_to_num(corr, nan=0.0, posinf=0.0, neginf=0.0)

# Distance = 1 - |rho|, fed to Ward linkage in condensed form.
distance_matrix = 1.0 - np.abs(corr)
linkage = hierarchy.ward(squareform(distance_matrix))

# Cut the tree at linkage height t and collect the column indices per cluster.
t = 0.7
cluster_ids = hierarchy.fcluster(linkage, t, criterion="distance")
cluster_to_ids = defaultdict(list)
for idx, cid in enumerate(cluster_ids):
    cluster_to_ids[cid].append(idx)


def select_representatives(
    X: np.ndarray,
    y: np.ndarray,
    cluster_to_ids: dict,
    corr_abs: np.ndarray,
    dist: np.ndarray,
    strategy: str = "medoid",
    random_state: int = 0
):
    """Pick one representative feature index per cluster.

    Args:
        X: Unused; kept for call compatibility.
        y: Unused; kept for call compatibility.
        cluster_to_ids: Mapping from cluster id to the feature indices in
            that cluster.
        corr_abs: Unused; kept for call compatibility.
        dist: Square feature-by-feature distance matrix, indexed by the same
            feature indices as ``cluster_to_ids``.
        strategy: ``"first"`` takes the first index of each cluster;
            ``"medoid"`` takes the member with the smallest summed distance
            to the other members (ties resolve to the earliest member).
        random_state: Unused; kept for call compatibility.

    Returns:
        A list with one feature index per cluster, in the iteration order of
        ``cluster_to_ids``.

    Raises:
        ValueError: If a cluster with more than one member is reached and
            ``strategy`` is not one of the supported values. (This used to
            surface as an UnboundLocalError on ``chosen``.)
    """
    selected = []

    for ids in cluster_to_ids.values():
        members = list(ids)

        # A singleton cluster represents itself regardless of strategy.
        if len(members) == 1:
            selected.append(members[0])
            continue

        if strategy == "first":
            chosen = members[0]
        elif strategy == "medoid":
            within = dist[np.ix_(members, members)]
            chosen = members[int(np.argmin(within.sum(axis=1)))]
        else:
            raise ValueError(
                f"Unknown strategy {strategy!r}; expected 'first' or 'medoid'."
            )
        selected.append(chosen)

    return selected


# Choose one representative feature per cluster and evaluate a random forest
# restricted to those representatives.
strategy = "medoid"   
selected_feature_idx = select_representatives(
    X_full, y, cluster_to_ids,
    corr_abs=np.abs(corr),
    dist=distance_matrix,
    strategy=strategy,
    random_state=0
)
selected_feature_names = [feature_names[i] for i in selected_feature_idx]
print(f"[{strategy}] selected numbers: {len(selected_feature_idx)}")
print(" selected names:", selected_feature_names)


# Columns of X_full restricted to the cluster representatives; the next cell
# runs the CPI estimator on this matrix.
X_subset = X_full[:, selected_feature_idx]


seed = 42
n_jobs = 20
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_subset, y), start=1):
    X_tr, X_te = X_subset[tr_idx], X_subset[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy (selected features): {acc:.4f}")

# ddof=1: sample standard deviation across the two folds.
print(f"Mean accuracy (2-fold, selected features): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")


In [None]:
from Inference.estimators_cls_cross import CPIEstimator_cls_normal
from scipy.stats import norm
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from pathlib import Path


# CPI importance estimates on the cluster representatives only (X_subset).
# `seed`, `n_jobs`, `X_subset` and `y` are defined by earlier cells.
estimator1 = CPIEstimator_cls_normal(
    regressor=RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
)

# One estimate and one standard error per column of X_subset; positions
# index X_subset, not X_full.
phi_0_cpi, se_0_cpi = estimator1.importance(X_subset, y)

phi_0_cpi_test = phi_0_cpi
se_0_cpi_test = se_0_cpi

z_score_0_cpi = phi_0_cpi_test / se_0_cpi_test

# One-sided (upper-tail) p-value. norm.sf is the survival function; unlike
# `1 - norm.cdf(z)` it does not cancel to exactly 0.0 for large z.
p_value_0_cpi = norm.sf(z_score_0_cpi)
rounded_p_value_0_cpi = np.round(p_value_0_cpi, 3)

print(rounded_p_value_0_cpi)

# Bonferroni correction over the number of features actually tested. This
# was a fixed 10, which is only right when the clustering threshold happens
# to produce exactly 10 clusters.
alpha = 0.05 / len(phi_0_cpi)

mask = (p_value_0_cpi <= alpha).astype(int)

print(mask)






In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Re-evaluate the random forest using the top_k cluster representatives
# ranked by the CPI estimates from the previous cell.
top_k = 5
D = X_full.shape[1]

# phi_0_cpi was estimated on the cluster representatives, so its positions
# index `selected_feature_idx` (set by the clustering cell), NOT the columns
# of X_full. Using those positions directly on X_full picked wrong features.
rep_idx = np.asarray(selected_feature_idx)
if rep_idx.shape[0] != len(phi_0_cpi):
    raise RuntimeError(
        "selected_feature_idx does not match phi_0_cpi; "
        "re-run the clustering and CPI cells before this one."
    )

# Map NaN / +-inf estimates to -inf so that argsort ranks them last.
phi_safe = np.nan_to_num(phi_0_cpi, nan=-np.inf, posinf=-np.inf, neginf=-np.inf)

# Take the top_k positions by importance, in descending order.
top_k = min(top_k, len(phi_safe))
top_local = np.argsort(phi_safe)[-top_k:][::-1]
# Translate positions within the representative subset into X_full columns.
# Kept under a separate name so `selected_feature_idx` still holds the
# representatives and this cell can be re-run safely.
top_feature_idx = rep_idx[top_local].tolist()

print(f"Top-{top_k} feature indices by CPI:")
for rank, (j_local, j) in enumerate(zip(top_local, top_feature_idx), start=1):
    print(f"  #{rank}: idx={j:4d}, CPI={phi_0_cpi[j_local]:.6f}, SE={se_0_cpi[j_local]:.6f}")

# Separate name so the CPI cell's input (X_subset) is not overwritten.
X_top = X_full[:, top_feature_idx]

seed = 42

n_jobs = 20
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

fold_acc = []
for k, (tr_idx, te_idx) in enumerate(skf.split(X_top, y), start=1):
    X_tr, X_te = X_top[tr_idx], X_top[te_idx]
    y_tr, y_te = y[tr_idx], y[te_idx]

    clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=3,
        random_state=seed,
        n_jobs=n_jobs
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    fold_acc.append(acc)
    print(f"Fold {k} accuracy (selected features): {acc:.4f}")

# ddof=1: sample standard deviation across the two folds.
print(f"Mean accuracy (2-fold, selected features): {np.mean(fold_acc):.4f} ± {np.std(fold_acc, ddof=1):.4f}")


print("Selected indices:", top_feature_idx)


# Comparison between LOCO, CPI, DFI, and FDFI

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from io import StringIO
import numpy as np
import matplotlib.patches as mpatches
from pathlib import Path
import re


# Result tables written by the LOCO / DFI / CPI / FDFI cells. The paths are
# the same relative locations those cells write to, instead of a
# machine-specific absolute drive path.
results_dir = "ICLR_Flow_Disentangle/Real_data/Cardiotocography"
files = {
    "LOCO": f"{results_dir}/LOCO_0_results.csv",
    "DFI":  f"{results_dir}/DFI_X_results.csv",
    "CPI":  f"{results_dir}/CPI_0_results.csv",
    "FDFI": f"{results_dir}/FDFI_X_results.csv",
}
# Method order = insertion order of `files`; it fixes the bar order per group.
methods = list(files.keys())


# Bar colour per method.
color_map = {
    "LOCO": "#4C78A8",
    "CPI":  "#F58518",
    "FDFI": "#E45756",
    "DFI":  "#B279A2",
}


def normalize_col_name(s: str) -> str:
    """Return a canonical form of a column name.

    The name is stripped and lower-cased, every run of characters outside
    ``0-9a-z`` is replaced by a single underscore, and leading/trailing
    underscores are removed.
    """
    lowered = s.strip().lower()
    # Underscores are themselves outside 0-9a-z, so each run (underscores
    # included) collapses to exactly one "_" in this single pass.
    collapsed = re.sub(r"[^0-9a-z]+", "_", lowered)
    return collapsed.strip("_")

def find_column(df: pd.DataFrame, candidates_exact=None, must_contain_all=None):
    """Locate a column of ``df`` by its normalized name.

    Exact candidates are tried first, in the order given: a candidate matches
    when its normalized form equals a column's normalized name, and the first
    such column is returned. Otherwise, the first column whose normalized
    name contains every fragment in ``must_contain_all`` is returned.

    Returns:
        The original column label, or ``None`` when nothing matches.
    """
    # Unique labels in first-seen order, paired with their normalized form.
    normalized = {}
    for label in df.columns:
        normalized[label] = normalize_col_name(label)

    # First column seen for each normalized name.
    first_by_key = {}
    for label, key in normalized.items():
        first_by_key.setdefault(key, label)

    for candidate in candidates_exact or ():
        wanted = normalize_col_name(candidate)
        if wanted in first_by_key:
            return first_by_key[wanted]

    if must_contain_all:
        for label, key in normalized.items():
            if all(fragment in key for fragment in must_contain_all):
                return label

    return None

# Load each method's result table, record which feature indices get a single
# star (p < 0.05) or a double star (Bonferroni flag == 1), and merge the
# importance columns into one frame keyed by feature index.
dfs = []
single_star_by_idx = {m: set() for m in methods}
double_star_by_idx = {m: set() for m in methods}

for name, path in files.items():
    df = pd.read_csv(path)

    feat_col = find_column(
        df,
        candidates_exact=["Feature", "feature", "index"],
        must_contain_all=["feature"]
    )

    phi_col = find_column(
        df,
        candidates_exact=["phi", "Φ", "FI", "Importance"],
        must_contain_all=["phi"]
    )
    if phi_col is None:
        phi_col = find_column(df, must_contain_all=["fi"]) or find_column(df, must_contain_all=["importance"])

    # Fail with a readable message instead of an opaque df[None] KeyError.
    if feat_col is None or phi_col is None:
        raise KeyError(
            f"{path}: could not locate the feature/importance columns "
            f"among {list(df.columns)}"
        )

    # The 0.05 threshold is applied to the unrounded p-value when it exists:
    # a p-value such as 0.0496 rounds to 0.050 and would lose its star.
    p_col = find_column(
        df,
        candidates_exact=["p_value", "p_value_rounded"],
        must_contain_all=["p", "value"]
    )

    bonf_col = find_column(
        df,
        candidates_exact=["sig_bonferroni_alpha_0.05/21", "sig_bonferroni_alpha_0_05_21"],
        must_contain_all=["sig", "bonferroni"]
    )
    feat_idx_series = pd.to_numeric(df[feat_col], errors="coerce")

    if p_col is not None:
        mask_single = pd.to_numeric(df[p_col], errors="coerce") < 0.05
        idx_single = {int(i) for i in feat_idx_series[mask_single].dropna().unique()}
        single_star_by_idx[name] |= idx_single

    if bonf_col is not None:
        mask_double = pd.to_numeric(df[bonf_col], errors="coerce") == 1
        idx_double = {int(i) for i in feat_idx_series[mask_double].dropna().unique()}
        double_star_by_idx[name] |= idx_double

    sub = df[[feat_col, phi_col]].copy()
    sub = sub.rename(columns={feat_col: "Feature", phi_col: f"{name}_phi"})
    dfs.append(sub)

# Outer-join all methods on the feature index; a feature missing from one
# method's table gets importance 0 for that method.
df_all = dfs[0]
for sub in dfs[1:]:
    df_all = df_all.merge(sub, on="Feature", how="outer").fillna(0)


# Keep the numeric index; "Feature" is replaced by display names afterwards.
df_all["Feature_raw"] = df_all["Feature"]

# Display name for each positional feature index.
feature_map = dict(enumerate([
    "LB", "AC", "FM", "UC",
    "ASTV", "MSTV", "ALTV", "MLTV",
    "DL", "DS", "DP",
    "Width", "Min", "Max", "Nmax", "Nzeros",
    "Mode", "Mean", "Median", "Variance", "Tendency",
]))


def map_feature(v):
    """Return the display name for feature index ``v``.

    ``v`` is converted with ``int()``. Values that cannot be converted, or
    whose integer value is not a known index, are returned unchanged.
    """
    try:
        idx = int(v)
    except (TypeError, ValueError, OverflowError):
        # Only the errors int() raises for None / text / NaN / inf are
        # swallowed; anything else is a real bug and should propagate.
        return v
    return feature_map.get(idx, v)
# Grouped bar chart of the per-method importances, with significance stars
# drawn above the bars, saved as a PDF next to the result tables.
df_all["Feature"] = df_all["Feature"].apply(map_feature)

phi_cols = [c for c in df_all.columns if c.endswith("_phi")]
# Features are ordered along the x axis by decreasing FDFI importance.
sort_key = "FDFI_phi"


df_top = df_all.sort_values(sort_key, ascending=False).head(21).reset_index(drop=True)


n_methods = len(methods)
# Distance between the centres of neighbouring feature groups.
group_spacing = 1.6                          
x = np.arange(len(df_top)) * group_spacing   
bar_width = 0.22                              
group_width = bar_width * n_methods
# Per-method horizontal offsets that centre each group of bars on its tick.
offsets = np.linspace(-group_width/2 + bar_width/2,
                      group_width/2 - bar_width/2, n_methods)

fig, ax = plt.subplots(1, 1, figsize=(19, 5.2))

# Leave headroom above the tallest bar for the star annotations; eps keeps
# the offsets positive when every importance is zero or negative.
ymax = float(df_top[phi_cols].max().max())
eps = 1e-8
txt_off = 0.03 * max(ymax, eps)
plot_max = ymax + txt_off + 0.06 * max(ymax, eps)
# The y axis starts at 0, so negative importances are not visible.
ax.set_ylim(0, plot_max)

# Small horizontal nudge applied to the star text.
sym_xoff = 0.14 * bar_width

for i, m in enumerate(methods):
    vals = df_top[f"{m}_phi"].to_numpy()
    x_pos = x + offsets[i]

    bars = ax.bar(
        x_pos, vals, width=bar_width,
        edgecolor="black", color=color_map[m], label=m, linewidth=0.7
    )

    for j, bar in enumerate(bars):
        h = bar.get_height()
        x_mid = bar.get_x() + bar.get_width()/2.0

        # Stars are looked up by the original numeric feature index.
        raw_val = df_top.loc[j, "Feature_raw"]
        try:
            raw_idx = int(raw_val)
        except Exception:
            raw_idx = None

        # The Bonferroni flag ("**") takes precedence over p < 0.05 ("*").
        symbol = None
        if raw_idx is not None:
            if raw_idx in double_star_by_idx.get(m, set()):
                symbol = "**"
            elif raw_idx in single_star_by_idx.get(m, set()):
                symbol = "*"

        if symbol and np.isfinite(h):
            ax.text(
                x_mid + sym_xoff, h + txt_off, symbol,
                va="center", ha="center", fontsize=14.5, fontweight="bold"
            )

ax.grid(axis="y", linestyle="--", alpha=0.3)
ax.set_ylabel("Feature Importance", fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(df_top["Feature"], fontsize=20, rotation=28, ha="right")
ax.tick_params(axis="y", labelsize=20)


# Figure-level legend anchored just outside the right edge of the figure.
method_handles = [mpatches.Patch(color=color_map[m], label=m) for m in methods]
fig.legend(
    handles=method_handles,
    loc="center left",
    bbox_to_anchor=(1.005, 0.6),
    frameon=False,
    ncol=1,
    handlelength=1.5,
    handletextpad=0.8,
    labelspacing=3,
    borderaxespad=0.0,
    fontsize=20
)

fig.tight_layout()

# The PDF is written to the directory that holds the LOCO result table.
out_dir = Path(files["LOCO"]).parent
out_pdf = out_dir / "CTG_feature_importance_stars_single_panel_widebars_widegaps.pdf"
plt.savefig(out_pdf, format="pdf", bbox_inches="tight", pad_inches=0., dpi=300)
plt.show()
print(f"PDF has been saved at {out_pdf}")
