In [None]:
import os
import pandas as pd

# Folder holding the feature CSVs; it is passed as base_dir to build_all_splits below.
# It is empty here, so os.path.join leaves the file names as-is and they resolve
# relative to the current working directory.
data_path = r""

# ============================================================
# One-stop pipeline:
# - loads train/test/external LE + RE CSVs
# - standardizes mask id column -> "mask_name"
# - checks LE vs RE outcomes (per split) and reports mismatches
# - merges LE+RE features on mask_name
# - returns finished dataframes with ["mask_name", "outcome", ...]
# - saves merged CSVs (optional)
# ============================================================

def _standardize_mask_id(df: pd.DataFrame) -> pd.DataFrame:
    """Make sure the mask/path identifier column is named 'mask_name' and drop junk index columns."""
    df = df.copy()

    # common case: double-saved index column + actual mask column
    if "Unnamed: 0.1" in df.columns and "Unnamed: 0" in df.columns:
        df = df.drop(columns=["Unnamed: 0.1"], errors="ignore")
        df = df.rename(columns={"Unnamed: 0": "mask_name"})
    elif "Unnamed: 0" in df.columns:
        df = df.rename(columns={"Unnamed: 0": "mask_name"})

    if "mask_name" not in df.columns:
        raise ValueError("Could not find mask identifier column. Expected 'Unnamed: 0' or 'mask_name'.")

    return df


def _load_features_csv(path: str) -> pd.DataFrame:
    """Load the CSV at *path* and hand it to _standardize_mask_id before returning it."""
    # low_memory=False makes pandas parse the file in one pass rather than in chunks.
    raw_table = pd.read_csv(path, low_memory=False)
    return _standardize_mask_id(raw_table)


def _dedup_on_mask(df: pd.DataFrame, file_label: str) -> pd.DataFrame:
    """Drop duplicate mask_name rows (keep first) and warn."""
    df = df.copy()
    dups = df["mask_name"].duplicated().sum()
    if dups:
        print(f"WARNING [{file_label}]: {dups} duplicate mask_name rows found; keeping first occurrence.")
        df = df.drop_duplicates(subset=["mask_name"], keep="first")
    return df


def _merge_split(le_df: pd.DataFrame, re_df: pd.DataFrame, split_name: str,
                 prefix_le="LE__", prefix_re="RE__",
                 prefer="LE", show_mismatches=10) -> pd.DataFrame:
    """
    Merge the LE and RE feature tables of one split on 'mask_name'.

    Steps:
    - Duplicate mask_name rows are dropped from each table (first occurrence kept).
    - Both 'outcome' columns are coerced to numeric; unparseable values become NaN.
    - Tables are matched with inner joins, so only masks present in both survive.
    - Outcome conflicts (both sides present but unequal) are printed, not removed.
    - One 'outcome' column is kept: the preferred side's value, falling back to
      the other side where the preferred value is missing.

    Args:
        le_df: LE table; must contain 'mask_name' and 'outcome'.
        re_df: RE table; must contain 'mask_name' and 'outcome'.
        split_name: label used in printed messages and error text.
        prefix_le: prefix added to every LE feature column.
        prefix_re: prefix added to every RE feature column.
        prefer: "LE" or "RE" (case-insensitive); which outcome wins.
        show_mismatches: maximum number of conflicting rows to print.

    Returns:
        A frame whose first two columns are 'mask_name' and 'outcome', followed
        by the prefixed LE and RE feature columns.

    Raises:
        ValueError: if *prefer* is not "LE"/"RE", or either table lacks 'outcome'.
    """
    # Reject unknown values up front; previously anything other than "LE"
    # (including typos) was silently treated as "RE".
    prefer_key = str(prefer).strip().upper()
    if prefer_key not in ("LE", "RE"):
        raise ValueError(f"[{split_name}] prefer must be 'LE' or 'RE', got {prefer!r}.")

    le_df = _dedup_on_mask(le_df, f"{split_name}/LE")
    re_df = _dedup_on_mask(re_df, f"{split_name}/RE")

    # Ensure outcome exists
    if "outcome" not in le_df.columns or "outcome" not in re_df.columns:
        raise ValueError(f"[{split_name}] Both LE and RE must contain an 'outcome' column.")

    # Coerce outcome to numeric (handles strings like "3"); assign() builds new
    # frames, so the caller's data is never mutated.
    le_df = le_df.assign(outcome=pd.to_numeric(le_df["outcome"], errors="coerce"))
    re_df = re_df.assign(outcome=pd.to_numeric(re_df["outcome"], errors="coerce"))

    # Align outcomes on mask_name and check conflicts
    meta = pd.merge(
        le_df[["mask_name", "outcome"]],
        re_df[["mask_name", "outcome"]],
        on="mask_name",
        how="inner",
        suffixes=("_le", "_re"),
    )

    both_present = meta["outcome_le"].notna() & meta["outcome_re"].notna()
    conflict = both_present & (meta["outcome_le"] != meta["outcome_re"])

    print(f"\n[{split_name}] rows LE={len(le_df)} | RE={len(re_df)} | matched={len(meta)}")
    print(f"[{split_name}] outcomes both present={both_present.sum()} | conflicts={conflict.sum()}")

    if conflict.any():
        print(f"[{split_name}] Example outcome conflicts (showing up to {show_mismatches}):")
        print(meta.loc[conflict, ["mask_name", "outcome_le", "outcome_re"]].head(show_mismatches))

    # Resolve to a single outcome: preferred side first, other side fills the gaps.
    if prefer_key == "LE":
        meta["outcome"] = meta["outcome_le"].combine_first(meta["outcome_re"])
    else:
        meta["outcome"] = meta["outcome_re"].combine_first(meta["outcome_le"])
    meta = meta[["mask_name", "outcome"]]

    # Prefix features (exclude mask_name/outcome) so LE and RE columns stay distinguishable.
    le_feat = le_df.drop(columns=["mask_name", "outcome"]).add_prefix(prefix_le)
    re_feat = re_df.drop(columns=["mask_name", "outcome"]).add_prefix(prefix_re)

    # The feature frames share their source's index, so column-wise concat lines up row by row.
    le_pref = pd.concat([le_df[["mask_name"]], le_feat], axis=1)
    re_pref = pd.concat([re_df[["mask_name"]], re_feat], axis=1)

    # Merge meta + features
    merged = (
        meta.merge(le_pref, on="mask_name", how="inner")
            .merge(re_pref, on="mask_name", how="inner")
    )

    # Ensure first two columns are mask_name, outcome
    lead = ["mask_name", "outcome"]
    merged = merged[lead + [c for c in merged.columns if c not in lead]]

    return merged


def build_all_splits(
    base_dir: str,
    *,
    train_re="train_features_true_mask.csv",
    train_le="train_features_true_mask_low_energy.csv",
    test_re="test_features_true_mask.csv",
    test_le="test_features_true_mask_low_energy.csv",
    external_re="external_features_true_mask.csv",
    external_le="external_features_true_mask_low_energy_v2.csv",  # set to v2 by default
    prefix_le="LE__",
    prefix_re="RE__",
    prefer_outcome="LE",
    save_merged=True,
    out_suffix="_merged_LE_RE.csv",
):
    """
    Load, merge and optionally save the train, test and external splits.

    Every file name is joined onto *base_dir*. All six CSVs are read first,
    then each split's LE/RE pair is merged with _merge_split. When
    *save_merged* is true, each result is written to
    "<base_dir>/<train|test|external><out_suffix>" without the index.

    Returns: (train_df, test_df, external_df) with mask_name + outcome first.
    """
    # (output stem, label used in messages, LE file name, RE file name)
    split_table = (
        ("train", "TRAIN", train_le, train_re),
        ("test", "TEST", test_le, test_re),
        ("external", "EXTERNAL", external_le, external_re),
    )

    # Read everything up front, LE before RE within each split.
    loaded = [
        (
            _load_features_csv(os.path.join(base_dir, le_name)),
            _load_features_csv(os.path.join(base_dir, re_name)),
        )
        for _, _, le_name, re_name in split_table
    ]

    # Merge each split in train -> test -> external order.
    merged = [
        _merge_split(le_frame, re_frame, label,
                     prefix_le=prefix_le, prefix_re=prefix_re, prefer=prefer_outcome)
        for (_, label, _, _), (le_frame, re_frame) in zip(split_table, loaded)
    ]

    if save_merged:
        destinations = [os.path.join(base_dir, stem + out_suffix) for stem, *_ in split_table]

        for frame, destination in zip(merged, destinations):
            frame.to_csv(destination, index=False)

        print("\nSaved merged files:")
        for destination in destinations:
            print(" -", destination)

    train_df, test_df, ext_df = merged
    return train_df, test_df, ext_df


# =========================
# USAGE (run this)
# =========================
# base_dir should be your folder that contains the 6 CSVs (and v2 LE external if present)
# You already have data_path set earlier; use it here:
# Build the three merged tables from the CSVs under data_path and write the
# merged CSVs back into the same folder.
train_df, test_df, external_df = build_all_splits(
    data_path,
    save_merged=True,
    external_le="external_features_true_mask_low_energy_v2.csv",  # change if your v2 file name differs
)

# Now you have finished dataframes in memory:
# train_df, test_df, external_df


In [None]:
# Exact element-wise comparison of the LE and RE major-axis length columns.
# NaN never compares equal, so a missing value on either side yields False.
train_df["LE__original_shape2D_MajorAxisLength"].eq(
    train_df["RE__original_shape2D_MajorAxisLength"]
).all()


In [None]:
# Row-wise LE minus RE for the major-axis length, then the largest absolute gap.
diff = train_df["LE__original_shape2D_MajorAxisLength"].sub(
    train_df["RE__original_shape2D_MajorAxisLength"]
)
diff.abs().max()


In [None]:
import numpy as np

# Compare every feature that exists under both prefixes and count how many
# carry the same values in the LE and RE columns of train_df.
le_cols = [c for c in train_df.columns if c.startswith("LE__")]
re_cols = [c for c in train_df.columns if c.startswith("RE__")]

# Slice off only the leading prefix; str.replace would also remove the same
# text if it appeared later in a feature name.
le_map = {c[len("LE__"):]: c for c in le_cols}
re_map = {c[len("RE__"):]: c for c in re_cols}

common = sorted(set(le_map) & set(re_map))

identical = []
different = []

for feat in common:
    a = train_df[le_map[feat]]
    b = train_df[re_map[feat]]

    both_numeric = all(
        pd.api.types.is_numeric_dtype(s) and not pd.api.types.is_bool_dtype(s)
        for s in (a, b)
    )
    if both_numeric:
        # Tolerant float comparison; NaNs in the same position count as equal.
        same = np.allclose(a, b, rtol=1e-6, atol=1e-8, equal_nan=True)
    else:
        # np.allclose cannot handle text/object columns, so compare exactly and
        # treat positions where both sides are missing as equal.
        same = bool(((a == b) | (a.isna() & b.isna())).all())

    if same:
        identical.append(feat)
    else:
        different.append(feat)

print(f"Identical features: {len(identical)}")
print(f"Different features: {len(different)}")
