In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter  # ← add this import

# CONFIG
PHENOTYPE_FILE = "Phenotypic_V1_0b_preprocessed1.csv"  # phenotypic CSV
ROIS_DIR = "nyu_rois"                                  # folder with *_rois_cc200.1D files
OUTPUT_X = "cc200_fc_X_children.npy"                   # where the feature matrix is saved
OUTPUT_Y = "cc200_fc_y_children.npy"                   # where the label vector is saved

# LOAD PHENOTYPE DATA
df = pd.read_csv(PHENOTYPE_FILE)

# Keep only subjects scanned before age 18 whose diagnostic code is 1 or 2.
df = df[(df["AGE_AT_SCAN"] < 18) & df["DX_GROUP"].isin([1, 2])]

# Map FILE_ID -> binary label with ASD=1, Control=0.
# ABIDE codes DX_GROUP as 1 = Autism, 2 = Control, so the label is
# `2 - DX_GROUP`. (`DX_GROUP - 1` would give ASD=0 / Control=1, i.e. the
# opposite of the convention stated here.)
label_dict = dict(zip(df["FILE_ID"], 2 - df["DX_GROUP"]))


# Acquisition parameters treated as constants for every subject
# (recorded by the original author as the known NYU values).
TR_SEC = 2.0                   # repetition time, seconds
VOXEL_SIZES = (3.0, 3.0, 4.0)  # voxel dimensions, mm

# Per-subject QC accumulators; each one gets a value per processed subject.
subject_ids, sex_list, FIQ_list = [], [], []
num_vols, scan_durs, TR_list = [], [], []
voxelX_list, voxelY_list, voxelZ_list = [], [], []

# PROCESS FUNCTIONAL CONNECTIVITY
# X collects feature vectors, y the matching labels, and `missing` the IDs of
# subjects that could not be processed.
X, y, missing = [], [], []

# PROCESS SUBJECTS: build one connectivity vector and one QC record per subject.

# List the ROI directory once instead of once per subject. Sorting makes the
# chosen file deterministic when several names contain the same subject ID.
roi_files = sorted(f for f in os.listdir(ROIS_DIR) if f.endswith(".1D"))

# One phenotype row per FILE_ID. keep="last" mirrors dict(zip(...)), which is
# how label_dict resolves duplicated IDs, so label and QC values come from the
# same row.
pheno = df.drop_duplicates("FILE_ID", keep="last").set_index("FILE_ID")

N_ROIS = 200
triu_idx = np.triu_indices(N_ROIS, k=1)  # strictly-upper triangle, computed once

for subj_id, label in tqdm(label_dict.items()):
    matches = [f for f in roi_files if subj_id in f]
    if not matches:
        missing.append(subj_id)
        continue

    file_path = os.path.join(ROIS_DIR, matches[0])

    # Compute every per-subject value BEFORE touching any accumulator, so a
    # failure part-way through can never leave the lists with unequal lengths.
    try:
        data = np.loadtxt(file_path)  # expected shape: (T, 200)
        # np.loadtxt returns a 1-D array for single-row or empty files, so
        # check the rank before indexing shape[1].
        if data.ndim != 2 or data.shape[1] != N_ROIS:
            print(f"Skipped {subj_id}: unexpected shape {data.shape}")
            missing.append(subj_id)  # count it in the final "missing or failed" tally
            continue

        # Connectivity features: upper triangle of the ROI x ROI correlation.
        # NOTE(review): an ROI with a constant time series yields NaN
        # correlations here; confirm how downstream code handles NaN features.
        corr = np.corrcoef(data.T)  # (200, 200)
        upper = corr[triu_idx]

        # Sex (1=M, 2=F in ABIDE)
        sex_code = int(pheno.at[subj_id, "SEX"])

        # Full-Scale IQ
        # NOTE(review): ABIDE phenotypic files may use a sentinel such as -9999
        # for missing values; confirm whether FIQ needs masking before averaging.
        fiq = float(pheno.at[subj_id, "FIQ"])

        # Number of volumes
        nv = data.shape[0]
    except Exception as e:
        # Deliberately best-effort: report the subject and move on.
        print(f"Error with {file_path}: {e}")
        missing.append(subj_id)
        continue

    # Everything succeeded: append to all accumulators together.
    X.append(upper)
    y.append(label)
    subject_ids.append(subj_id)
    sex_list.append("M" if sex_code == 1 else "F")
    FIQ_list.append(fiq)
    num_vols.append(nv)
    scan_durs.append(TR_SEC * nv / 60.0)  # minutes

    # Constant TR & voxel sizes
    TR_list.append(TR_SEC)
    voxelX_list.append(VOXEL_SIZES[0])
    voxelY_list.append(VOXEL_SIZES[1])
    voxelZ_list.append(VOXEL_SIZES[2])

# STACK FEATURES AND LABELS INTO ARRAYS
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.uint8)

# QC TABLE: one row per processed subject
qc_columns = {
    "subject_id":   subject_ids,
    "label":        y,
    "sex":          sex_list,
    "FIQ":          FIQ_list,
    "TR_sec":       TR_list,
    "NumVols":      num_vols,
    "ScanDur_min":  scan_durs,
    "VoxelX_mm":    voxelX_list,
    "VoxelY_mm":    voxelY_list,
    "VoxelZ_mm":    voxelZ_list,
}
qc_df = pd.DataFrame(qc_columns)

# Human-readable group name derived from the numeric label.
group_names = {0: "Control", 1: "ASD"}
qc_df["group"] = qc_df["label"].map(group_names)

# COUNTS PER GROUP, THEN PER GROUP AND SEX
print("\n=== Sample Counts by Group ===")
group_counts = qc_df["group"].value_counts()
print(group_counts)

print("\n=== Sample Counts by Group & Sex ===")
group_by_sex = pd.crosstab(qc_df["group"], qc_df["sex"])
print(group_by_sex)

# PER-GROUP MEAN AND STD OF PHENOTYPIC / ACQUISITION COLUMNS
summary_cols = [
    "FIQ",
    "TR_sec",
    "ScanDur_min",
    "VoxelX_mm",
    "VoxelY_mm",
    "VoxelZ_mm",
]
agg_spec = {col: ["mean", "std"] for col in summary_cols}
summary = qc_df.groupby("group").agg(agg_spec)

print("\n=== Phenotypic & Acquisition Summary ===")
# DataFrame.to_markdown relies on the optional `tabulate` package.
print(summary.to_markdown())



# REPORT FINAL SHAPES, THEN WRITE BOTH ARRAYS TO DISK
print("Final Data shape:", X.shape)
print("Labels shape:   ", y.shape)
print(f"Missing or failed: {len(missing)} subjects")

# Features first, then labels.
for out_path, out_array in ((OUTPUT_X, X), (OUTPUT_Y, y)):
    np.save(out_path, out_array)
