# NB1 — Setup + Dataset Registry

Quality control, dataset curation, environment logging, and generation of Supplementary Table S1.

**Paper:** Zafar SA, Qin W. *Thymus-Derived Myeloid Education Signatures Predict Microglial Tolerance Positioning and Are Modulated by Glucocorticoid Stress-Axis Activity.* Neuroimmunomodulation (2026).

> **Note:** Update the path variables in section 1 (Paths + folders) to match your local directory structure before running. Raw data can be obtained from the public repositories listed in Supplementary Table S1.


In [None]:
import os, re, json, warnings, math, platform, sys
from pathlib import Path
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Suppress every Python warning for the rest of the session. This is a global
# filter, so warnings raised by third-party libraries are hidden as well.
warnings.filterwarnings("ignore")

# 0) Figure style

# Matplotlib defaults applied to every figure created after this point.
plt.rcParams.update({
    "font.family": "Arial",
    "font.size": 8,
    "axes.labelsize": 8,
    "axes.titlesize": 9,
    "xtick.labelsize": 7,
    "ytick.labelsize": 7,
    "legend.fontsize": 7,
    "figure.titlesize": 9,
    "axes.linewidth": 0.8,
})
# Resolution (dots per inch) that save_fig passes to fig.savefig.
FIG_DPI = 1200


# 1) Paths + folders 

BASE_DIR = Path(".")  # <-- SET TO YOUR PROJECT ROOT
RAW_DIR  = BASE_DIR / "Raw Data"
PROC_DIR = BASE_DIR / "Process Data"

# Validate the project root BEFORE creating any folders, so that a wrong
# BASE_DIR does not leave a stray output tree behind. An explicit raise is used
# instead of `assert` because asserts are stripped when Python runs with -O.
if not RAW_DIR.exists():
    raise FileNotFoundError(f"RAW_DIR not found: {RAW_DIR}")

# All manuscript outputs (figures and tables) live under one folder.
MANUSCRIPT_DIR = BASE_DIR / "outputs" / "manuscript"
FIG_DIR = MANUSCRIPT_DIR / "Figures"
TAB_DIR = MANUSCRIPT_DIR / "Tables"

# Create the per-analysis processing folders and the output folders up front;
# exist_ok makes re-running the cell harmless.
for d in [
    PROC_DIR / "aim1_thymus",
    PROC_DIR / "aim2_microglia",
    PROC_DIR / "aim3_stress",
    PROC_DIR / "spatial_validation",
    PROC_DIR / "negctrl",
    PROC_DIR / "innate_memory",
    FIG_DIR,
    TAB_DIR,
]:
    d.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("RAW_DIR :", RAW_DIR)
print("PROC_DIR:", PROC_DIR)
print("MANUSCRIPT_DIR:", MANUSCRIPT_DIR)
print("FIG_DIR :", FIG_DIR)
print("TAB_DIR :", TAB_DIR)


# 2) Save helpers

def save_fig(fig, fname: str, kind: str = "Supplementary"):
    """Write *fig* as a PNG into FIG_DIR and close it.

    The file is named ``<kind>_<fname>.png``; *kind* must be ``"Main"`` or
    ``"Supplementary"``. The figure is rendered at FIG_DPI with a tight
    bounding box, then closed so it no longer holds memory.
    """
    assert kind in ("Main", "Supplementary")
    target = FIG_DIR.joinpath(f"{kind}_{fname}.png")
    fig.savefig(target, bbox_inches="tight", dpi=FIG_DPI)
    plt.close(fig)
    print(f"[SAVED FIG] {target}")

def save_table_xlsx(df: pd.DataFrame, fname: str, kind: str = "Supplementary",
                    sheet_name: str = "Sheet1"):
    """Write *df* to ``<kind>_<fname>.xlsx`` inside TAB_DIR.

    *kind* must be ``"Main"`` or ``"Supplementary"``. The index is not written.
    *sheet_name* is cut to its first 31 characters before being used.
    """
    assert kind in ("Main", "Supplementary")
    target = TAB_DIR.joinpath(f"{kind}_{fname}.xlsx")
    truncated_sheet = sheet_name[:31]
    with pd.ExcelWriter(target, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name=truncated_sheet, index=False)
    print(f"[SAVED TABLE] {target}")


# 3) Environment logging

def capture_environment(packages=None) -> pd.DataFrame:
    """Snapshot the interpreter, platform and key package versions.

    Parameters
    ----------
    packages : iterable of str, optional
        Importable module names to report. Defaults to the analysis stack used
        by this project.

    Returns
    -------
    pandas.DataFrame
        Columns ``category`` ("system" or "package"), ``key`` and ``value``.
        A package that cannot be imported is reported as ``"NOT INSTALLED"``;
        one whose import raises any other error is reported as
        ``"IMPORT FAILED (<ErrorType>)"`` instead of aborting the notebook.
    """
    import importlib

    if packages is None:
        packages = [
            "numpy", "pandas", "scipy", "scanpy", "anndata", "matplotlib",
            "seaborn", "sklearn", "scrublet", "openpyxl", "h5py",
        ]

    rows = [
        {"category": "system", "key": "python_version", "value": sys.version},
        {"category": "system", "key": "platform", "value": platform.platform()},
        {"category": "system", "key": "architecture", "value": platform.machine()},
        {"category": "system", "key": "timestamp", "value": datetime.now().isoformat()},
    ]
    for pkg in packages:
        try:
            mod = importlib.import_module(pkg)
        except ImportError:
            ver = "NOT INSTALLED"
        except Exception as exc:
            # A broken install can raise something other than ImportError
            # (e.g. a binary incompatibility); record it and keep going.
            ver = f"IMPORT FAILED ({type(exc).__name__})"
        else:
            # str() guards against packages whose __version__ is not a string,
            # which would otherwise end up as an arbitrary object in the table.
            ver = str(getattr(mod, "__version__", "installed (version unknown)"))
        rows.append({"category": "package", "key": pkg, "value": ver})

    return pd.DataFrame(rows, columns=["category", "key", "value"])

# Capture the environment once; df_env is also written to the "Environment"
# sheet of Table S1 further down.
df_env = capture_environment()
print("\n[ENVIRONMENT]")
for env_key, env_value in zip(df_env["key"], df_env["value"]):
    print(f"  {env_key:15s} {env_value}")


# 4) Dataset registry

# Registry of every raw dataset, keyed by a short dataset id.
# Each entry holds:
#   "modality" - free-text description of the on-disk format
#   "notes"    - free-text usage note (copied into the "Notes" column of Table S1)
#   one or more location keys whose names start with "path"
#   ("path", "path_counts", "path_cov", "path_anno", "path_sn", "path_vis").
# build_file_manifest picks up ANY key starting with "path"; build_table_s1 and
# the integrity checks iterate over the fixed list of six names above, so a new
# location key must be added there too.
REG = {
    "TS_Thymus_filtered": {
        "modality": "sc/snRNA (AnnData)",
        "path": RAW_DIR / "Thymus" / "TS_Thymus_filtered.h5ad",
        "notes": "Aim1 discovery thymus anchor (MES training).",
    },
    "TabulaSapiens_full": {
        "modality": "multi-tissue (AnnData)",
        "path": RAW_DIR / "Thymus" / "GSE201333" / "GSM6058681_TabulaSapiens.h5ad",
        "notes": "Huge backup reference; avoid loading unless necessary.",
    },
    "GSE144870_Lavaert": {
        "modality": "10x (flat mtx/tsv)",
        "path": RAW_DIR / "Thymus" / "GSE144870",
        "notes": "Flat GSM*_matrix.mtx + *_genes/features.tsv + *_barcodes.tsv.",
    },
    "GSE139042_Le": {
        "modality": "dense tables (txt)",
        "path": RAW_DIR / "Thymus" / "GSE139042",
        "notes": "Large thymus counts tables; load carefully.",
    },
    "GSE133341_Zeng": {
        "modality": "tables (txt) mixed tissues",
        "path": RAW_DIR / "Thymus" / "GSE133341",
        "notes": "Contains thymus + non-thymus; filter thymus later.",
    },
    "TS_Blood_NegCtrl": {
        "modality": "scRNA (AnnData)",
        "path": RAW_DIR / "Peripheral_Myeloid_NegCtrl" / "Tabula_Sapiens_Blood.h5ad",
        "notes": "MANDATORY NEGCTRL. Use only for specificity tests. NEVER for MES training.",
    },
    "SEAAD_microglia": {
        "modality": "snRNA (AnnData)",
        "path": RAW_DIR / "Brain" / "SEA-AD (AllenBrainMap)" / "SEA-AD_Microglia-and-Immune_multi-regional_final-nuclei_AAIC-pre-release.2025-07-24.h5ad",
        "notes": "Aim2 discovery microglia anchor.",
    },
    "SEAAD_MTG_taxonomy": {
        "modality": "snRNA (AnnData) huge",
        "path": RAW_DIR / "Brain" / "SEA-AD MTG Whole Taxonomy (CELLxGENE Census artifact; AnnData).h5ad",
        "notes": "Very large. Optional reference only.",
    },
    "Olah_live_microglia": {
        "modality": "scRNA (AnnData)",
        "path": RAW_DIR / "Brain" / "Olah et al. live human microglia (CELLxGENE Census artifact; AnnData).h5ad",
        "notes": "External replication (living human microglia).",
    },
    "HuMicA": {
        "modality": "Seurat (RDS)",
        "path": RAW_DIR / "Brain" / "HuMicA.rds",
        "notes": "Atlas in Seurat; optional conversion later.",
    },
    "GSE204702_Tuddenham": {
        "modality": "10x HDF5 (filtered_feature_bc_matrix.h5)",
        "path": RAW_DIR / "Cross-disease living human microglia (Tuddenham  De Jager lab)",
        "notes": "External replication across diseases; GSM*_filtered_feature_bc_matrix.h5 files.",
    },
    # Multi-file datasets use suffixed location keys instead of a single "path".
    "GSE138852_AD_snRNA": {
        "modality": "counts+covariates (csv)",
        "path_counts": RAW_DIR / "GSE138852" / "GSE138852_counts.csv",
        "path_cov": RAW_DIR / "GSE138852" / "GSE138852_covariates.csv",
        "notes": "AD snRNA validation dataset (counts huge; covariates small).",
    },
    "GSE180759_MS_lesion": {
        "modality": "counts+annotation (csv/txt)",
        "path_counts": RAW_DIR / "MS lesion snRNA" / "Absinta et al. progressive MS lesion edge snRNA (GEO GSE180759)" / "GSE180759_expression_matrix.csv",
        "path_anno": RAW_DIR / "MS lesion snRNA" / "Absinta et al. progressive MS lesion edge snRNA (GEO GSE180759)" / "GSE180759_annotation.txt",
        "notes": "Supplementary-only generalization; not load-bearing.",
    },
    "GSE135618_sc_vs_sn": {
        "modality": "10x (flat mtx/tsv)",
        "path": RAW_DIR / "scRNA vs snRNA comparison",
        "notes": "Technical supplement/QC; do not make load-bearing.",
    },
    "GSE220442_Visium": {
        "modality": "Visium 10x (mtx/tsv + spatial)",
        "path": RAW_DIR / "Spatial transcriptomics" / "GSE220442" / "counts_and_images",
        "notes": "Spatial validation (supp or 1 main figure).",
    },
    "GSE233208_paired": {
        "modality": "Seurat (RDS)",
        "path_sn": RAW_DIR / "GSE233208" / "GSE233208_Human_snRNA-Seq_ADDS_integrated.rds",
        "path_vis": RAW_DIR / "GSE233208" / "GSE233208_Human_visium_ADDS_seurat_processed.rds",
        "notes": "Cross-modal validation.",
    },
    "GSE219208_stress": {
        "modality": "counts table (csv)",
        "path": RAW_DIR / "GSE219208" / "GSE219208_Non-Normalized_read_counts_combined_lanes_.csv",
        "notes": "Aim3 glucocorticoid perturbation calibration (human microglia).",
    },
    "GSE229940_innate_memory": {
        "modality": "bulk microglia RNA-seq (processed count matrix)",
        "path_counts": RAW_DIR / "Innate_Immune_Memory" / "GSE229940" / "GSE229940_countmtx.csv",
        "notes": "Innate immune memory/priming anchor (supplementary).",
    },
    # Unlike the other entries, this file sits directly under RAW_DIR.
    "GSE184241_innate_memory_LPS": {
        "modality": "Expression matrix (bulk/pooled counts)",
        "path_counts": RAW_DIR / "GSE184241_combined_raw_counts.txt.gz",
        "notes": "Innate memory stimulation (LPS vs RPMI) – MES module profiles. "
                 "Condition encoded in sample IDs: _LPS_ / _RPMI_. "
                 "See NB4 for reader (read_gse184241_counts).",
    },
}


# 5) Dataset metadata (expanded: citation, access date, data availability)

# Descriptive metadata for each dataset, keyed by the same ids as REG.
# build_table_s1 copies these fields verbatim into Supplementary Table S1;
# a dataset missing from this dict gets blank metadata columns there.
# NOTE(review): citation strings are hand-entered and several are placeholders
# ("See GEO entry", "See original publication") - confirm each against the
# source record before publication.
DATASET_META = {
    "TS_Thymus_filtered": dict(
        dataset_name="Tabula Sapiens (Thymus subset; filtered AnnData)",
        accession_or_source="Tabula Sapiens",
        tissue="Thymus", disease_or_condition="Normal",
        assay="sc/snRNA-seq (processed AnnData)", organism="Homo sapiens",
        role="Aim1 Discovery (MES training)", aims="Aim1",
        url="https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
        key_annotations="cell_type; donor/subject; age/sex; assay",
        citation="Tabula Sapiens Consortium et al., Science 2022",
        access_date="2025", data_availability="Publicly available via CELLxGENE",
    ),
    "TS_Blood_NegCtrl": dict(
        dataset_name="Tabula Sapiens — Blood (peripheral myeloid negative control)",
        accession_or_source="Tabula Sapiens",
        tissue="Blood (peripheral)", disease_or_condition="Normal",
        assay="scRNA-seq (AnnData)", organism="Homo sapiens",
        role="NEGCTRL (monocytes/macrophages only)", aims="Specificity control",
        url="https://cellxgene.cziscience.com/collections/e5f58829-1a66-40b5-a624-9046778e74f5",
        key_annotations="cell_type; donor/subject; age/sex",
        citation="Tabula Sapiens Consortium et al., Science 2022",
        access_date="2025", data_availability="Publicly available via CELLxGENE",
    ),
    "SEAAD_microglia": dict(
        dataset_name="SEA-AD microglia + immune (multi-regional snRNA)",
        accession_or_source="SEA-AD",
        tissue="Human brain (multi-regional)", disease_or_condition="AD spectrum / control",
        assay="snRNA-seq (AnnData)", organism="Homo sapiens",
        role="Aim2 Discovery", aims="Aim2",
        url="https://cellxgene.cziscience.com/collections/1ca90a2d-2943-483d-b678-b809bf464c30",
        key_annotations="cell_type; donor; brain region; diagnosis; age/sex",
        citation="Gabitto et al., bioRxiv 2023 (SEA-AD)",
        access_date="2025", data_availability="Publicly available via CELLxGENE",
    ),
    "Olah_live_microglia": dict(
        dataset_name="Living human microglia (Olah et al.; scRNA)",
        accession_or_source="CELLxGENE / Olah et al.",
        tissue="Human brain (living microglia)", disease_or_condition="Varies",
        assay="scRNA-seq (AnnData)", organism="Homo sapiens",
        role="Aim2 External Replication", aims="Aim2",
        url="https://cellxgene.cziscience.com/",
        key_annotations="cell_type; donor; region",
        citation="Olah et al., Nat Neurosci 2020",
        access_date="2025", data_availability="Publicly available via CELLxGENE",
    ),
    "GSE204702_Tuddenham": dict(
        dataset_name="Cross-disease living human microglia (Tuddenham / De Jager)",
        accession_or_source="GSE204702",
        tissue="Human brain (live microglia)", disease_or_condition="Multiple diseases + controls",
        assay="scRNA-seq (10x HDF5)", organism="Homo sapiens",
        role="Aim2 External Replication", aims="Aim2",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE204702",
        key_annotations="donor; disease; region; cell_type",
        citation="Tuddenham et al., 2024",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE233208_paired": dict(
        dataset_name="AD/DS paired snRNA + Visium",
        accession_or_source="GSE233208",
        tissue="Human brain", disease_or_condition="AD / DS",
        assay="snRNA-seq + Visium (Seurat)", organism="Homo sapiens",
        role="Spatial/Cross-modal Validation", aims="Aim2 + Spatial",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE233208",
        key_annotations="diagnosis; region; matched samples",
        citation="See GEO entry",
        access_date="2025", data_availability="GEO public",
    ),
    # NOTE(review): GSE138852 may correspond to Grubman et al., Nat Neurosci 2019
    # rather than Mathys et al. - confirm the citation against the GEO record.
    "GSE138852_AD_snRNA": dict(
        dataset_name="AD snRNA-seq atlas (validation)",
        accession_or_source="GSE138852",
        tissue="Human brain", disease_or_condition="AD vs control",
        assay="snRNA-seq (CSV)", organism="Homo sapiens",
        role="Supplementary Validation", aims="Aim2",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE138852",
        key_annotations="diagnosis; age/sex; cell_type",
        citation="Mathys et al., Nature 2019",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE219208_stress": dict(
        dataset_name="Microglia glucocorticoid perturbation",
        accession_or_source="GSE219208",
        tissue="Human microglia (in vitro)", disease_or_condition="Dex/Hydrocortisone",
        assay="RNA-seq (counts table)", organism="Homo sapiens",
        role="Aim3 Calibration", aims="Aim3",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE219208",
        key_annotations="treatment; dose; timepoint; replicate",
        citation="See GEO entry",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE229940_innate_memory": dict(
        dataset_name="Microglia innate immune memory/priming",
        accession_or_source="GSE229940",
        tissue="Microglia (isolated)", disease_or_condition="Priming +/- LPS",
        assay="Bulk RNA-seq", organism="Rattus norvegicus",
        role="Mechanistic anchor (supplement)", aims="Supplement",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE229940",
        key_annotations="treatment group; time; replicate",
        citation="See GEO entry",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE220442_Visium": dict(
        dataset_name="Visium spatial transcriptomics",
        accession_or_source="GSE220442",
        tissue="Human brain (spatial)", disease_or_condition="As released",
        assay="10x Visium", organism="Homo sapiens",
        role="Spatial Validation", aims="Spatial",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE220442",
        key_annotations="spot coordinates; images; sample metadata",
        citation="See GEO entry",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE135618_sc_vs_sn": dict(
        dataset_name="scRNA vs snRNA technical comparison",
        accession_or_source="GSE135618",
        tissue="Varies", disease_or_condition="As released",
        assay="10x (mtx/tsv)", organism="Homo sapiens",
        role="Technical QC (supplement)", aims="QC",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE135618",
        key_annotations="assay; processing; QC metrics",
        citation="See GEO entry",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE180759_MS_lesion": dict(
        dataset_name="Progressive MS lesion-edge snRNA-seq",
        accession_or_source="GSE180759",
        tissue="Human brain lesions", disease_or_condition="MS",
        assay="snRNA-seq (CSV + annotation)", organism="Homo sapiens",
        role="Generalization (supplement)", aims="Supplement",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE180759",
        key_annotations="lesion region; cell_type; donor",
        citation="Absinta et al., Nature 2021",
        access_date="2025", data_availability="GEO public",
    ),
    "TabulaSapiens_full": dict(
        dataset_name="Tabula Sapiens — All tissues (backup)",
        accession_or_source="GSE201333",
        tissue="Multi-tissue", disease_or_condition="Normal",
        assay="Multi-assay (AnnData; huge)", organism="Homo sapiens",
        role="Backup only", aims="NA",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE201333",
        key_annotations="multi-tissue; multi-assay",
        citation="Tabula Sapiens Consortium et al., Science 2022",
        access_date="2025", data_availability="GEO / CELLxGENE public",
    ),
    "SEAAD_MTG_taxonomy": dict(
        dataset_name="SEA-AD MTG whole taxonomy (optional)",
        accession_or_source="SEA-AD / CELLxGENE",
        tissue="Human brain (MTG)", disease_or_condition="AD spectrum / control",
        assay="snRNA-seq (AnnData; huge)", organism="Homo sapiens",
        role="Reference only", aims="NA",
        url="https://cellxgene.cziscience.com/collections/1ca90a2d-2943-483d-b678-b809bf464c30",
        key_annotations="cell_type; donor; region; diagnosis",
        citation="Gabitto et al., bioRxiv 2023",
        access_date="2025", data_availability="CELLxGENE public",
    ),
    "HuMicA": dict(
        dataset_name="HuMicA atlas (Seurat)",
        accession_or_source="HuMicA",
        tissue="Human brain", disease_or_condition="As released",
        assay="Seurat (RDS)", organism="Homo sapiens",
        role="Optional reference atlas", aims="NA",
        url="", key_annotations="varies",
        citation="See original publication",
        access_date="2025", data_availability="Publicly available",
    ),
    "GSE144870_Lavaert": dict(
        dataset_name="Human thymus scRNA-seq (10x; flat matrices)",
        accession_or_source="GSE144870",
        tissue="Thymus", disease_or_condition="Normal",
        assay="10x scRNA-seq (mtx/tsv)", organism="Homo sapiens",
        role="Aim1 replication/support", aims="Aim1",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE144870",
        key_annotations="sample; cell_type; donor",
        citation="Lavaert et al., J Exp Med 2020",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE139042_Le": dict(
        dataset_name="Human thymus scRNA-seq (dense tables)",
        accession_or_source="GSE139042",
        tissue="Thymus", disease_or_condition="Normal",
        assay="scRNA-seq (tables)", organism="Homo sapiens",
        role="Aim1 replication/support", aims="Aim1",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE139042",
        key_annotations="sample; cell_type; donor",
        citation="Le et al., Front Immunol 2020",
        access_date="2025", data_availability="GEO public",
    ),
    "GSE133341_Zeng": dict(
        dataset_name="Multi-tissue immune atlas (contains thymus)",
        accession_or_source="GSE133341",
        tissue="Multi-tissue (includes thymus)", disease_or_condition="Normal",
        assay="scRNA-seq (tables)", organism="Homo sapiens",
        role="Aim1 support", aims="Aim1",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE133341",
        key_annotations="tissue; cell_type",
        citation="Zeng et al., 2019",
        access_date="2025", data_availability="GEO public",
    ),

    # NOTE(review): "assay" and "organism" below still carry "verify"
    # placeholders; they are written verbatim into Table S1 until resolved.
    "GSE184241_innate_memory_LPS": dict(
        dataset_name="Innate memory stimulation (LPS vs RPMI) – MES module profiles",
        accession_or_source="GEO: GSE184241",
        tissue="In vitro immune cells (microglia/macrophage-like)",
        disease_or_condition="Innate memory assay: LPS vs RPMI control",
        assay="Expression matrix (platform per GEO; verify)",
        organism="Human (verify in GEO)",
        role="Supplementary Fig 4F + Supplementary Table 7 (innate-memory MES profiles)",
        aims="Aim2",
        url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE184241",
        key_annotations="Condition encoded in sample IDs: _LPS_ / _RPMI_",
        citation="See GEO entry",
        access_date="2025",
        data_availability="GEO public",
    ),
}


# 6) File manifest

def file_size_human(sz: int) -> str:
    """Format a byte count as a short string using binary (1024-based) units.

    Values below 1024 are shown as whole bytes, KB and MB with one decimal,
    and anything from 1024**3 upward as GB with two decimals.
    """
    if sz < 1024:
        return f"{sz} B"
    # (exclusive upper bound, divisor, unit) for the one-decimal units.
    for ceiling, divisor, unit in ((1024**2, 1024, "KB"), (1024**3, 1024**2, "MB")):
        if sz < ceiling:
            return f"{sz/divisor:.1f} {unit}"
    return f"{sz/1024**3:.2f} GB"

def build_file_manifest(registry=None, max_children: int = 50) -> pd.DataFrame:
    """Inventory every registered path as a flat table.

    Parameters
    ----------
    registry : dict, optional
        Mapping of dataset id -> entry dict. Every key of an entry whose name
        starts with "path" is treated as a location. Defaults to the
        module-level REG.
    max_children : int
        Maximum number of files listed individually (in sorted order) for each
        directory location.

    Returns
    -------
    pandas.DataFrame
        One row per location, plus up to *max_children* rows per directory
        with ``path_key`` suffixed by ``"_child"``. The columns are always
        present, even when the registry is empty, so downstream column access
        does not fail.
    """
    columns = ["dataset_id", "path_key", "file_path", "exists",
               "is_dir", "size_bytes", "size_human", "n_files_in_dir"]
    if registry is None:
        registry = REG

    def _row(ds_id, path_key, path, exists, is_dir=False, size=0, n_files=0):
        # Single place that defines the row layout for all four row kinds.
        return {"dataset_id": ds_id, "path_key": path_key,
                "file_path": str(path), "exists": exists,
                "is_dir": is_dir, "size_bytes": int(size),
                "size_human": file_size_human(size) if exists else "",
                "n_files_in_dir": n_files}

    rows = []
    for ds_id, ds in registry.items():
        for pk in [k for k in ds if k.startswith("path")]:
            p = Path(ds[pk])
            if not p.exists():
                rows.append(_row(ds_id, pk, p, False))
                continue

            if p.is_file():
                rows.append(_row(ds_id, pk, p, True, size=p.stat().st_size))
            elif p.is_dir():
                # stat each file once and reuse the size for both the
                # directory total and the per-file child rows.
                sizes = {f: f.stat().st_size for f in p.rglob("*") if f.is_file()}
                rows.append(_row(ds_id, pk, p, True, is_dir=True,
                                 size=sum(sizes.values()), n_files=len(sizes)))
                for f in sorted(sizes)[:max_children]:
                    rows.append(_row(ds_id, f"{pk}_child", f, True, size=sizes[f]))
    return pd.DataFrame(rows, columns=columns)

print("\n[FILE MANIFEST]")
df_manifest = build_file_manifest()
# Count every manifest row that is an existing regular file (child rows included).
existing_files = df_manifest["exists"] & ~df_manifest["is_dir"]
n_files = int(existing_files.sum())
# Sum only top-level rows: a directory row already carries the total of its
# children, so adding the "_child" rows would double count.
top_level = ~df_manifest["path_key"].str.contains("_child")
total_gb = df_manifest.loc[top_level, "size_bytes"].sum() / 1024**3
print(f"  {n_files} files tracked, {total_gb:.2f} GB total")


# 7) Auto-stats helpers

# Candidate .obs column names for donor and sample identity. safe_h5ad_stats
# scans each list in order and uses the first name present in the file.
DONOR_KEYS = ["donor", "donor_id", "donorID", "subject", "subject_id",
              "individual", "individual_id", "participant", "case_id"]
SAMPLE_KEYS = ["sample", "sample_id", "batch", "library_id"]

def safe_h5ad_stats(h5ad_path: Path, max_obs: int = 300000, max_gb: float = 10.0):
    """Best-effort summary of an .h5ad file without loading the matrix.

    The file is opened in backed (read-only) mode. Donor and sample counts are
    only computed when the file is at most *max_gb* gigabytes and has at most
    *max_obs* observations; otherwise they are left blank.

    Returns
    -------
    tuple
        ``(n_cells, n_genes, n_donors, donor_key, n_samples)``. Any value that
        could not be determined is the empty string; if the file cannot be
        opened at all (or anndata is missing), all five are empty strings.
    """
    def _n_unique(column):
        # Counting can fail on unusual column types; treat that as "unknown".
        try:
            return int(pd.Series(column).nunique(dropna=True))
        except Exception:
            return ""

    adata = None
    try:
        import anndata as ad
        sz_gb = h5ad_path.stat().st_size / 1024**3
        adata = ad.read_h5ad(h5ad_path, backed="r")
        n_cells, n_genes = int(adata.n_obs), int(adata.n_vars)
        n_donors, donor_key, n_samples = "", "", ""

        if sz_gb <= max_gb and n_cells <= max_obs:
            cols = set(adata.obs.columns)
            donor_hit = next((k for k in DONOR_KEYS if k in cols), None)
            if donor_hit:
                donor_key = donor_hit
                n_donors = _n_unique(adata.obs[donor_hit])
            sample_hit = next((k for k in SAMPLE_KEYS if k in cols), None)
            if sample_hit:
                n_samples = _n_unique(adata.obs[sample_hit])

        return n_cells, n_genes, n_donors, donor_key, n_samples
    except Exception:
        # Deliberately best-effort: a missing dependency or unreadable file
        # yields blanks. KeyboardInterrupt/SystemExit are no longer swallowed.
        return "", "", "", "", ""
    finally:
        # Release the HDF5 handle held by the backed AnnData object.
        if adata is not None:
            try:
                adata.file.close()
            except Exception:
                pass

def safe_csv_nrows(path: Path):
    """Return the number of data rows in a CSV file, or "" if it cannot be read.

    The whole file is parsed with pandas, so this is intended for small tables.
    """
    try:
        return int(pd.read_csv(path).shape[0])
    except Exception:
        # Best-effort: unreadable or missing files give a blank cell, but
        # KeyboardInterrupt/SystemExit still propagate (unlike a bare except).
        return ""

def count_dir_samples(dir_path: Path, pattern: str = "GSM*") -> int:
    """Count distinct GSM accessions among entries under *dir_path*.

    Entries are found recursively with *pattern*; the accession is the leading
    ``GSM<digits>`` of each entry name. Returns 0 when *dir_path* is not a
    directory.
    """
    if not dir_path.is_dir():
        return 0
    name_matches = (re.match(r"(GSM\d+)", entry.name) for entry in dir_path.rglob(pattern))
    return len({m.group(1) for m in name_matches if m})


# 8) Build Table S1

def build_table_s1():
    """Assemble Supplementary Table S1: one row per dataset in REG.

    Descriptive columns come from DATASET_META (blank when a dataset has no
    entry there); the notes column comes from REG. Size statistics are filled
    in automatically where possible: .h5ad files via safe_h5ad_stats, the
    GSE138852 covariates file via safe_csv_nrows, and directory datasets via
    count_dir_samples. Rows are ordered by a fixed display priority, with
    unlisted datasets last, ties broken by dataset id.
    """
    location_keys = ("path", "path_counts", "path_cov", "path_anno", "path_sn", "path_vis")
    display_order = (
        "TS_Thymus_filtered", "TS_Blood_NegCtrl", "SEAAD_microglia",
        "GSE204702_Tuddenham", "Olah_live_microglia", "GSE233208_paired",
        "GSE219208_stress", "GSE229940_innate_memory",
        "GSE184241_innate_memory_LPS",
        "GSE138852_AD_snRNA",
        "GSE220442_Visium", "GSE135618_sc_vs_sn", "GSE180759_MS_lesion",
        "TabulaSapiens_full", "SEAAD_MTG_taxonomy", "HuMicA",
        "GSE144870_Lavaert", "GSE139042_Le", "GSE133341_Zeng",
    )
    rank_of = {name: position for position, name in enumerate(display_order)}

    records = []
    for ds_id, entry in REG.items():
        info = DATASET_META.get(ds_id, {})
        local_paths = " | ".join(
            f"{key}={entry[key]}" for key in location_keys if key in entry
        )
        record = {
            "Dataset_ID": ds_id,
            "Dataset (display name)": info.get("dataset_name", ds_id),
            "Accession / Source": info.get("accession_or_source", ""),
            "Tissue": info.get("tissue", ""),
            "Disease / Condition": info.get("disease_or_condition", ""),
            "Assay / Platform": info.get("assay", ""),
            "Organism": info.get("organism", ""),
            "Role in paper": info.get("role", ""),
            "Aims used": info.get("aims", ""),
            "Citation": info.get("citation", ""),
            "Access date": info.get("access_date", ""),
            "Key annotations (expected)": info.get("key_annotations", ""),
            "Web link": info.get("url", ""),
            "Local path(s)": local_paths,
            "N cells/nuclei/spots": "",
            "N genes/features": "",
            "N donors (if available)": "",
            "Donor column used": "",
            "N samples (if available)": "",
            "Data availability": info.get("data_availability", ""),
            "Notes": entry.get("notes", ""),
        }

        has_main_path = "path" in entry

        # Automatic statistics: AnnData files first, then the one CSV dataset
        # whose covariates table has one row per cell.
        if has_main_path and str(entry["path"]).lower().endswith(".h5ad"):
            h5ad_file = Path(entry["path"])
            if h5ad_file.exists():
                n_cells, n_genes, n_donors, donor_col, n_samples = safe_h5ad_stats(h5ad_file)
                record["N cells/nuclei/spots"] = n_cells
                record["N genes/features"] = n_genes
                record["N donors (if available)"] = n_donors
                record["Donor column used"] = donor_col
                if n_samples:
                    record["N samples (if available)"] = n_samples
        elif ds_id == "GSE138852_AD_snRNA":
            cov_file = Path(entry["path_cov"])
            if cov_file.exists():
                record["N cells/nuclei/spots"] = safe_csv_nrows(cov_file)

        # Directory datasets: count distinct GSM accessions among the files.
        if has_main_path and Path(entry["path"]).is_dir():
            gsm_count = count_dir_samples(Path(entry["path"]))
            if gsm_count > 0:
                record["N samples (if available)"] = gsm_count

        records.append(record)

    table = pd.DataFrame(records)

    # Temporary sort key: position in display_order, 999 for unlisted datasets.
    table["__order__"] = table["Dataset_ID"].map(lambda name: rank_of.get(name, 999))
    table = table.sort_values(["__order__", "Dataset_ID"])
    return table.drop(columns="__order__")

print("\n[TABLE S1]")
# Built once here; reused by the overview figure and written to the workbook.
df_S1 = build_table_s1()


# 9) Dataset overview figure

# Dataset id -> group label used to colour and group datasets in the figures.
# This is maintained separately from the "aims" field of DATASET_META and does
# not always match it (e.g. GSE233208_paired is "Spatial" here but
# "Aim2 + Spatial" there). Datasets absent from this map do not appear in the
# figures.
AIM_MAP = {
    "TS_Thymus_filtered": "Aim1", "GSE144870_Lavaert": "Aim1",
    "GSE139042_Le": "Aim1", "GSE133341_Zeng": "Aim1",
    "TS_Blood_NegCtrl": "NEGCTRL",
    "SEAAD_microglia": "Aim2", "Olah_live_microglia": "Aim2",
    "GSE204702_Tuddenham": "Aim2", "GSE138852_AD_snRNA": "Aim2",
    "GSE184241_innate_memory_LPS": "Aim2",     # ← NEW
    "GSE219208_stress": "Aim3",
    "GSE229940_innate_memory": "Supplement",
    "GSE220442_Visium": "Spatial", "GSE233208_paired": "Spatial",
    "GSE180759_MS_lesion": "Supplement", "GSE135618_sc_vs_sn": "QC",
    "TabulaSapiens_full": "Reference", "SEAAD_MTG_taxonomy": "Reference",
    "HuMicA": "Reference",
}

# Top-to-bottom order of the groups in the tile plot and its legend.
ROLE_ORDER = ["Aim1", "Aim2", "Aim3", "NEGCTRL", "Spatial", "Supplement", "QC", "Reference"]
# Fill colour for each group label.
ROLE_COLORS = {
    "Aim1": "#1b9e77", "Aim2": "#d95f02", "Aim3": "#7570b3",
    "NEGCTRL": "#e7298a", "Spatial": "#66a61e", "Supplement": "#a6761d",
    "QC": "#999999", "Reference": "#666666",
}

print("\n[FIGURES]")

# Fig S1a: Dataset tile plot
# Group dataset ids by their figure label, preserving AIM_MAP order.
aim_groups = {}
for ds_id, aim in AIM_MAP.items():
    aim_groups.setdefault(aim, []).append(ds_id)

# Look up each dataset's cell count once instead of filtering df_S1 inside the
# drawing loop; setdefault keeps the first row if an id were ever duplicated.
cell_counts = {}
for s1_id, s1_count in zip(df_S1["Dataset_ID"], df_S1["N cells/nuclei/spots"]):
    cell_counts.setdefault(s1_id, s1_count)

fig, ax = plt.subplots(figsize=(10.0, 5.0))
y_pos = 0
for aim in ROLE_ORDER:
    datasets = aim_groups.get(aim, [])
    if not datasets: continue
    for ds_id in datasets:
        nc = ""
        v = cell_counts.get(ds_id, "")
        if v != "" and pd.notna(v):
            # Prefer a thousands-separated integer; fall back to the raw value
            # when it is not numeric. Only conversion errors are caught, so
            # interrupts are no longer swallowed.
            try: nc = f"  ({int(v):,} cells)"
            except (TypeError, ValueError, OverflowError): nc = f"  ({v})"

        ax.barh(y_pos, 1, color=ROLE_COLORS.get(aim, "#ccc"), edgecolor="white",
                linewidth=0.5, height=0.7)
        ax.text(0.02, y_pos, ds_id + nc, va="center", ha="left", fontsize=6)
        y_pos += 1
    # Extra vertical gap between groups.
    y_pos += 0.5

ax.set_yticks([])
ax.set_xlim(-0.05, 1.05); ax.set_xticks([])
ax.set_title("Dataset registry by analytical aim")
# First group at the top of the figure.
ax.invert_yaxis()
ax.spines["top"].set_visible(False); ax.spines["right"].set_visible(False)
ax.spines["bottom"].set_visible(False)
patches = [mpatches.Patch(color=ROLE_COLORS[r], label=r)
           for r in ROLE_ORDER if r in aim_groups]
ax.legend(handles=patches, loc="lower right", frameon=False, fontsize=6, ncol=2)
plt.tight_layout()
save_fig(fig, "Fig1_DatasetOverview", kind="Supplementary")

# Fig S1b: Inclusion flowchart
# Drawn by hand on a 10 x 10 canvas with hidden axes; all coordinates below are
# in that canvas space.
fig = plt.figure(figsize=(8.0, 6.0))
ax = plt.gca()
ax.set_xlim(0, 10); ax.set_ylim(0, 10); ax.axis("off")

# Each box: (centre x, centre y, label, colour). Dataset counts are taken from
# REG and from aim_groups (built for the tile plot above).
# NOTE(review): the footnote box states that Zeng GSE133341 is excluded and
# that 3 thymus cohorts remain, but GSE133341_Zeng is still mapped to "Aim1"
# in AIM_MAP, so the "Aim 1: Thymus" box counts it - confirm which is intended.
boxes = [
    (5.0, 9.2, f"Raw Data Registry\n{len(REG)} datasets", "#4292c6"),
    (2.0, 7.3, f"Aim 1: Thymus\n{len(aim_groups.get('Aim1',[]))} datasets", ROLE_COLORS["Aim1"]),
    (5.0, 7.3, f"Aim 2: Microglia\n{len(aim_groups.get('Aim2',[]))} datasets", ROLE_COLORS["Aim2"]),
    (8.0, 7.3, f"Aim 3: Stress\n{len(aim_groups.get('Aim3',[]))} datasets", ROLE_COLORS["Aim3"]),
    (2.0, 5.5, "NB2: QC + Doublet\ndetection + MAD filter", "#bdbdbd"),
    (5.0, 5.5, "NB3: Consensus NMF\nK-sweep + Stability", "#bdbdbd"),
    (8.0, 5.5, "NB4+: Transfer\n+ Validation", "#bdbdbd"),
    (2.0, 3.7, f"NEGCTRL\n{len(aim_groups.get('NEGCTRL',[]))} dataset", ROLE_COLORS["NEGCTRL"]),
    (5.0, 3.7, f"Spatial\n{len(aim_groups.get('Spatial',[]))} datasets", ROLE_COLORS["Spatial"]),
    (8.0, 3.7, f"Supplement + Ref\n{len(aim_groups.get('Supplement',[])) + len(aim_groups.get('QC',[])) + len(aim_groups.get('Reference',[]))} datasets", ROLE_COLORS["Reference"]),
    (2.5, 2.0, "*Zeng GSE133341: excluded (non-standard format);\n3 independent thymus cohorts remain", "#f0f0f0"),
]

# Every box is a fixed 2.8 x 0.8 rounded rectangle centred on (cx, cy), with
# the label drawn on top; the text is not clipped to the box.
for cx, cy, text, color in boxes:
    rect = mpatches.FancyBboxPatch(
        (cx - 1.4, cy - 0.4), 2.8, 0.8,
        boxstyle="round,pad=0.1", facecolor=color, alpha=0.3,
        edgecolor=color, linewidth=1.0)
    ax.add_patch(rect)
    ax.text(cx, cy, text, ha="center", va="center", fontsize=6.5)

# Arrows
# Each tuple is (start x, start y, end x, end y): registry -> three aims, then
# straight down each of the three columns.
for sx, sy, ex, ey in [
    (5.0, 8.8, 2.0, 7.7), (5.0, 8.8, 5.0, 7.7), (5.0, 8.8, 8.0, 7.7),
    (2.0, 6.9, 2.0, 5.9), (5.0, 6.9, 5.0, 5.9), (8.0, 6.9, 8.0, 5.9),
    (2.0, 5.1, 2.0, 4.1), (5.0, 5.1, 5.0, 4.1), (8.0, 5.1, 8.0, 4.1),
]:
    ax.annotate("", xy=(ex, ey), xytext=(sx, sy),
                arrowprops=dict(arrowstyle="->", color="#636363", lw=0.8))

ax.set_title("Study design: dataset inclusion and analytical pipeline")
plt.tight_layout()
save_fig(fig, "Fig1_InclusionFlowchart", kind="Supplementary")


# 10) Integrity checks

print("\n[INTEGRITY CHECKS]")
integrity_rows = []

# Pass 1: every registered location must exist on disk.
for k, v in REG.items():
    for kk in ["path", "path_counts", "path_cov", "path_anno", "path_sn", "path_vis"]:
        if kk not in v: continue
        p = Path(v[kk])
        exists = p.exists()
        sz = ""
        if exists:
            if p.is_file(): sz = file_size_human(p.stat().st_size)
            elif p.is_dir(): sz = f"DIR ({sum(1 for f in p.rglob('*') if f.is_file())} files)"
        integrity_rows.append({
            "dataset_id": k, "path_key": kk, "path": str(p),
            "status": "OK" if exists else "MISSING", "size": sz,
        })
        print(f"  {'[OK] ' if exists else '[FAIL]'} {k}::{kk}")

# Pass 2: h5ad backed-open. Only checks that the key AnnData files can be
# opened; nothing is read into memory.
try:
    import anndata as ad
    for ds in ["TS_Thymus_filtered", "TS_Blood_NegCtrl", "SEAAD_microglia", "Olah_live_microglia"]:
        if ds not in REG: continue
        p = REG[ds].get("path", "")
        if not p or not Path(p).exists(): continue
        try:
            backed = ad.read_h5ad(p, backed="r")
            # Close straight away so the check does not leave open file
            # handles on multi-gigabyte files for the rest of the session.
            backed.file.close()
            integrity_rows.append({"dataset_id": ds, "path_key": "backed_open",
                                   "path": str(p), "status": "OK", "size": ""})
            print(f"  [OK]  h5ad backed-open: {Path(p).name}")
        except Exception as e:
            integrity_rows.append({"dataset_id": ds, "path_key": "backed_open",
                                   "path": str(p), "status": f"FAIL: {e}", "size": ""})
            print(f"  [FAIL] h5ad backed-open: {Path(p).name} -> {e}")
except ImportError:
    print("  [WARN] anndata not available")

# Explicit columns keep the summary below working even if no rows were added.
df_integrity = pd.DataFrame(
    integrity_rows, columns=["dataset_id", "path_key", "path", "status", "size"])
n_fail = int((df_integrity["status"] != "OK").sum())
if n_fail == 0:
    print("  All checks passed.")
else:
    print(f"  WARNING: {n_fail} checks did not pass.")


# 11) Save Table S1 (same filename, expanded sheets)

out_xlsx = TAB_DIR / "Supplementary_Table1.xlsx"
# Sheet name -> frame, in the order the sheets appear in the workbook.
workbook_sheets = {
    "TableS1_Datasets": df_S1,
    "Environment": df_env,
    "File_Manifest": df_manifest,
    "Integrity_Checks": df_integrity,
}
with pd.ExcelWriter(out_xlsx, engine="openpyxl") as writer:
    for sheet_label, sheet_frame in workbook_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_label, index=False)
print(f"[SAVED TABLE] {out_xlsx}")


# 12) Format Table S1

from openpyxl import load_workbook
from openpyxl.styles import Font, Alignment
from openpyxl.utils import get_column_letter

# Re-open the workbook just written and apply the same cosmetic formatting to
# every sheet: bold wrapped header, frozen header row, auto-filter, wrapped
# body cells and content-based column widths.
wb = load_workbook(out_xlsx)
bold_header = Font(bold=True)
wrap_top = Alignment(wrap_text=True, vertical="top")

for sheet_title in wb.sheetnames:
    sheet = wb[sheet_title]

    for header_cell in sheet[1]:
        header_cell.font = bold_header
        header_cell.alignment = wrap_top
    sheet.freeze_panes = "A2"
    sheet.auto_filter.ref = sheet.dimensions

    for body_row in sheet.iter_rows(min_row=2):
        for body_cell in body_row:
            body_cell.alignment = wrap_top

    # Width follows the longest cell text in the column, scaled by 0.9 and
    # clamped to the range [12, 55].
    for col_index in range(1, sheet.max_column + 1):
        letter = get_column_letter(col_index)
        longest = max((len(str(c.value or "")) for c in sheet[letter]), default=12)
        sheet.column_dimensions[letter].width = min(max(12, longest * 0.9), 55)

wb.save(out_xlsx)
print(f"[FORMATTED] {out_xlsx}")


# 13) Final report

# Closing summary of what this notebook wrote. The figure names are listed
# literally and correspond to the two save_fig calls above.
print(f"""
NB1 COMPLETE

OUTPUTS:
  Table:
    - {out_xlsx}
      Sheets: TableS1_Datasets, Environment, File_Manifest, Integrity_Checks
  Figures:
    - Supplementary_Fig1_DatasetOverview.png
    - Supplementary_Fig1_InclusionFlowchart.png
""")