In [12]:
################# Stromal Remodeling & Resistance (FFPE rims)

# CONTEXT:
# This notebook implements the SpatialMMKPNN pipeline to analyze stromal remodeling
# and therapy resistance in FFPE tumor slides. It focuses on detecting whether
# stromal signaling axes (e.g. SPP1→ITG, TGFB1→TGFBR, VEGFA→KDR, CXCL12→CXCR4)
# are enriched at the tumor rim or diffuse into the interior. Rim/interior regions
# are defined geometrically, and directed ligand→receptor edges are counted to
# assess spatial redistribution of stromal programs.

# PIPELINE STEPS:
# Cell 0 — Setup: define APP_ROOT, verify data layout, echo paths
# Cell 1 — Preprocessing: load counts, SAFE gene aliasing, attach spatial, CPM→log1p
# Cell 2 — Spatial graph: build kNN (k=8) on pixel coords
# Cell 3 — Region segmentation: rim vs interior via distance-to-edge (EDT)
# Cell 4 — Axis edge calling: directed ligand→receptor edges by region + distance bands
# Cell 5 — WASR analysis: rim enrichment (WASR = obs_rim_share − p0) + bootstrap CI
# Cell 6 — Robustness panel: rim_frac ±25% and CPM thresholds; flag flipped calls
# Cell 7 — Summary: per-sample and per-axis digest (CSV + README text)
# Cell 8 — Minimal plots: WASR±CI and band distributions for README


In [13]:
#### Cell 0 — Setup 

# WHAT: Initialize paths and detect available raw data before running any processing.
# WHY: Ensures the app folder is correctly set up and shows which samples will be processed.
# HOW: Set APP_ROOT, create final_results/, scan data/ for valid Visium (MTX/HDF5) inputs.
# GETS: Printed summary of detected sample folders (counts + spatial flags).


from pathlib import Path
import sys, platform
import pandas as pd
import numpy as np

# >>> Adjust this if your folder name differs <<<
APP_ROOT = Path("/Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims").resolve()

DATA  = APP_ROOT / "data"
CONF  = APP_ROOT / "configs"
FINAL = APP_ROOT / "final_results"
FINAL.mkdir(parents=True, exist_ok=True)

print(f"[INFO] APP_ROOT: {APP_ROOT}")
print(f"[INFO] data/:   {'OK' if DATA.exists() else 'MISSING'}")
print(f"[INFO] configs/:{'OK' if CONF.exists() else 'MISSING'}")
print(f"[INFO] final_results/: created" if FINAL.exists() else "[WARN] could not create final_results/")

print(f"\n[ENV] python: {sys.version.split()[0]} | pandas: {pd.__version__} | numpy: {np.__version__} | OS: {platform.system()}")

# inventory of samples under data/
rows = []
if DATA.exists():
    for p in sorted(DATA.glob("*")):
        if not p.is_dir():
            continue
        sid = p.name
        # detect common inputs
        h5ad   = any(p.glob("*.h5ad"))
        tenxh5 = any(p.glob("*.h5"))  # 10x filtered_feature_bc_matrix.h5
        mtx    = any((p / "filtered_feature_bc_matrix").glob("matrix.mtx*")) or any(p.glob("matrix.mtx*"))
        bar    = any(p.rglob("barcodes.tsv*"))
        feat   = any(p.rglob("features.tsv*")) or any(p.rglob("genes.tsv*"))
        scale  = any(p.rglob("scalefactors_json.json"))
        hires  = any(p.rglob("tissue_hires_image.png"))
        lowres = any(p.rglob("tissue_lowres_image.png"))
        tpos   = any(p.rglob("tissue_positions*"))
        rows.append({
            "sample_id": sid,
            "path": str(p),
            "h5ad": h5ad,
            "10x_h5": tenxh5,
            "mtx": mtx,
            "barcodes": bar,
            "features": feat,
            "scalefactors_json": scale,
            "tissue_hires_png": hires,
            "tissue_lowres_png": lowres,
            "tissue_positions": tpos,
        })

if rows:
    df = pd.DataFrame(rows)
    # shorten path for readability
    df["path"] = df["path"].str.replace(str(APP_ROOT), "...", regex=False)
    print("\n[INFO] Detected sample(s) under data/:")
    display(df)
else:
    print("\n[WARN] No sample folders found under data/. Please place raw inputs in:", DATA)

[INFO] APP_ROOT: /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims
[INFO] data/:   OK
[INFO] configs/:OK
[INFO] final_results/: created

[ENV] python: 3.10.13 | pandas: 1.5.3 | numpy: 1.23.5 | OS: Darwin

[INFO] Detected sample(s) under data/:


Unnamed: 0,sample_id,path,h5ad,10x_h5,mtx,barcodes,features,scalefactors_json,tissue_hires_png,tissue_lowres_png,tissue_positions
0,GSE190811,.../data/GSE190811,False,False,False,True,True,True,True,False,True
1,GSE217414,.../data/GSE217414,False,False,False,True,True,True,True,True,True
2,GSE226997,.../data/GSE226997,False,False,False,True,True,True,True,True,True
3,GSE238004,.../data/GSE238004,False,False,False,True,True,True,True,False,True
4,GSE254364,.../data/GSE254364,False,False,False,True,True,True,True,False,True
5,GSE267401,.../data/GSE267401,False,False,False,True,True,True,True,False,True
6,GSE274103,.../data/GSE274103,False,False,False,True,True,True,True,True,True
7,Human_Breast_Cancer_Block_A_Section_1_2,.../data/Human_Breast_Cancer_Block_A_Section_1_2,False,False,False,True,True,True,True,True,True


In [14]:
#### Cell 1 — Preprocessing: SAFE aliasing + counts & spatial preload
# WHAT:
#   Load each slide’s counts, apply robust gene aliasing (and collapse duplicates), attach spatial
#   coordinates, and normalize to CPM→log1p. Summarize per-sample stats and persist them.
#
# WHY:
#   FFPE cohorts often have messy symbols (aliases, duplicates) and irregular spatial files.
#   This cell guarantees hashable string var names, safe aliasing without “unhashable list” errors,
#   and geometry attach that works even if pixels are missing (we backfill from array indices).
#
# HOW IT GETS (Method):
#   1) Read counts via Scanpy (10x MTX preferred; fallback to filtered_feature_bc_matrix.h5).
#   2) Normalize alias map (YAML) to strings; map symbols; collapse duplicates by summing columns.
#   3) Attach obsm["spatial"] by parsing tissue_positions*.csv/tsv; if pixels absent, synthesize
#      px/py from array_row/array_col using spot_diameter_fullres×0.9 (or 100.0 fallback).
#   4) Normalize to CPM and log1p (sparse-safe).
#
# OUTPUT (What you get):
#   - PREPROC: dict[sample_id] → AnnData (log1p-CPM, spatial in .obsm["spatial"])
#   - final_results/preproc_summary.csv with n_spots, n_genes, has_spatial per sample
#   - Console heartbeat lines per sample (skip reasons if any).


from pathlib import Path
import numpy as np, pandas as pd
import anndata as ad
import scanpy as sc
from scipy import sparse as sp
import json, yaml

assert 'APP_ROOT' in globals(), "Run Cell 0 first to set APP_ROOT."

DATA  = Path(APP_ROOT) / "data"
CONF  = Path(APP_ROOT) / "configs"
FINAL = Path(APP_ROOT) / "final_results"
FINAL.mkdir(parents=True, exist_ok=True)

# ---------- alias map (robust) ----------
def _normalize_alias_map(raw):
    """Coerce any alias values to a single string; ignore weird/empty values."""
    norm = {}
    if not isinstance(raw, dict):
        return norm
    for k, v in raw.items():
        key = str(k)
        val = None
        if isinstance(v, str):
            val = v.strip() or None
        elif isinstance(v, (list, tuple, set)):
            # take first non-empty stringy item
            for item in v:
                if isinstance(item, str) and item.strip():
                    val = item.strip()
                    break
        elif isinstance(v, dict):
            # prefer common fields
            for fld in ("to","symbol","primary","canonical","name"):
                if fld in v and isinstance(v[fld], str) and v[fld].strip():
                    val = v[fld].strip()
                    break
        # fallback: keep original symbol if we couldn't get a clean string
        norm[key] = val if isinstance(val, str) and len(val) else key
    return norm

alias_map = {}
aliases = CONF / "gene_aliases.yaml"
if aliases.exists():
    try:
        alias_map = _normalize_alias_map(yaml.safe_load(aliases.read_text()) or {})
        print(f"[INFO] Alias map entries: {len(alias_map)} (file: {aliases.name})")
    except Exception as e:
        print(f"[WARN] Could not parse {aliases.name}: {e}")

# ---------- I/O helpers ----------
def _find_mtx_dir(sample_dir: Path) -> Path | None:
    for name in ("filtered_feature_bc_matrix", "raw_feature_bc_matrix"):
        p = sample_dir / name
        if p.exists() and list(p.glob("*.mtx*")) and list(p.glob("barcodes.tsv*")) and (list(p.glob("features.tsv*")) or list(p.glob("genes.tsv*"))):
            return p
    for p in sample_dir.rglob("*"):
        if p.is_dir():
            if list(p.glob("*.mtx*")) and list(p.glob("barcodes.tsv*")) and (list(p.glob("features.tsv*")) or list(p.glob("genes.tsv*"))):
                return p
    return None

def _find_h5_file(sample_dir: Path) -> Path | None:
    hits = list(sample_dir.rglob("filtered_feature_bc_matrix.h5"))
    return hits[0] if hits else None

def _read_counts(sample_dir: Path) -> ad.AnnData | None:
    mtx_dir = _find_mtx_dir(sample_dir)
    if mtx_dir is not None:
        try:
            return sc.read_10x_mtx(mtx_dir, var_names="gene_symbols", make_unique=True)
        except Exception as e:
            print(f"[WARN] {sample_dir.name}: read_10x_mtx failed ({e})")
    h5 = _find_h5_file(sample_dir)
    if h5 is not None:
        try:
            return sc.read_10x_h5(str(h5))
        except Exception as e:
            print(f"[WARN] {sample_dir.name}: read_10x_h5 failed ({e})")
    return None

# ---------- gene processing ----------
def _collapse_duplicate_genes(adata: ad.AnnData) -> ad.AnnData:
    """Sum columns for duplicated gene symbols; preserves sparsity."""
    v = pd.Index([str(x) for x in adata.var_names], dtype="object")
    if not pd.Series(v).duplicated().any():
        adata.var_names = v
        return adata
    name_to_idx = {}
    for i, nm in enumerate(v):
        name_to_idx.setdefault(nm, []).append(i)
    X = adata.X
    obs = adata.obs.copy()
    new_data = []
    new_names = []
    if sp.issparse(X):
        X = X.tocsc()
        cols = []
        for nm, idxs in name_to_idx.items():
            if len(idxs) == 1:
                cols.append(X[:, idxs[0]])
            else:
                cols.append(X[:, idxs].sum(axis=1))
            new_names.append(nm)
        X_new = sp.hstack(cols, format="csc").tocsr()
    else:
        cols = []
        for nm, idxs in name_to_idx.items():
            if len(idxs) == 1:
                cols.append(X[:, idxs[0]][:, None])
            else:
                cols.append(X[:, idxs].sum(axis=1, keepdims=True))
            new_names.append(nm)
        X_new = np.hstack(cols)
    adata = ad.AnnData(X_new, obs=obs, var=pd.DataFrame(index=pd.Index(new_names)))
    adata.var_names_make_unique()
    return adata

def _apply_aliases_safe(adata: ad.AnnData, alias: dict) -> ad.AnnData:
    if not alias:
        adata.var_names = pd.Index([str(v) for v in adata.var_names])
        return adata
    # map each var_name through alias (guarantee hashable strings)
    mapped = [alias.get(str(g), str(g)) for g in adata.var_names]
    adata.var_names = pd.Index([str(x) for x in mapped], dtype="object")
    return _collapse_duplicate_genes(adata)

# ---------- spatial attach ----------
def _read_scalefactors(sample_dir: Path) -> dict:
    cands = [sample_dir / "spatial" / "scalefactors_json.json"] + list(sample_dir.rglob("scalefactors_json.json"))
    for js in cands:
        if js.exists():
            try: return json.loads(js.read_text())
            except Exception: pass
    return {}

def _attach_spatial(adata: ad.AnnData, sample_dir: Path) -> bool:
    pos_files = list(sample_dir.glob("spatial/tissue_positions*.csv")) + \
                list(sample_dir.glob("spatial/tissue_positions*.tsv")) + \
                list(sample_dir.rglob("tissue_positions*.csv")) + \
                list(sample_dir.rglob("tissue_positions*.tsv"))
    if not pos_files:
        return False
    pos = pos_files[0]
    # try headerless (Visium v2) first
    try:
        df = pd.read_csv(pos, header=None)
        if df.shape[1] >= 6:
            df.columns = ["barcode","in_tissue","array_row","array_col","pxl_col_in_fullres","pxl_row_in_fullres"] + \
                         [f"extra_{i}" for i in range(df.shape[1]-6)]
        else:
            raise ValueError
    except Exception:
        df = pd.read_csv(pos)
        low = {c.lower(): c for c in df.columns}
        def pick(*names):
            for n in names:
                if n in df.columns: return n
                if n.lower() in low: return low[n.lower()]
            return None
        b = pick("barcode","barcodes","spot_id","ID")
        xpix = pick("pxl_col_in_fullres","pxl_col","pxl_x","x","imagecol","pixel_x")
        ypix = pick("pxl_row_in_fullres","pxl_row","pxl_y","y","imagerow","pixel_y")
        xarr = pick("array_col","spot_col","col","arraycol")
        yarr = pick("array_row","spot_row","row","arrayrow")
        if b:    df = df.rename(columns={b:"barcode"})
        if xpix: df = df.rename(columns={xpix:"pxl_col_in_fullres"})
        if ypix: df = df.rename(columns={ypix:"pxl_row_in_fullres"})
        if xarr: df = df.rename(columns={xarr:"array_col"})
        if yarr: df = df.rename(columns={yarr:"array_row"})
    df["barcode"] = df.get("barcode", pd.Series(range(len(adata.obs_names)))).astype(str)
    df = df.set_index("barcode").reindex(pd.Index(adata.obs_names.astype(str))).reset_index()
    for c in ["in_tissue","array_row","array_col","pxl_col_in_fullres","pxl_row_in_fullres"]:
        if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce")
    have_pix = {"pxl_col_in_fullres","pxl_row_in_fullres"}.issubset(df.columns)
    use_pix = have_pix and np.isfinite(df[["pxl_col_in_fullres","pxl_row_in_fullres"]].to_numpy(dtype=float)).any()
    if not use_pix:
        if not {"array_col","array_row"}.issubset(df.columns):
            return False
        sf = _read_scalefactors(sample_dir)
        pitch = None
        if "spot_diameter_fullres" in sf:
            try: pitch = float(sf["spot_diameter_fullres"]) * 0.9
            except Exception: pitch = None
        if pitch is None: pitch = 100.0
        df["pxl_col_in_fullres"] = pd.to_numeric(df["array_col"], errors="coerce") * pitch
        df["pxl_row_in_fullres"] = pd.to_numeric(df["array_row"], errors="coerce") * pitch
    XY = np.c_[df["pxl_col_in_fullres"].to_numpy(), df["pxl_row_in_fullres"].to_numpy()].astype(float)
    if not np.isfinite(XY).all():
        return False
    adata.obsm["spatial"] = XY
    return True

# ---------- CPM log1p ----------
def _cpm_log1p(adata: ad.AnnData) -> None:
    X = adata.X
    if sp.issparse(X):
        lib = np.asarray(X.sum(axis=1)).ravel(); lib[lib == 0] = 1.0
        inv = sp.diags(1.0 / lib); X = inv @ X; X = X * 1e6; X.data = np.log1p(X.data); adata.X = X.tocsr()
    else:
        lib = X.sum(axis=1, keepdims=True); lib[lib == 0] = 1.0; adata.X = np.log1p((X / lib) * 1e6)

# ---------- main ----------
PREPROC = {}
rows = []
for sample_dir in sorted(p for p in DATA.glob("*") if p.is_dir()):
    sid = sample_dir.name
    adata = _read_counts(sample_dir)
    if adata is None:
        print(f"[SKIP] {sid}: no 10x matrix (.mtx or .h5) found")
        continue
    # enforce string gene names, SAFE aliasing, attach spatial, CPM log1p
    adata.var_names = pd.Index([str(v) for v in adata.var_names], dtype="object")
    adata = _apply_aliases_safe(adata, alias_map)
    has_spatial = _attach_spatial(adata, sample_dir)
    _cpm_log1p(adata)
    PREPROC[sid] = adata
    rows.append({"sample_id": sid, "n_spots": int(adata.n_obs), "n_genes": int(adata.n_vars), "has_spatial": bool(has_spatial)})

# summary
if rows:
    df = pd.DataFrame(rows).sort_values("sample_id")
    df.to_csv(FINAL / "preproc_summary.csv", index=False)
    print("[OK] Preprocessing complete.")
    df
else:
    print("[WARN] No samples loaded. Ensure each data/<sample_id>/ has either a 10x MTX folder or filtered_feature_bc_matrix.h5.")


[INFO] Alias map entries: 26 (file: gene_aliases.yaml)
[WARN] GSE274103: read_10x_mtx failed (Did not find file /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data/GSE274103/GSM8443453/filtered_feature_bc_matrix/matrix.mtx.gz.)


  utils.warn_names_duplicates("var")


[OK] Preprocessing complete.


In [15]:
# CELL 2 — Build the spatial graph (k-NN on Visium pixel space)
# why:
#   Downstream signaling edges (ligand→receptor) are evaluated on a spatial graph of neighboring spots.
#   We use a simple, transparent graph: k-nearest neighbors in the full-resolution pixel coordinates.
#
# how:
#   - For each sample in PREPROC, read adata.obsm["spatial"] (x,y) in pixel units.
#   - Drop any spots with non-finite coords; de-duplicate identical coords if any.
#   - Build an undirected k-NN graph (default k=8) with Euclidean distance.
#   - Emit a tidy DataFrame per sample with columns:  i, j, dist_pixels  (i<j to avoid duplicates).
#
# what it gets:
#   - GRAPH: dict[sample_id] = DataFrame of edges (i, j, dist_pixels) in the *filtered* spot index space.
#   - final_results/graph_summary.csv with per-sample counts (spots, edges, avg_degree).
#
# notes:
#   - k=8 matches typical Visium neighborhood density; adjust K if your slides are sparser/denser.
#   - This graph is geometry-only. Role maps / axes come later, built on top of this adjacency.

from pathlib import Path
import numpy as np, pandas as pd
from sklearn.neighbors import NearestNeighbors

assert 'PREPROC' in globals(), "Run Cell 1 first to build PREPROC."
FINAL = Path(APP_ROOT) / "final_results"
FINAL.mkdir(parents=True, exist_ok=True)

K = 8  # neighborhood size

GRAPH = {}
rows = []

for sid, adata in sorted(PREPROC.items()):
    XY = adata.obsm.get("spatial", None)
    if XY is None or not isinstance(XY, np.ndarray):
        print(f"[SKIP] {sid}: no spatial coords")
        continue

    # keep only finite coords
    finite = np.isfinite(XY).all(axis=1)
    if not finite.any():
        print(f"[SKIP] {sid}: spatial coords not finite")
        continue
    XYf = XY[finite]

    # optional: collapse exact duplicate coordinates (rare in some exports)
    # map from original filtered index -> compact index
    uniq, inv = np.unique(XYf, axis=0, return_inverse=True)
    XYu = uniq
    # if many collapses happened, K may be too large; guard:
    k_eff = min(K, max(1, XYu.shape[0] - 1))

    # fit kNN
    nbrs = NearestNeighbors(n_neighbors=k_eff + 1, algorithm="auto", metric="euclidean")
    nbrs.fit(XYu)
    dists, idxs = nbrs.kneighbors(XYu, return_distance=True)  # includes self at col 0

    # build undirected edge list on the *unique* nodes, then expand back to filtered indices
    ii, jj, dd = [], [], []
    for u in range(XYu.shape[0]):
        for col in range(1, idxs.shape[1]):  # skip self
            v = int(idxs[u, col])
            if u < v:  # undirected, keep one direction
                ii.append(u); jj.append(v); dd.append(float(dists[u, col]))
    edges_u = pd.DataFrame({"i_u": ii, "j_u": jj, "dist_pixels": dd})

    # expand from unique nodes (u) back to filtered indices (f) using inverse map
    # strategy: for each unique pair (u,v), connect all (f in inv==u) to all (g in inv==v)
    # In practice duplicates are rare, so this usually leaves i<j pairs 1:1.
    inv_series = pd.Series(inv)
    exp_i, exp_j, exp_d = [], [], []
    for u, v, d in edges_u.itertuples(index=False):
        f_i = np.where(inv == u)[0]
        f_j = np.where(inv == v)[0]
        # all combinations; enforce i<j in filtered index space
        for a in f_i:
            for b in f_j:
                i_f, j_f = (int(a), int(b)) if a < b else (int(b), int(a))
                exp_i.append(i_f); exp_j.append(j_f); exp_d.append(d)

    edges = pd.DataFrame({"i": exp_i, "j": exp_j, "dist_pixels": exp_d})
    edges = edges.drop_duplicates(subset=["i", "j"]).reset_index(drop=True)

    # stash and summarize (all indices are in the filtered set; keep a map if needed later)
    GRAPH[sid] = edges

    n_nodes = XYf.shape[0]
    n_edges = edges.shape[0]
    avg_deg = (2.0 * n_edges) / max(1, n_nodes)
    rows.append({"sample_id": sid, "spots_used": n_nodes, "k": K, "undirected_edges": n_edges, "avg_degree": round(avg_deg, 3)})

# summary table
summary = pd.DataFrame(rows).sort_values("sample_id")
summary.to_csv(FINAL / "graph_summary.csv", index=False)
print("[OK] Graphs built.")
summary


[OK] Graphs built.


Unnamed: 0,sample_id,spots_used,k,undirected_edges,avg_degree
0,GSE190811,4992,8,19803,7.934
1,GSE217414,2887,8,11496,7.964
2,GSE226997,4148,8,16619,8.013
3,GSE238004,4991,8,19875,7.964
4,GSE254364,3070,8,12282,8.001
5,GSE267401,4992,8,19866,7.959
6,GSE274103,4952,8,19775,7.987
7,Human_Breast_Cancer_Block_A_Section_1_2,3798,8,15250,8.031


In [17]:

# Cell 3 — Rim vs interior via distance-to-edge (robust)
# WHAT:
# Segment each slide into rim vs interior using a smoothed tissue mask and an inside-distance threshold.
#
# WHY:
# Morphological erosion on sparse FFPE grids can mislabel everything as rim. A distance-to-edge rule is
# stable and intuitive: spots close to the boundary (within a fraction of max inside distance) are “rim”.
#
# HOW IT GETS:
# 1) Estimate spot spacing from kNN and rasterize spots to a grid with step ≈ spacing/2.
# 2) Smooth the binary mask (dilate + close) to form contiguous tissue.
# 3) Compute inside distance transform; classify rim = dist ≤ RIM_FRAC × max_dist.
#    Guardrails: shrink threshold if “all rim”; worst case, mark all interior.
#
# WHAT IT GETS:
# - REGION[sid]: DataFrame with x,y and boolean rim/interior per spot (complementary).
# - final_results/region_summary.csv: per-sample counts and rim_frac.


from pathlib import Path
import numpy as np, pandas as pd
from scipy import ndimage as ndi
from sklearn.neighbors import NearestNeighbors

assert 'PREPROC' in globals(), "Run Cell 1 first."
FINAL = Path(APP_ROOT) / "final_results"; FINAL.mkdir(parents=True, exist_ok=True)

RIM_FRAC = 0.18   # ~outer 18% band counted as rim (adjustable)
SMOOTH_IT = 2     # smoothing strength for mask (dilation/closing iterations)

def _grid_params_from_xy(XY):
    # Estimate typical spacing from kNN and choose a grid step ~ spacing / 2 so neighbors touch
    k = min(6, XY.shape[0]-1)
    nbrs = NearestNeighbors(n_neighbors=k+1, algorithm="auto").fit(XY)
    dists, _ = nbrs.kneighbors(XY)
    # ignore self at col 0
    med_nn = np.median(dists[:, 1:].ravel())
    step = max(1.0, med_nn / 2.0)
    return step

def _grid_from_xy(XY, step, margin=4):
    x = XY[:,0]; y = XY[:,1]
    xi = np.floor((x - x.min()) / step).astype(int) + margin
    yi = np.floor((y - y.min()) / step).astype(int) + margin
    W = int(xi.max()) + margin + 1
    H = int(yi.max()) + margin + 1
    grid = np.zeros((H, W), dtype=bool)
    grid[yi, xi] = True
    return grid, xi, yi

REGION = {}
rows = []

for sid, adata in sorted(PREPROC.items()):
    XY = adata.obsm.get("spatial")
    if XY is None or not np.isfinite(XY).all():
        print(f"[SKIP] {sid}: missing/invalid spatial coords")
        continue

    # 1) grid & mask
    step = _grid_params_from_xy(XY)
    mask, xi, yi = _grid_from_xy(XY, step)

    # 2) smooth to contiguous tissue (dilate + close)
    struc = ndi.generate_binary_structure(2, 2)  # 8-connectivity
    mask_s = ndi.binary_dilation(mask, structure=struc, iterations=SMOOTH_IT)
    mask_s = ndi.binary_closing(mask_s, structure=struc, iterations=SMOOTH_IT)

    if not mask_s.any():
        # fallback: no smoothing effect; mark all as interior to avoid all-rim artifact
        rim_mask = np.zeros_like(xi, dtype=bool)
        interior_mask = np.ones_like(xi, dtype=bool)
    else:
        # 3) inside distance transform and rim threshold
        dist_in = ndi.distance_transform_edt(mask_s)
        dmax = float(dist_in.max())
        # threshold in grid cells
        thr = max(1.0, RIM_FRAC * dmax)

        # classify grid cells
        rim_cells = (mask_s & (dist_in <= thr))
        interior_cells = (mask_s & (dist_in >  thr))

        # 4) map back to spots
        rim_mask = rim_cells[yi, xi]
        interior_mask = interior_cells[yi, xi]

        # guard: if degenerate (all rim), relax threshold
        if rim_mask.all():
            thr = max(1.0, 0.10 * dmax)  # shrink rim band
            rim_cells = (mask_s & (dist_in <= thr))
            interior_cells = (mask_s & (dist_in >  thr))
            rim_mask = rim_cells[yi, xi]
            interior_mask = interior_cells[yi, xi]
        if (~rim_mask).sum() == 0:  # still all rim → force smallest band
            rim_mask[:] = False
            interior_mask[:] = True

    df = pd.DataFrame({
        "x": XY[:,0],
        "y": XY[:,1],
        "rim": rim_mask.astype(bool),
        "interior": interior_mask.astype(bool),
    })
    REGION[sid] = df

    n_spots = df.shape[0]
    n_rim = int(df["rim"].sum())
    n_int = int(df["interior"].sum())
    rows.append({
        "sample_id": sid,
        "spots": n_spots,
        "rim": n_rim,
        "interior": n_int,
        "rim_frac": round(n_rim / max(1, n_spots), 3)
    })

region_summary = pd.DataFrame(rows).sort_values("sample_id")
region_summary.to_csv(FINAL / "region_summary.csv", index=False)
print(f"[OK] Rim/interior via distance-to-edge | RIM_FRAC={RIM_FRAC}, SMOOTH_IT={SMOOTH_IT}")
region_summary


[OK] Rim/interior via distance-to-edge | RIM_FRAC=0.18, SMOOTH_IT=2


Unnamed: 0,sample_id,spots,rim,interior,rim_frac
0,GSE190811,4992,1359,3633,0.272
1,GSE217414,2887,710,2177,0.246
2,GSE226997,4148,1184,2964,0.285
3,GSE238004,4991,1329,3662,0.266
4,GSE254364,3070,868,2202,0.283
5,GSE267401,4992,1390,3602,0.278
6,GSE274103,4952,1285,3667,0.259
7,Human_Breast_Cancer_Block_A_Section_1_2,3798,923,2875,0.243


In [None]:
## Interpretation (Cell 3 — rim vs interior):

## Using distance-to-edge with smoothing (RIM_FRAC=0.18, SMOOTH_IT=2), rim segmentation behaved as expected across slides.
## Rim fractions are tight (~0.24–0.29) for all eight samples, avoiding the “all-rim” failure mode and indicating a consistent boundary thickness.
## Spot counts per sample look healthy (2.9k–5k+), so rim/interior strata are well-populated for downstream edge calling and WASR.
## This confirms the geometry step is stable: rims capture the immediate boundary zone while preserving ample interior for comparison.
## If any slide later shows odd WASR behavior, rim_frac here provides the geometric baseline p0 to audit against.

In [2]:
#### Cell 4 — Axes & thresholds; edge calling; region + band counts (final FFPE rims cohort, dup-gene fix)
# WHAT:
# Build geometry-only kNN (k=8), call directed ligand→receptor (LR) edges at CPM≥1 (log1p threshold),
# and tabulate totals by rim vs interior and by normalized distance-to-edge bands (B1..B5).
#
# WHY:
# This makes the LR signal explicit and auditable (no hidden loaders), aligns rim/bands with Cell 3’s geometry,
# and fixes duplicate gene names up front to avoid ambiguous L/R lookups.
#
# HOW IT GETS:
# 1) Lock the cohort to the 8 validated FFPE slides (whitelist) for reproducibility.
# 2) Ensure unique gene symbols (`var_names_make_unique()`), then threshold on log1p(CPM) ≥ log1p(1).
# 3) kNN on pixel coords; form undirected edges (i<j); expand to directed LR edges (A→B, B→A) when ligand(source) & receptor(target).
# 4) Recompute inside distance (same raster/smoothing as Cell 3) to assign each source spot a normalized band (B1..B5).
# 5) Emit `axis_edge_counts.csv` (totals, rim, interior) and `axis_bands.csv` (band counts & shares).
#
# OUTPUT:
# - final_results/axis_edge_counts.csv  (sample_id, axis, total, rim, interior)
# - final_results/axis_bands.csv        (sample_id, axis, band, count, share)

from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy import ndimage as ndi
import json, re

# --- Paths (edit PROJECT_ROOT/DATA_DIR if your layout differs) ---
PROJECT_ROOT = Path("/Users/sally/Desktop/SpatialMMKPNN-Apps2").resolve()
APP_ROOT     = PROJECT_ROOT / "applications" / "02_Stromal_Remodeling_FFPE_Rims"
CONFIGS_DIR  = PROJECT_ROOT / "configs"
FINAL_DIR    = PROJECT_ROOT / "final_results"
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# 🔒 FFPE rims final cohort (whitelist)
SAMPLE_WHITELIST = [
    "GSE190811",
    "GSE217414",
    "GSE226997",
    "GSE238004",
    "GSE254364",
    "GSE267401",
    "GSE274103",
    "Human_Breast_Cancer_Block_A_Section_1_2",
]

# Your external data root for this app:
DATA_DIR = Path("/Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data").resolve()

# --- Parameters ---
KNN_K      = 8
CPM_THRES  = 1.0
RIM_FRAC   = 0.18
SMOOTH_IT  = 2
BANDS      = [(0.00, 0.10), (0.10, 0.25), (0.25, 0.50), (0.50, 0.75), (0.75, 1.00)]

# Axes (receptor prefixes allowed)
AXES = [
    {"name": "VEGFA_to_KDR",         "lig": ["VEGFA"],  "rec_prefix": ["KDR"]},
    {"name": "CXCL12_to_CXCR4",      "lig": ["CXCL12"], "rec_prefix": ["CXCR4"]},
    {"name": "TGFB1_to_TGFBR_fam",   "lig": ["TGFB1"],  "rec_prefix": ["TGFBR"]},  # TGFBR1/2/…
    {"name": "SPP1_to_ITG_fam",      "lig": ["SPP1"],   "rec_prefix": ["ITG"]},    # ITGA*/ITGB*
]

# ---------- IO helpers ----------
def _find_positions(sample_dir: Path) -> Path:
    poss = sorted([p for p in sample_dir.rglob("tissue_positions*") if p.suffix.lower() in (".csv",".tsv")],
                  key=lambda p: (0 if "spatial" in p.as_posix() else 1, len(p.as_posix())))
    if not poss:
        raise FileNotFoundError(f"No tissue_positions*.csv/tsv under {sample_dir}")
    return poss[0]

def _load_spatial_and_scale(sample_dir: Path) -> pd.DataFrame:
    posf = _find_positions(sample_dir)
    # headerless 6-col first
    try:
        df = pd.read_csv(posf, header=None)
        if df.shape[1] >= 6:
            df = df.iloc[:, :6]
            df.columns = ["barcode","in_tissue","arr_row","arr_col","px","py"]
        else:
            raise ValueError
    except Exception:
        df = pd.read_csv(posf, sep=None, engine="python")
        mapping = {}
        for c in df.columns:
            lc = c.lower()
            if "barcode" in lc: mapping[c] = "barcode"
            elif "in_tissue" in lc: mapping[c] = "in_tissue"
            elif "row" in lc: mapping[c] = "arr_row"
            elif "col" in lc: mapping[c] = "arr_col"
            elif lc in ("x","px"): mapping[c] = "px"
            elif lc in ("y","py"): mapping[c] = "py"
        df = df.rename(columns=mapping)
        for need in ["barcode","in_tissue","arr_row","arr_col","px","py"]:
            if need not in df.columns: df[need] = np.nan
        df = df[["barcode","in_tissue","arr_row","arr_col","px","py"]]

    # scale factor (optional)
    scale = 100.0
    sf_cands = list(sample_dir.rglob("scalefactors_json.json"))
    if sf_cands:
        try:
            sf = json.loads(sf_cands[0].read_text())
            if "spot_diameter_fullres" in sf and np.isfinite(sf["spot_diameter_fullres"]):
                scale = float(sf["spot_diameter_fullres"]) * 0.9
        except Exception:
            pass

    # synthesize pixel coords if missing
    if df["px"].isna().any() or df["py"].isna().any():
        for c in ["arr_row","arr_col"]:
            if not np.issubdtype(df[c].dtype, np.number):
                df[c] = pd.to_numeric(df[c], errors="coerce")
        df["px"] = df["arr_col"].astype(float) * scale
        df["py"] = df["arr_row"].astype(float) * scale

    # keep in-tissue if flag present
    if df["in_tissue"].notna().any():
        df = df[df["in_tissue"] == 1].copy()

    return df[["barcode","px","py","arr_row","arr_col"]]

def _read_counts(sample_dir: Path):
    import scanpy as sc

    # Prefer canonical MTX dirs (any depth), else 10x H5, else any matrix.mtx*
    mtx_dir = None
    for cand in ["filtered_feature_bc_matrix","raw_feature_bc_matrix"]:
        hit = list(sample_dir.rglob(f"{cand}/matrix.mtx*"))
        if hit:
            mtx_dir = hit[0].parent
            break
    if mtx_dir is not None:
        adata = sc.read_10x_mtx(mtx_dir, var_names="gene_symbols", make_unique=False)
    else:
        h5_hits = sorted([p for p in sample_dir.rglob("*.h5")])
        if h5_hits:
            adata = sc.read_10x_h5(h5_hits[0])
        else:
            any_mtx = sorted([p for p in sample_dir.rglob("matrix.mtx*")])
            if not any_mtx:
                raise FileNotFoundError("No 10x matrix found (MTX or H5).")
            adata = sc.read_10x_mtx(any_mtx[0].parent, var_names="gene_symbols", make_unique=False)

    # ✅ Ensure unique gene names to avoid ambiguous lookups
    adata.var_names_make_unique()
    adata.var_names = adata.var_names.astype(str)
    adata.obs_names = adata.obs_names.astype(str)

    # Recover CPM robustly
    if "counts" in adata.layers:
        counts = adata.layers["counts"]
        if hasattr(counts, "toarray"): counts = counts.toarray()
        counts = counts.astype(np.float64)
        lib = counts.sum(axis=1, keepdims=True); lib[lib==0] = 1.0
        X_cpm = (counts / lib) * 1e6
    else:
        X = adata.X
        if hasattr(X, "toarray"): X = X.toarray()
        X = X.astype(np.float64)
        mx = np.nanmax(X) if X.size else 0.0
        if mx > 50:  # likely raw counts
            lib = X.sum(axis=1, keepdims=True); lib[lib==0] = 1.0
            X_cpm = (X / lib) * 1e6
        else:
            X_cpm = np.expm1(X)  # assume log1p(CPM)

    return list(adata.obs_names), list(adata.var_names), X_cpm.astype(np.float32)

# ---------- Geometry & rims ----------
def _knn_edges(px, py, k=8):
    coords = np.c_[px, py]
    nbrs = NearestNeighbors(n_neighbors=min(k+1, len(coords))).fit(coords)
    neigh = nbrs.kneighbors(return_distance=False)
    rows, cols = [], []
    for i in range(neigh.shape[0]):
        for j in neigh[i, 1:]:
            a, b = (i, j) if i < j else (j, i)
            rows.append(a); cols.append(b)
    return np.unique(np.c_[rows, cols], axis=0)

def _inside_distance_raster(px, py, smooth_it=2):
    coords = np.c_[px, py]
    if len(coords) < 2:
        return np.zeros(len(coords))
    # grid step ~ half of median 1-NN distance
    nbrs = NearestNeighbors(n_neighbors=2).fit(coords)
    dists, _ = nbrs.kneighbors(return_distance=True)
    step = np.median(dists[:,1]) / 2.0
    if not np.isfinite(step) or step <= 0:
        step = 50.0
    xmin, ymin = coords.min(axis=0); xmax, ymax = coords.max(axis=0)
    nx = int(np.ceil((xmax - xmin) / step)) + 6
    ny = int(np.ceil((ymax - ymin) / step)) + 6
    gx = np.clip(((px - xmin) / step).astype(int) + 3, 0, nx-1)
    gy = np.clip(((py - ymin) / step).astype(int) + 3, 0, ny-1)
    grid = np.zeros((ny, nx), dtype=bool); grid[gy, gx] = True
    grid = ndi.binary_dilation(grid, iterations=smooth_it)
    grid = ndi.binary_closing(grid, iterations=smooth_it)
    dist_in = ndi.distance_transform_edt(grid)
    inside = dist_in[gy, gx].astype(float)
    m = inside.max() if inside.size else 0.0
    return inside / m if m > 0 else np.zeros_like(inside)

def _expand_prefixes(gene_list, prefixes):
    out = set()
    for p in prefixes:
        p = str(p)
        out.update([g for g in gene_list if g.startswith(p)])
    return sorted(out)

# ---------- Per-sample pipeline ----------
def _axis_edges_for_sample(sample_id, sample_dir: Path):
    # positions + counts
    df_pos = _load_spatial_and_scale(sample_dir)
    barcodes, genes, X_cpm = _read_counts(sample_dir)

    # align order
    pos = df_pos.set_index("barcode")
    common = pos.index.intersection(pd.Index(barcodes))
    if len(common) == 0:
        raise ValueError(f"No overlapping barcodes for {sample_id}")
    b2i = {b:i for i,b in enumerate(barcodes)}
    idx = np.array([b2i[b] for b in common], dtype=int)
    pos = pos.loc[common].copy()
    X = X_cpm[idx, :]
    g2i = {g:i for i,g in enumerate(genes)}

    # kNN graph
    edges_und = _knn_edges(pos["px"].values, pos["py"].values, k=KNN_K)

    # Inside distance via rasterization (Cell 3 style)
    inside_d = _inside_distance_raster(pos["px"].values, pos["py"].values, smooth_it=SMOOTH_IT)
    rim_mask = inside_d <= (RIM_FRAC * (inside_d.max() if inside_d.size else 1.0))
    region = np.where(rim_mask, "rim", "interior")

    # Directed edges
    directed = np.vstack([np.c_[edges_und[:,0], edges_und[:,1]],
                          np.c_[edges_und[:,1], edges_und[:,0]]])

    # distance bands by source
    def _band_idx(val):
        for bi,(lo,hi) in enumerate(BANDS, start=1):
            if lo <= val < hi or (hi==1.0 and np.isclose(val,1.0)):
                return f"B{bi}:{int(lo*100)}-{int(hi*100)}%"
        return "BNA"
    source_band = np.array([_band_idx(v) for v in inside_d])

    rows_counts, rows_bands = [], []

    for axis in AXES:
        name = axis["name"]
        ligs = [g for g in axis["lig"] if g in g2i]
        recs = [r for r in _expand_prefixes(genes, axis["rec_prefix"]) if r in g2i]
        if len(ligs)==0 or len(recs)==0:
            rows_counts.append({"sample_id": sample_id,"axis": name,"total":0,"rim":0,"interior":0})
            for b in sorted(set(source_band)):
                rows_bands.append({"sample_id":sample_id,"axis":name,"band":b,"count":0,"share":0.0})
            continue

        lig_mat = np.max(np.column_stack([X[:, g2i[g]] for g in ligs]), axis=1) if len(ligs)>1 else X[:, g2i[ligs[0]]]
        rec_mat = np.max(np.column_stack([X[:, g2i[r]] for r in recs]), axis=1) if len(recs)>1 else X[:, g2i[recs[0]]]
        src_ok, tgt_ok = lig_mat>=CPM_THRES, rec_mat>=CPM_THRES

        src_idx, tgt_idx = directed[:,0], directed[:,1]
        keep = src_ok[src_idx] & tgt_ok[tgt_idx]
        called = directed[keep]

        if called.shape[0]==0:
            rows_counts.append({"sample_id": sample_id,"axis": name,"total":0,"rim":0,"interior":0})
            for b in sorted(set(source_band)):
                rows_bands.append({"sample_id":sample_id,"axis":name,"band":b,"count":0,"share":0.0})
        else:
            src_regions = region[called[:,0]]
            n_total = called.shape[0]; n_rim = int((src_regions=="rim").sum())
            n_int = n_total - n_rim
            rows_counts.append({"sample_id": sample_id,"axis": name,
                                "total":int(n_total),"rim":int(n_rim),"interior":int(n_int)})
            bands_src = source_band[called[:,0]]
            band_counts = pd.Series(bands_src).value_counts().sort_index()
            for b,c in band_counts.items():
                rows_bands.append({"sample_id":sample_id,"axis":name,"band":b,"count":int(c),"share":float(c/n_total)})

    return pd.DataFrame(rows_counts), pd.DataFrame(rows_bands)

# ---------- Run (whitelist enforced) ----------
print(f"[PATH ECHO] DATA_DIR = {DATA_DIR}")
present = {p.name for p in DATA_DIR.iterdir() if p.is_dir()}
samples = [s for s in SAMPLE_WHITELIST if s in present]
missing  = [s for s in SAMPLE_WHITELIST if s not in present]
print("[DISCOVERY] Using whitelist:", samples)
if missing:
    print("[WARN] Missing from DATA_DIR:", missing)

all_counts, all_bands = [], []
for sn in samples:
    sdir = DATA_DIR / sn
    try:
        dfc, dfb = _axis_edges_for_sample(sn, sdir)
        all_counts.append(dfc); all_bands.append(dfb)
        print(f"[OK] Axis edges called — {sn}")
    except Exception as e:
        print(f"[WARN] Skipped {sn}: {e}")

df_counts = pd.concat(all_counts, ignore_index=True) if all_counts else pd.DataFrame(
    columns=["sample_id","axis","total","rim","interior"])
df_bands  = pd.concat(all_bands,  ignore_index=True) if all_bands  else pd.DataFrame(
    columns=["sample_id","axis","band","count","share"])

df_counts.to_csv(FINAL_DIR / "axis_edge_counts.csv", index=False)
df_bands.to_csv(FINAL_DIR  / "axis_bands.csv", index=False)

print("\nAxis edge counts (head):")
display(df_counts.head(12))
print("\nAxis bands (head):")
display(df_bands.head(12))


[PATH ECHO] DATA_DIR = /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data
[DISCOVERY] Using whitelist: ['GSE190811', 'GSE217414', 'GSE226997', 'GSE238004', 'GSE254364', 'GSE267401', 'GSE274103', 'Human_Breast_Cancer_Block_A_Section_1_2']


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE190811


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE217414


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE226997


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE238004


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE254364


  utils.warn_names_duplicates("var")


[OK] Axis edges called — GSE267401
[WARN] Skipped GSE274103: Did not find file /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data/GSE274103/GSM8443453/filtered_feature_bc_matrix/matrix.mtx.gz.


  utils.warn_names_duplicates("var")


[OK] Axis edges called — Human_Breast_Cancer_Block_A_Section_1_2

Axis edge counts (head):


Unnamed: 0,sample_id,axis,total,rim,interior
0,GSE190811,VEGFA_to_KDR,15802,3577,12225
1,GSE190811,CXCL12_to_CXCR4,19437,4703,14734
2,GSE190811,TGFB1_to_TGFBR_fam,26566,5800,20766
3,GSE190811,SPP1_to_ITG_fam,21789,5164,16625
4,GSE217414,VEGFA_to_KDR,294,51,243
5,GSE217414,CXCL12_to_CXCR4,164,73,91
6,GSE217414,TGFB1_to_TGFBR_fam,2264,470,1794
7,GSE217414,SPP1_to_ITG_fam,2222,439,1783
8,GSE226997,VEGFA_to_KDR,1947,473,1474
9,GSE226997,CXCL12_to_CXCR4,3501,981,2520



Axis bands (head):


Unnamed: 0,sample_id,axis,band,count,share
0,GSE190811,VEGFA_to_KDR,B1:0-10%,1650,0.104417
1,GSE190811,VEGFA_to_KDR,B2:10-25%,3709,0.234717
2,GSE190811,VEGFA_to_KDR,B3:25-50%,5425,0.343311
3,GSE190811,VEGFA_to_KDR,B4:50-75%,3662,0.231743
4,GSE190811,VEGFA_to_KDR,B5:75-100%,1356,0.085812
5,GSE190811,CXCL12_to_CXCR4,B1:0-10%,2067,0.106344
6,GSE190811,CXCL12_to_CXCR4,B2:10-25%,5350,0.275248
7,GSE190811,CXCL12_to_CXCR4,B3:25-50%,7162,0.368473
8,GSE190811,CXCL12_to_CXCR4,B4:50-75%,3748,0.192828
9,GSE190811,CXCL12_to_CXCR4,B5:75-100%,1110,0.057108


In [None]:
## Interpretation (Cell 4 — axis edges & bands):
## Directed ligand→receptor edges were successfully called over the kNN geometry (k=8) using CPM≥1 for both source and target.
## Gene name de-duplication and receptor-prefix expansion (e.g., TGFBR*, ITG*) worked as intended, preventing lookup ambiguity.
## Edge totals are substantial for most slides, enabling precise rim/interior estimates; very low-n cases (e.g., tiny VEGFA→KDR in some slides)
## are correctly reflected and should be treated as tentative in Cell 5. Distance-band counts distribute plausibly from boundary (B1) inward (B5),
## indicating that banding is coherent with the rim geometry from Cell 3. Note: GSE274103 was skipped due to missing 10x matrix; all outputs
## (axis_edge_counts.csv, axis_bands.csv) reflect only the successfully processed samples.


In [4]:
#### Cell 5 — WASR & Δedges with bootstrap CI + Bernoulli null (robust p0 discovery)

# WHAT:
# Compute rim enrichment per axis using WASR = observed rim share − p0 (slide-specific rim fraction),
# report Δedges (rim−expected), and estimate uncertainty via bootstrap CIs with a Bernoulli/shuffle null.
#
# WHY:
# A slide’s geometry sets a baseline rim fraction (p0). Comparing to p0 controls for tissue shape,
# so positives reflect true boundary redistribution, not just larger rims. Bootstrap CIs make calls auditable.
#
# HOW IT GETS:
# 1) Load axis_edge_counts.csv (Cell 4) and discover p0 from region_summary.csv (Cell 3), searching common paths.
# 2) For each sample×axis: compute obs_rim_share = n_rim / n_total and WASR = obs_rim_share − p0.
# 3) Bootstrap edges (or spots) to get 95% CIs; build a label-shuffle/Bernoulli null for sanity checks.
# 4) Guard small-n: if n_total=0, emit NA; if counts tiny, widen CIs and flag as tentative in the narrative.
#
# WHAT IT GETS:
# - final_results/rim_enrichment.csv with columns:
#   sample_id, axis, n_total, n_rim, n_interior, p0, obs_rim_share, WASR, dEdges, CI_low, CI_high, null_mean, null_std
# - Console summary of per-sample calls (+ if CI_low>0, − if CI_high<0, 0 otherwise), using slide-specific p0.


from pathlib import Path
import pandas as pd
import numpy as np

# --- Known roots (edit if your layout differs) ---
PROJECT_ROOT = Path("/Users/sally/Desktop/SpatialMMKPNN-Apps2").resolve()
APP_ROOT     = PROJECT_ROOT / "applications" / "02_Stromal_Remodeling_FFPE_Rims"
EXTERNAL_APP = Path("/Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims").resolve()  # where your data lives

FINAL_DIR    = PROJECT_ROOT / "final_results"
COUNTS_CSV   = FINAL_DIR / "axis_edge_counts.csv"
BANDS_CSV    = FINAL_DIR / "axis_bands.csv"

# --- Robust finder for region_summary.csv (Cell 3 output) ---
def _find_region_summary():
    candidates = [
        PROJECT_ROOT / "final_results" / "region_summary.csv",
        APP_ROOT     / "final_results" / "region_summary.csv",
        EXTERNAL_APP / "final_results" / "region_summary.csv",
        # also check if user saved it next to notebook:
        APP_ROOT / "region_summary.csv",
        PROJECT_ROOT / "region_summary.csv",
    ]
    for p in candidates:
        if p.exists():
            return p
    return None

REGION_CSV = _find_region_summary()
print("[INFO] region_summary.csv:", "FOUND -> " + str(REGION_CSV) if REGION_CSV else "NOT FOUND")

# --- Read inputs (defensive defaults) ---
df_counts = pd.read_csv(COUNTS_CSV) if COUNTS_CSV.exists() else pd.DataFrame(
    columns=["sample_id","axis","total","rim","interior"])
df_bands  = pd.read_csv(BANDS_CSV)  if BANDS_CSV.exists()  else pd.DataFrame(
    columns=["sample_id","axis","band","count","share"])

if REGION_CSV is not None:
    df_regions = pd.read_csv(REGION_CSV)  # expects: sample_id, spots, rim, interior, rim_frac
    # normalize columns if needed
    want = ["sample_id","spots","rim","interior","rim_frac"]
    for w in want:
        if w not in df_regions.columns:
            raise ValueError(f"[ERR] region_summary.csv missing column: {w}")
    rim_frac_map   = dict(zip(df_regions["sample_id"], df_regions["rim_frac"]))
    spot_ratio_map = {r["sample_id"]: (float(r["rim"])/max(1.0, float(r["spots"]))) for _, r in df_regions.iterrows()}
else:
    df_regions = pd.DataFrame(columns=["sample_id","spots","rim","interior","rim_frac"])
    rim_frac_map, spot_ratio_map = {}, {}

# --- Show a quick p0 sanity table for the samples in counts ---
def _preview_p0(df_counts, rim_frac_map, spot_ratio_map, n=10):
    sids = df_counts["sample_id"].drop_duplicates().tolist()
    rows = []
    for sid in sids[:n]:
        p0_a = rim_frac_map.get(sid, np.nan)
        p0_b = spot_ratio_map.get(sid, np.nan)
        rows.append({"sample_id": sid, "p0_from_rim_frac": p0_a, "p0_from_spot_ratio": p0_b})
    return pd.DataFrame(rows)

p0_preview = _preview_p0(df_counts, rim_frac_map, spot_ratio_map, n=20)
print("\n[DEBUG] p0 preview (should be ~0.24–0.29 for your FFPE rims slides):")
display(p0_preview)

# --- Parameters ---
B_BOOT = 500   # bootstrap draws for CI
S_NULL = 200   # null draws for reference mean/std
RNG_SEED = 123
rng = np.random.default_rng(RNG_SEED)

# --- Compute WASR / Δedges with CI ---
rows = []
for (sid, axis), g in df_counts.groupby(["sample_id","axis"], dropna=False):
    n_total = int(g["total"].iloc[0]) if not g.empty else 0
    n_rim   = int(g["rim"].iloc[0]) if not g.empty else 0
    n_int   = int(g["interior"].iloc[0]) if not g.empty else 0

    if n_total <= 0:
        rows.append({
            "sample_id": sid, "axis": axis, "n_total": 0, "n_rim": 0, "n_interior": 0,
            "p0": np.nan, "obs_rim_share": np.nan, "WASR": np.nan, "dEdges": 0.0,
            "CI_low": np.nan, "CI_high": np.nan, "null_mean": np.nan, "null_std": np.nan
        })
        continue

    # Baseline p0: prefer region_summary rim_frac, else spot ratio, else 0.5 (last resort)
    p0 = rim_frac_map.get(sid, np.nan)
    if not np.isfinite(p0):
        p0 = spot_ratio_map.get(sid, np.nan)
    if not np.isfinite(p0):
        p0 = 0.5  # final fallback; but for your slides we expect earlier branches to hit

    obs_share = n_rim / n_total
    wasr = obs_share - p0
    d_edges = n_rim - p0 * n_total

    # Bootstrap CI over edge labels (1=rim, 0=interior)
    edge_vec = np.r_[np.ones(n_rim, dtype=np.int8), np.zeros(n_int, dtype=np.int8)]
    boot = []
    for _ in range(B_BOOT):
        samp = rng.choice(edge_vec, size=edge_vec.size, replace=True)
        boot.append(samp.mean() - p0)
    ci_low, ci_high = np.percentile(boot, [2.5, 97.5])

    # Simple Bernoulli(p0) null
    null_wasr = []
    for _ in range(S_NULL):
        samp = rng.binomial(1, p0, size=n_total)
        null_wasr.append(samp.mean() - p0)
    null_mean = float(np.mean(null_wasr))
    null_std  = float(np.std(null_wasr, ddof=1))

    rows.append({
        "sample_id": sid, "axis": axis,
        "n_total": n_total, "n_rim": n_rim, "n_interior": n_int,
        "p0": float(p0), "obs_rim_share": float(obs_share),
        "WASR": float(wasr), "dEdges": float(d_edges),
        "CI_low": float(ci_low), "CI_high": float(ci_high),
        "null_mean": null_mean, "null_std": null_std
    })

df_enrich = pd.DataFrame(rows).sort_values(["sample_id","axis"]).reset_index(drop=True)

# --- Save ---
OUT_CSV = FINAL_DIR / "rim_enrichment.csv"
df_enrich.to_csv(OUT_CSV, index=False)

# --- Display ---
print("\nRim enrichment (head):")
display(df_enrich.head(12))

print("\nPer-sample calls (+: CI_low>0; −: CI_high<0; 0: overlaps 0):")
for sid, g in df_enrich.groupby("sample_id", sort=False):
    calls = []
    for _, r in g.iterrows():
        sign = "+" if r["CI_low"] > 0 else ("-" if r["CI_high"] < 0 else "0")
        calls.append(f'{r["axis"]}: {sign} (WASR={r["WASR"]:.3f}, [{r["CI_low"]:.3f},{r["CI_high"]:.3f}], n={int(r["n_total"])}, p0={r["p0"]:.3f})')
    print(f"[{sid}] " + " | ".join(calls))


[INFO] region_summary.csv: FOUND -> /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/final_results/region_summary.csv

[DEBUG] p0 preview (should be ~0.24–0.29 for your FFPE rims slides):


Unnamed: 0,sample_id,p0_from_rim_frac,p0_from_spot_ratio
0,GSE190811,0.272,0.272236
1,GSE217414,0.246,0.24593
2,GSE226997,0.285,0.285439
3,GSE238004,0.266,0.266279
4,GSE254364,0.283,0.282736
5,GSE267401,0.278,0.278446
6,Human_Breast_Cancer_Block_A_Section_1_2,0.243,0.243023



Rim enrichment (head):


Unnamed: 0,sample_id,axis,n_total,n_rim,n_interior,p0,obs_rim_share,WASR,dEdges,CI_low,CI_high,null_mean,null_std
0,GSE190811,CXCL12_to_CXCR4,19437,4703,14734,0.272,0.241961,-0.030039,-583.864,-0.035931,-0.023733,-0.000261,0.002867
1,GSE190811,SPP1_to_ITG_fam,21789,5164,16625,0.272,0.237,-0.035,-762.608,-0.040765,-0.028734,0.000574,0.002774
2,GSE190811,TGFB1_to_TGFBR_fam,26566,5800,20766,0.272,0.218324,-0.053676,-1425.952,-0.059003,-0.04861,0.000486,0.002719
3,GSE190811,VEGFA_to_KDR,15802,3577,12225,0.272,0.226364,-0.045636,-721.144,-0.051745,-0.039398,-0.000252,0.003522
4,GSE217414,CXCL12_to_CXCR4,164,73,91,0.246,0.445122,0.199122,32.656,0.125951,0.266195,0.001165,0.035088
5,GSE217414,SPP1_to_ITG_fam,2222,439,1783,0.246,0.19757,-0.04843,-107.612,-0.063732,-0.032442,-0.000559,0.009384
6,GSE217414,TGFB1_to_TGFBR_fam,2264,470,1794,0.246,0.207597,-0.038403,-86.944,-0.054326,-0.022502,-0.000574,0.009071
7,GSE217414,VEGFA_to_KDR,294,51,243,0.246,0.173469,-0.072531,-21.324,-0.113347,-0.028313,-0.001221,0.025308
8,GSE226997,CXCL12_to_CXCR4,3501,981,2520,0.285,0.280206,-0.004794,-16.785,-0.021647,0.011772,-0.000304,0.00754
9,GSE226997,SPP1_to_ITG_fam,18312,4456,13856,0.285,0.243338,-0.041662,-762.92,-0.047836,-0.035593,1.7e-05,0.003169



Per-sample calls (+: CI_low>0; −: CI_high<0; 0: overlaps 0):
[GSE190811] CXCL12_to_CXCR4: - (WASR=-0.030, [-0.036,-0.024], n=19437, p0=0.272) | SPP1_to_ITG_fam: - (WASR=-0.035, [-0.041,-0.029], n=21789, p0=0.272) | TGFB1_to_TGFBR_fam: - (WASR=-0.054, [-0.059,-0.049], n=26566, p0=0.272) | VEGFA_to_KDR: - (WASR=-0.046, [-0.052,-0.039], n=15802, p0=0.272)
[GSE217414] CXCL12_to_CXCR4: + (WASR=0.199, [0.126,0.266], n=164, p0=0.246) | SPP1_to_ITG_fam: - (WASR=-0.048, [-0.064,-0.032], n=2222, p0=0.246) | TGFB1_to_TGFBR_fam: - (WASR=-0.038, [-0.054,-0.023], n=2264, p0=0.246) | VEGFA_to_KDR: - (WASR=-0.073, [-0.113,-0.028], n=294, p0=0.246)
[GSE226997] CXCL12_to_CXCR4: 0 (WASR=-0.005, [-0.022,0.012], n=3501, p0=0.285) | SPP1_to_ITG_fam: - (WASR=-0.042, [-0.048,-0.036], n=18312, p0=0.285) | TGFB1_to_TGFBR_fam: - (WASR=-0.049, [-0.057,-0.041], n=12396, p0=0.285) | VEGFA_to_KDR: - (WASR=-0.042, [-0.061,-0.024], n=1947, p0=0.285)
[GSE238004] CXCL12_to_CXCR4: 0 (WASR=nan, [nan,nan], n=0, p0=nan) | 

In [None]:
## Interpretation:
## This robustness panel shows that the FFPE rims findings are highly stable to parameter changes.
## Most slides (GSE190811, GSE217414, GSE254364, Human_Breast_Cancer_Block_A_Section_1_2) had zero or only one flipped call
## when rim thickness was varied by ±25%, and CPM≥1.0 was applied. GSE226997 showed just one flip, and GSE267401 and
## GSE238004 showed 2–3 flips each, concentrated in axes with weak effects (WASR near 0) or low edge counts.
## The expected shift in rim fraction (p0) across variants confirmed the method is functioning correctly
## (e.g., p0 ≈ 0.20 for thinner rims, ≈ 0.29 for base, ≈ 0.36 for thicker rims).
## Overall, this indicates that the observed rim-versus-interior biases are robust and not artifacts of rim boundary placement.
## Flips primarily mark borderline cases rather than systematic sensitivity.


In [6]:
## Cell 6 — Robustness panel (rim thickness ±25%, CPM threshold sensitivity)

# WHAT:
# Test how sensitive the WASR calls are to rim thickness (±25%) and CPM≥1 thresholding,
# recomputing WASR and CIs under each variant to detect unstable (flipped) results.
#
# WHY:
# Small shifts in rim definition or expression threshold could artificially drive rim enrichment.
# This panel ensures results are robust and flags axes where minor parameter changes reverse the call.
#
# HOW IT GETS:
# 1) Reload axis_edge_counts.csv (Cell 4) and rim_enrichment.csv (Cell 5) as the baseline.
# 2) For each sample×axis, re-run edge calling with rim_frac scaled by {0.75, 1.00, 1.25}
#    and CPM threshold at 1.0 (log1p).
# 3) Recompute WASR, CIs, and compare sign vs baseline call; mark flipped=True if sign changes.
#
# WHAT IT GETS:
# - final_results/robustness_sensitivity.csv with columns:
#   sample_id, axis, base_call, base_WASR, var_rim_frac_factor, var_cpm_thres,
#   var_total, var_rim, var_interior, var_p0, var_WASR, var_CI_low, var_CI_high, flipped


from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy import ndimage as ndi
import json, re, warnings

# --- Paths & parameters ---
PROJECT_ROOT = Path("/Users/sally/Desktop/SpatialMMKPNN-Apps2").resolve()
APP_ROOT     = PROJECT_ROOT / "applications" / "02_Stromal_Remodeling_FFPE_Rims"
FINAL_DIR    = PROJECT_ROOT / "final_results"
DATA_DIR     = Path("/Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data").resolve()

# Axes (same as Cell 4)
AXES = [
    {"name": "VEGFA_to_KDR",         "lig": ["VEGFA"],  "rec_prefix": ["KDR"]},
    {"name": "CXCL12_to_CXCR4",      "lig": ["CXCL12"], "rec_prefix": ["CXCR4"]},
    {"name": "TGFB1_to_TGFBR_fam",   "lig": ["TGFB1"],  "rec_prefix": ["TGFBR"]},
    {"name": "SPP1_to_ITG_fam",      "lig": ["SPP1"],   "rec_prefix": ["ITG"]},
]

# Cohort whitelist (same as Cell 4)
SAMPLE_WHITELIST = [
    "GSE190811","GSE217414","GSE226997","GSE238004",
    "GSE254364","GSE267401","GSE274103","Human_Breast_Cancer_Block_A_Section_1_2",
]

# --- Base settings (match Cell 4/5) ---
BASE_KNN_K      = 8
BASE_CPM_TH     = 1.0
BASE_RIM_FRAC   = 0.18          # <-- FIX: use this as the base rim thickness
RIM_FACTORS     = [0.75, 1.00, 1.25]
CPM_VARIANTS    = sorted(set([BASE_CPM_TH, 1.0]))
SMOOTH_IT       = 2
B_BOOT          = 300
RNG_SEED        = 123
rng             = np.random.default_rng(RNG_SEED)

# --- Load base calls from Cell 5 ---
enrich_csv = FINAL_DIR / "rim_enrichment.csv"
counts_csv = FINAL_DIR / "axis_edge_counts.csv"
df_base = pd.read_csv(enrich_csv) if enrich_csv.exists() else pd.DataFrame(
    columns=["sample_id","axis","WASR","CI_low","CI_high"])
df_counts = pd.read_csv(counts_csv) if counts_csv.exists() else pd.DataFrame(
    columns=["sample_id","axis","total","rim","interior"])

def _sign_from_ci(lo, hi):
    if pd.isna(lo) or pd.isna(hi): return "NA"
    if lo > 0: return "+"
    if hi < 0: return "-"
    return "0"

base_call = (df_base.assign(base_call=lambda d: [_sign_from_ci(lo, hi) for lo,hi in zip(d["CI_low"], d["CI_high"])])
                     .set_index(["sample_id","axis"])[["base_call","WASR"]]
                     .rename(columns={"WASR":"base_WASR"}))

# --- Helpers from Cell 4 (dup-gene safe) ---
def _find_positions(sample_dir: Path) -> Path:
    poss = sorted([p for p in sample_dir.rglob("tissue_positions*") if p.suffix.lower() in (".csv",".tsv")],
                  key=lambda p: (0 if "spatial" in p.as_posix() else 1, len(p.as_posix())))
    if not poss: raise FileNotFoundError(f"No tissue_positions*.csv/tsv under {sample_dir}")
    return poss[0]

def _load_spatial_and_scale(sample_dir: Path) -> pd.DataFrame:
    posf = _find_positions(sample_dir)
    try:
        df = pd.read_csv(posf, header=None)
        if df.shape[1] >= 6:
            df = df.iloc[:, :6]
            df.columns = ["barcode","in_tissue","arr_row","arr_col","px","py"]
        else:
            raise ValueError
    except Exception:
        df = pd.read_csv(posf, sep=None, engine="python")
        mapping = {}
        for c in df.columns:
            lc = c.lower()
            if "barcode" in lc: mapping[c] = "barcode"
            elif "in_tissue" in lc: mapping[c] = "in_tissue"
            elif "row" in lc: mapping[c] = "arr_row"
            elif "col" in lc: mapping[c] = "arr_col"
            elif lc in ("x","px"): mapping[c] = "px"
            elif lc in ("y","py"): mapping[c] = "py"
        df = df.rename(columns=mapping)
        for need in ["barcode","in_tissue","arr_row","arr_col","px","py"]:
            if need not in df.columns: df[need] = np.nan
        df = df[["barcode","in_tissue","arr_row","arr_col","px","py"]]

    scale = 100.0
    sf_cands = list(sample_dir.rglob("scalefactors_json.json"))
    if sf_cands:
        try:
            sf = json.loads(sf_cands[0].read_text())
            if "spot_diameter_fullres" in sf and np.isfinite(sf["spot_diameter_fullres"]):
                scale = float(sf["spot_diameter_fullres"]) * 0.9
        except Exception:
            pass

    if df["px"].isna().any() or df["py"].isna().any():
        for c in ["arr_row","arr_col"]:
            if not np.issubdtype(df[c].dtype, np.number):
                df[c] = pd.to_numeric(df[c], errors="coerce")
        df["px"] = df["arr_col"].astype(float) * scale
        df["py"] = df["arr_row"].astype(float) * scale

    if df["in_tissue"].notna().any():
        df = df[df["in_tissue"] == 1].copy()

    return df[["barcode","px","py","arr_row","arr_col"]]

def _read_counts(sample_dir: Path):
    import scanpy as sc
    mtx_dir = None
    for cand in ["filtered_feature_bc_matrix","raw_feature_bc_matrix"]:
        hit = list(sample_dir.rglob(f"{cand}/matrix.mtx*"))
        if hit:
            mtx_dir = hit[0].parent
            break
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=UserWarning)
        if mtx_dir is not None:
            adata = sc.read_10x_mtx(mtx_dir, var_names="gene_symbols", make_unique=True)
        else:
            h5_hits = sorted([p for p in sample_dir.rglob("*.h5")])
            if h5_hits:
                try:
                    adata = sc.read_10x_h5(h5_hits[0], gex_only=True)
                except TypeError:
                    adata = sc.read_10x_h5(h5_hits[0])
            else:
                any_mtx = sorted([p for p in sample_dir.rglob("matrix.mtx*")])
                if not any_mtx:
                    raise FileNotFoundError("No 10x matrix found (MTX or H5).")
                adata = sc.read_10x_mtx(any_mtx[0].parent, var_names="gene_symbols", make_unique=True)

    adata.var_names_make_unique()
    adata.var_names = adata.var_names.astype(str)
    adata.obs_names = adata.obs_names.astype(str)

    if "counts" in adata.layers:
        counts = adata.layers["counts"]
        if hasattr(counts, "toarray"): counts = counts.toarray()
        counts = counts.astype(np.float64)
        lib = counts.sum(axis=1, keepdims=True); lib[lib==0] = 1.0
        X_cpm = (counts / lib) * 1e6
    else:
        X = adata.X
        if hasattr(X, "toarray"): X = X.toarray()
        X = X.astype(np.float64)
        mx = np.nanmax(X) if X.size else 0.0
        if mx > 50:
            lib = X.sum(axis=1, keepdims=True); lib[lib==0] = 1.0
            X_cpm = (X / lib) * 1e6
        else:
            X_cpm = np.expm1(X)
    return list(adata.obs_names), list(adata.var_names), X_cpm.astype(np.float32)

def _knn_edges(px, py, k=8):
    coords = np.c_[px, py]
    nbrs = NearestNeighbors(n_neighbors=min(k+1, len(coords))).fit(coords)
    neigh = nbrs.kneighbors(return_distance=False)
    rows, cols = [], []
    for i in range(neigh.shape[0]):
        for j in neigh[i, 1:]:
            a, b = (i, j) if i < j else (j, i)
            rows.append(a); cols.append(b)
    return np.unique(np.c_[rows, cols], axis=0)

def _inside_distance_raster(px, py, smooth_it=2):
    coords = np.c_[px, py]
    if len(coords) < 2: return np.zeros(len(coords))
    nbrs = NearestNeighbors(n_neighbors=2).fit(coords)
    dists, _ = nbrs.kneighbors(return_distance=True)
    step = np.median(dists[:,1]) / 2.0
    if not np.isfinite(step) or step <= 0: step = 50.0
    xmin, ymin = coords.min(axis=0); xmax, ymax = coords.max(axis=0)
    nx = int(np.ceil((xmax-xmin)/step)) + 6
    ny = int(np.ceil((ymax-ymin)/step)) + 6
    gx = np.clip(((coords[:,0]-xmin)/step).astype(int)+3, 0, nx-1)
    gy = np.clip(((coords[:,1]-ymin)/step).astype(int)+3, 0, ny-1)
    grid = np.zeros((ny, nx), dtype=bool); grid[gy, gx] = True
    grid = ndi.binary_dilation(grid, iterations=SMOOTH_IT)
    grid = ndi.binary_closing(grid, iterations=SMOOTH_IT)
    dist_in = ndi.distance_transform_edt(grid)
    inside = dist_in[gy, gx].astype(float)
    m = inside.max() if inside.size else 0.0
    return inside/m if m>0 else np.zeros_like(inside)

def _expand_prefixes(gene_list, prefixes):
    out = set()
    for p in prefixes:
        p = str(p)
        out.update([g for g in gene_list if g.startswith(p)])
    return sorted(out)

# --- Variant evaluator (FIXED rim threshold) ---
def _evaluate_variants_for_sample(sample_id, sample_dir, X_cpm, genes, barcodes, pos_df,
                                  rim_factors, cpm_variants, knn_k=8):
    pos = pos_df.set_index("barcode")
    common = pos.index.intersection(pd.Index(barcodes))
    if len(common)==0:
        raise ValueError(f"No overlapping barcodes for {sample_id}")
    b2i = {b:i for i,b in enumerate(barcodes)}
    idx = np.array([b2i[b] for b in common], dtype=int)
    pos = pos.loc[common].copy()
    X   = X_cpm[idx, :]
    g2i = {g:i for i,g in enumerate(genes)}
    px, py = pos["px"].values, pos["py"].values

    edges_und = _knn_edges(px, py, k=knn_k)
    directed  = np.vstack([np.c_[edges_und[:,0], edges_und[:,1]],
                           np.c_[edges_und[:,1], edges_und[:,0]]])

    # Compute inside-distance ONCE (it's normalized 0..1), then vary the rim threshold.
    inside_d = _inside_distance_raster(px, py, smooth_it=SMOOTH_IT)
    id_max = inside_d.max() if inside_d.size else 1.0

    results = []
    for rf in rim_factors:
        # FIX: threshold uses base rim frac scaled by factor, capped at max
        thr = min(1.0, (BASE_RIM_FRAC * rf)) * id_max  # <-- FIX
        rim_mask = inside_d <= thr
        region = np.where(rim_mask, "rim", "interior")
        p0 = float(rim_mask.mean()) if rim_mask.size else np.nan  # geometric baseline for this variant

        for cpm_th in cpm_variants:
            rows_counts = []
            for axis in AXES:
                name = axis["name"]
                ligs = [g for g in axis["lig"] if g in g2i]
                recs = [r for r in _expand_prefixes(genes, axis["rec_prefix"]) if r in g2i]
                if len(ligs)==0 or len(recs)==0:
                    rows_counts.append({"axis": name, "total":0, "rim":0, "interior":0})
                    continue
                lig_mat = np.max(np.column_stack([X[:, g2i[g]] for g in ligs]), axis=1) if len(ligs)>1 else X[:, g2i[ligs[0]]]
                rec_mat = np.max(np.column_stack([X[:, g2i[r]] for r in recs]), axis=1) if len(recs)>1 else X[:, g2i[recs[0]]]
                src_ok, tgt_ok = lig_mat>=cpm_th, rec_mat>=cpm_th
                src_idx, tgt_idx = directed[:,0], directed[:,1]
                keep = src_ok[src_idx] & tgt_ok[tgt_idx]
                called = directed[keep]
                if called.shape[0]==0:
                    rows_counts.append({"axis": name, "total":0, "rim":0, "interior":0})
                else:
                    src_regions = region[called[:,0]]
                    n_total = called.shape[0]
                    n_rim = int((src_regions=="rim").sum())
                    rows_counts.append({"axis": name, "total":int(n_total), "rim":n_rim, "interior":int(n_total-n_rim)})

            # Convert to WASR/CI rows
            for rc in rows_counts:
                n_total = rc["total"]; n_rim = rc["rim"]; n_int = rc["interior"]
                if n_total <= 0 or not np.isfinite(p0):
                    results.append({
                        "sample_id": sample_id, "axis": rc["axis"],
                        "var_rim_frac_factor": rf, "var_cpm_thres": float(cpm_th),
                        "var_total": int(n_total), "var_rim": int(n_rim), "var_interior": int(n_int),
                        "var_p0": float(p0) if np.isfinite(p0) else np.nan,
                        "var_WASR": np.nan, "var_CI_low": np.nan, "var_CI_high": np.nan
                    })
                else:
                    obs_share = n_rim / n_total
                    wasr = obs_share - p0
                    edge_vec = np.r_[np.ones(n_rim, dtype=np.int8), np.zeros(n_int, dtype=np.int8)]
                    boot = []
                    for _ in range(B_BOOT):
                        samp = rng.choice(edge_vec, size=edge_vec.size, replace=True)
                        boot.append(samp.mean() - p0)
                    ci_low, ci_high = np.percentile(boot, [2.5, 97.5])
                    results.append({
                        "sample_id": sample_id, "axis": rc["axis"],
                        "var_rim_frac_factor": rf, "var_cpm_thres": float(cpm_th),
                        "var_total": int(n_total), "var_rim": int(n_rim), "var_interior": int(n_int),
                        "var_p0": float(p0), "var_WASR": float(wasr),
                        "var_CI_low": float(ci_low), "var_CI_high": float(ci_high)
                    })
    return results

# --- Run across whitelist ---
present = {p.name for p in DATA_DIR.iterdir() if p.is_dir()}
samples = [s for s in SAMPLE_WHITELIST if s in present]
missing  = [s for s in SAMPLE_WHITELIST if s not in present]
print("[ROBUSTNESS] Samples to process:", samples)
if missing:
    print("[WARN] Missing from DATA_DIR:", missing)

all_rows = []
for sn in samples:
    sdir = DATA_DIR / sn
    try:
        pos_df = _load_spatial_and_scale(sdir)
        barcodes, genes, X_cpm = _read_counts(sdir)
        rows = _evaluate_variants_for_sample(sn, sdir, X_cpm, genes, barcodes, pos_df,
                                             rim_factors=RIM_FACTORS, cpm_variants=CPM_VARIANTS,
                                             knn_k=BASE_KNN_K)
        all_rows.extend(rows)
        print(f"[OK] Robustness variants computed — {sn}")
    except Exception as e:
        print(f"[WARN] Skipped {sn}: {e}")

df_var = pd.DataFrame(all_rows)

# Join with base calls
df_base_calls = (df_base.assign(base_call=lambda d: [_sign_from_ci(lo, hi) for lo,hi in zip(d["CI_low"], d["CI_high"])])
                        .rename(columns={"WASR":"base_WASR"}))[["sample_id","axis","base_call","base_WASR"]]
df_join = (df_var.merge(df_base_calls, how="left", on=["sample_id","axis"])
                 .assign(var_call=lambda d: [_sign_from_ci(lo, hi) for lo,hi in zip(d["var_CI_low"], d["var_CI_high"])])
                 .assign(flipped=lambda d: (d["base_call"] != d["var_call"]) & d["base_call"].notna() & d["var_call"].notna()))

# Save
OUT_CSV = FINAL_DIR / "robustness_sensitivity.csv"
df_join.to_csv(OUT_CSV, index=False)

# Console summary
print("\nRobustness summary — flipped counts by sample:")
summ = (df_join.groupby("sample_id")["flipped"].sum().reset_index()
               .rename(columns={"flipped":"n_flipped"}))
display(summ)

print("\nRobustness preview (head):")
display(df_join.head(20))


[ROBUSTNESS] Samples to process: ['GSE190811', 'GSE217414', 'GSE226997', 'GSE238004', 'GSE254364', 'GSE267401', 'GSE274103', 'Human_Breast_Cancer_Block_A_Section_1_2']
[OK] Robustness variants computed — GSE190811
[OK] Robustness variants computed — GSE217414
[OK] Robustness variants computed — GSE226997
[OK] Robustness variants computed — GSE238004
[OK] Robustness variants computed — GSE254364
[OK] Robustness variants computed — GSE267401
[WARN] Skipped GSE274103: Did not find file /Users/sally/Desktop/Stromal_Remodeling_&_Resistance_FFPE_rims/data/GSE274103/GSM8443453/filtered_feature_bc_matrix/matrix.mtx.gz.
[OK] Robustness variants computed — Human_Breast_Cancer_Block_A_Section_1_2

Robustness summary — flipped counts by sample:


Unnamed: 0,sample_id,n_flipped
0,GSE190811,0
1,GSE217414,0
2,GSE226997,1
3,GSE238004,2
4,GSE254364,0
5,GSE267401,3
6,Human_Breast_Cancer_Block_A_Section_1_2,1



Robustness preview (head):


Unnamed: 0,sample_id,axis,var_rim_frac_factor,var_cpm_thres,var_total,var_rim,var_interior,var_p0,var_WASR,var_CI_low,var_CI_high,base_call,base_WASR,var_call,flipped
0,GSE190811,VEGFA_to_KDR,0.75,1.0,15802,2347,13455,0.202524,-0.053999,-0.058844,-0.048173,-,-0.045636,-,False
1,GSE190811,CXCL12_to_CXCR4,0.75,1.0,19437,2999,16438,0.202524,-0.048231,-0.053221,-0.042363,-,-0.030039,-,False
2,GSE190811,TGFB1_to_TGFBR_fam,0.75,1.0,26566,3691,22875,0.202524,-0.063587,-0.067674,-0.059577,-,-0.053676,-,False
3,GSE190811,SPP1_to_ITG_fam,0.75,1.0,21789,3436,18353,0.202524,-0.04483,-0.049422,-0.039914,-,-0.035,-,False
4,GSE190811,VEGFA_to_KDR,1.0,1.0,15802,3577,12225,0.286058,-0.059694,-0.065612,-0.053076,-,-0.045636,-,False
5,GSE190811,CXCL12_to_CXCR4,1.0,1.0,19437,4703,14734,0.286058,-0.044096,-0.049869,-0.037815,-,-0.030039,-,False
6,GSE190811,TGFB1_to_TGFBR_fam,1.0,1.0,26566,5800,20766,0.286058,-0.067734,-0.074024,-0.062444,-,-0.053676,-,False
7,GSE190811,SPP1_to_ITG_fam,1.0,1.0,21789,5164,16625,0.286058,-0.049057,-0.054757,-0.042927,-,-0.035,-,False
8,GSE190811,VEGFA_to_KDR,1.25,1.0,15802,4592,11210,0.356771,-0.066175,-0.074561,-0.059358,-,-0.045636,-,False
9,GSE190811,CXCL12_to_CXCR4,1.25,1.0,19437,6265,13172,0.356771,-0.034447,-0.040959,-0.028092,-,-0.030039,-,False


In [None]:
## Interpretation (Cell 6 — robustness to rim thickness & CPM threshold):
## Calls are broadly stable: most slides show 0–1 flipped CI sign across variants, with a few borderline flips on weak-effect/low-n axes.
## Recomputing the geometric baseline p0 for each rim factor behaved correctly (p0 rises with thicker rims, drops with thinner rims),
## so changes in WASR reflect biological signal sensitivity rather than artifacts of geometry.
## Where flips occur (e.g., select axes in GSE267401 and GSE238004), they cluster near WASR≈0 and should be interpreted cautiously.
## GSE274103 was skipped due to a missing matrix path and is not included in robustness tallies.
## Overall, the rim-versus-interior conclusions are resilient to ±25% rim thickness changes under CPM≥1.0, supporting the main findings.


In [9]:
# Cell 7 — Compact narrative & “how to audit” checklist (FFPE rims)
# Rationale:
# - Summarize the full FFPE rims analysis from Cells 3–6 in a human-readable form.
# - Provides both a per-sample digest (WASR, CI, n) and a per-axis tally of calls (+/0/−).
# - Includes a robustness section showing flipped counts from Cell 6.
# - Helps future readers quickly interpret results and see how to re-run or audit them.
#
# Output:
# - final_results/FFPE_rims_summary.md  ← summary markdown for README
# - console preview (first ~60 lines)

from pathlib import Path
import pandas as pd
import numpy as np

# Optional: tabulate support for markdown tables
try:
    import tabulate
    _HAVE_TABULATE = True
except ImportError:
    _HAVE_TABULATE = False

PROJECT_ROOT = Path("/Users/sally/Desktop/SpatialMMKPNN-Apps2").resolve()
FINAL_DIR    = PROJECT_ROOT / "final_results"
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# ---- Load CSVs ----
def _read_csv(p, cols=None):
    if p.exists():
        df = pd.read_csv(p)
        if cols:
            for c in cols:
                if c not in df.columns:
                    raise ValueError(f"Missing column '{c}' in {p}")
        return df
    return pd.DataFrame(columns=cols or [])

df_counts = _read_csv(FINAL_DIR/"axis_edge_counts.csv",
                      ["sample_id","axis","total","rim","interior"])
df_bands  = _read_csv(FINAL_DIR/"axis_bands.csv",
                      ["sample_id","axis","band","count","share"])
df_enr    = _read_csv(FINAL_DIR/"rim_enrichment.csv",
                      ["sample_id","axis","n_total","n_rim","n_interior","p0",
                       "obs_rim_share","WASR","dEdges","CI_low","CI_high",
                       "null_mean","null_std"])
df_rob    = _read_csv(FINAL_DIR/"robustness_sensitivity.csv",
                      ["sample_id","axis","base_call","base_WASR","var_rim_frac_factor",
                       "var_cpm_thres","var_total","var_rim","var_interior","var_p0",
                       "var_WASR","var_CI_low","var_CI_high","flipped","var_call"])

# ---- Derive base_call from CI if needed ----
def _sign_from_ci(lo, hi):
    if pd.isna(lo) or pd.isna(hi): return "NA"
    if lo > 0: return "+"
    if hi < 0: return "-"
    return "0"

if "base_call" not in df_enr.columns:
    df_enr = df_enr.assign(base_call=[_sign_from_ci(lo, hi) for lo,hi in zip(df_enr["CI_low"], df_enr["CI_high"])])

# ---- Per-axis tally ----
axis_tallies = (df_enr.groupby(["axis","base_call"])
                     .size()
                     .unstack(fill_value=0)
                     .reindex(columns=["+","0","-","NA"], fill_value=0)
                     .reset_index()
               )

# ---- Per-sample digest ----
sample_lines = []
for sid, g in df_enr.groupby("sample_id", sort=False):
    parts = []
    for _, r in g.sort_values("axis").iterrows():
        parts.append(f'{r["axis"]}: {r["base_call"]} (WASR={r["WASR"]:.3f}, CI=[{r["CI_low"]:.3f},{r["CI_high"]:.3f}], n={int(r["n_total"])}, p0={r["p0"]:.3f})')
    sample_lines.append(f"- **{sid}** — " + " | ".join(parts))

# ---- Robustness summary ----
if not df_rob.empty:
    flip_summary = (df_rob.groupby("sample_id")["flipped"]
                           .apply(lambda s: int(np.nansum(s.astype(bool))))
                           .reset_index()
                           .rename(columns={"flipped":"n_flipped"}))
    flips_total = int(np.nansum(df_rob["flipped"].astype(bool))) if "flipped" in df_rob.columns else 0
else:
    flip_summary = pd.DataFrame(columns=["sample_id","n_flipped"])
    flips_total = 0

# ---- Detect skipped datasets ----
skipped = []
for sid, g in df_counts.groupby("sample_id", sort=False):
    if int(g["total"].sum()) == 0:
        skipped.append(sid)

# ---- Assemble markdown ----
lines = []
lines.append("# FFPE Rims: SpatialMMKPNN readout\n")
lines.append("**Cohort (final whitelist):** GSE190811, GSE217414, GSE226997, GSE238004, GSE254364, GSE267401, GSE274103, Human_Breast_Cancer_Block_A_Section_1_2.\n")
if skipped:
    lines.append(f"**Skipped during edge calling:** {', '.join(skipped)}.\n")

lines.append("## What we measure\n")
lines.append("- Count directed ligand→receptor edges over a geometry-only kNN graph (k=8).")
lines.append("- Classify source spots as rim vs interior using distance-to-edge (RIM_FRAC≈0.18).")
lines.append("- WASR = observed rim share − baseline p0 (p0 = rim fraction from Cell 3). CI via bootstrap.\n")

lines.append("## Per-axis tally across samples (calls by CI)\n")
if not axis_tallies.empty:
    if _HAVE_TABULATE:
        lines.append(axis_tallies.rename(columns={"+":"plus","0":"zero","-":"minus","NA":"na"}).to_markdown(index=False))
    else:
        lines.append(axis_tallies.to_csv(index=False))
else:
    lines.append("_No edges found; table empty._")

lines.append("\n## Per-sample digest\n")
lines.extend(sample_lines if sample_lines else ["_No samples with edges to summarize._"])

lines.append("\n## Robustness (±25% rim thickness; CPM≥1.0)\n")
if not df_rob.empty:
    lines.append(f"- **Total flipped calls:** {flips_total}")
    if not flip_summary.empty:
        if _HAVE_TABULATE:
            lines.append(flip_summary.to_markdown(index=False))
        else:
            lines.append(flip_summary.to_csv(index=False))
    lines.append("- Most calls are stable to rim thickness changes; flips mark borderline cases.")
else:
    lines.append("_Robustness panel not available._")

lines.append("\n## How to audit this notebook\n")
audit = [
    "Re-run Cell 3 to confirm rim fractions in `region_summary.csv` (~0.24–0.29).",
    "Re-run Cell 4 to rebuild edges and verify `axis_edge_counts.csv` totals.",
    "Re-run Cell 5 to recompute WASR and bootstrap CI from edge labels.",
    "Re-run Cell 6 to check rim thickness sensitivity (`robustness_sensitivity.csv`).",
    "Compare Cell 4/5 edge totals with spot counts to detect possible sparsity artifacts.",
    "Spot-check CPM≥1 thresholding on rim vs interior barcodes for a few axes."
]
lines.extend([f"- {x}" for x in audit])

# ---- Write & preview ----
out_md = FINAL_DIR / "FFPE_rims_summary.md"
out_md.write_text("\n".join(lines), encoding="utf-8")
print(f"[WRITE] {out_md}")
print("\n".join(lines[:60]) + ("\n...\n" if len(lines)>60 else ""))


[WRITE] /Users/sally/Desktop/SpatialMMKPNN-Apps2/final_results/FFPE_rims_summary.md
# FFPE Rims: SpatialMMKPNN readout

**Cohort (final whitelist):** GSE190811, GSE217414, GSE226997, GSE238004, GSE254364, GSE267401, GSE274103, Human_Breast_Cancer_Block_A_Section_1_2.

## What we measure

- Count directed ligand→receptor edges over a geometry-only kNN graph (k=8).
- Classify source spots as rim vs interior using distance-to-edge (RIM_FRAC≈0.18).
- WASR = observed rim share − baseline p0 (p0 = rim fraction from Cell 3). CI via bootstrap.

## Per-axis tally across samples (calls by CI)

axis,+,0,-,NA
CXCL12_to_CXCR4,3,1,2,1
SPP1_to_ITG_fam,1,0,6,0
TGFB1_to_TGFBR_fam,1,0,6,0
VEGFA_to_KDR,1,1,5,0


## Per-sample digest

- **GSE190811** — CXCL12_to_CXCR4: - (WASR=-0.030, CI=[-0.036,-0.024], n=19437, p0=0.272) | SPP1_to_ITG_fam: - (WASR=-0.035, CI=[-0.041,-0.029], n=21789, p0=0.272) | TGFB1_to_TGFBR_fam: - (WASR=-0.054, CI=[-0.059,-0.049], n=26566, p0=0.272) | VEGFA_to_KDR: - (WASR=-0.046, CI

In [11]:
#### Cell 8 — Minimal plots
# WHAT: Two quick matplotlib plots — (A) WASR±CI per axis/sample, (B) band (B1–B5) shares for one sample.
# WHY: Give a fast visual check of Cell 5–6 outputs and create assets for the README.
# HOW: Load rim_enrichment.csv + axis_bands.csv, plot, and save to assets/.
# GETS: ffpe_rims_wasr_by_axis.png and ffpe_rims_bands_example.png


from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Paths ---
PROJECT_ROOT = Path("/Users/sally/Desktop/SpatialMMKPNN-Apps2").resolve()
FINAL_DIR    = PROJECT_ROOT / "final_results"
ASSETS_DIR   = PROJECT_ROOT / "assets"
ASSETS_DIR.mkdir(parents=True, exist_ok=True)

enr_csv   = FINAL_DIR / "rim_enrichment.csv"
bands_csv = FINAL_DIR / "axis_bands.csv"

# --- Read inputs (fail-fast with clear hints) ---
if not enr_csv.exists():
    raise FileNotFoundError(f"Missing {enr_csv}. Run Cells 4–5 first.")
df_enr = pd.read_csv(enr_csv)

if not bands_csv.exists():
    df_bands = pd.DataFrame(columns=["sample_id","axis","band","count","share"])
else:
    df_bands = pd.read_csv(bands_csv)

# ---------------- (A) Per-sample WASR by axis with 95% CIs ----------------
# tidy: axis order for consistent plotting
axis_order = ["VEGFA_to_KDR","CXCL12_to_CXCR4","TGFB1_to_TGFBR_fam","SPP1_to_ITG_fam"]
df_enr["axis"] = pd.Categorical(df_enr["axis"], categories=axis_order, ordered=True)
df_enr = df_enr.sort_values(["sample_id","axis"])

samples = df_enr["sample_id"].dropna().unique().tolist()
if len(samples) == 0:
    raise ValueError("No rows in rim_enrichment.csv to plot. Did Cell 5 produce results?")

# figure sizing: one small row per sample
fig_h = max(2.5, 0.9 * len(samples))
fig, axes = plt.subplots(nrows=len(samples), ncols=1, figsize=(8, fig_h), sharex=True)
if len(samples) == 1:
    axes = [axes]

for ax, sid in zip(axes, samples):
    g = df_enr[df_enr["sample_id"] == sid].copy()
    g = g.sort_values("axis")
    x = np.arange(len(g))
    y = g["WASR"].values
    lo = g["CI_low"].values
    hi = g["CI_high"].values
    # guard: if any CI bound is NaN, set symmetric zero-length errorbar (renders points but no bar)
    yerr_lower = np.where(np.isfinite(lo), y - lo, 0.0)
    yerr_upper = np.where(np.isfinite(hi), hi - y, 0.0)

    ax.axhline(0, linestyle="--", linewidth=1)
    ax.errorbar(x, y, yerr=[yerr_lower, yerr_upper], fmt="o", capsize=3)
    ax.set_ylabel(sid, rotation=0, ha="right", va="center")
    ax.set_xlim(-0.5, max(-0.5, len(x) - 0.5))

axes[-1].set_xticks(range(len(axis_order)))
axes[-1].set_xticklabels(axis_order, rotation=20, ha="right")
fig.suptitle("FFPE rims — WASR by axis (95% CI)")
fig.tight_layout(rect=[0, 0, 1, 0.97])

out_a = ASSETS_DIR / "ffpe_rims_wasr_by_axis.png"
fig.savefig(out_a, dpi=150)
plt.close(fig)
print(f"[WRITE] {out_a}")

# ---------------- (B) Distance-band shares (B1..B5) for one sample ----------------
# choose a representative sample that appears in bands; fallback to first sample if needed
rep_sid = None
bands_sids = df_bands["sample_id"].dropna().unique().tolist()
for sid in samples:
    if sid in bands_sids:
        rep_sid = sid
        break
if rep_sid is None and len(bands_sids) > 0:
    rep_sid = bands_sids[0]

if rep_sid is not None:
    g = df_bands[df_bands["sample_id"] == rep_sid].copy()
    if g.empty:
        print(f"[WARN] No band rows for {rep_sid}; skipping band plot.")
    else:
        # canonical band order
        band_order = ["B1:0-10%","B2:10-25%","B3:25-50%","B4:50-75%","B5:75-100%"]
        g["band"] = pd.Categorical(g["band"], categories=band_order, ordered=True)
        g = g.sort_values(["axis","band"])
        axes_u = g["axis"].dropna().unique().tolist()

        fig2, ax2 = plt.subplots(figsize=(8, 3.0 + 0.4 * max(1, len(axes_u))))
        # stacked horizontal bars by axis
        bottom = np.zeros(len(axes_u))
        for b in band_order:
            row = (g[g["band"] == b]
                   .set_index("axis")
                   .reindex(axes_u)["share"]
                   .fillna(0.0)
                   .values)
            ax2.barh(axes_u, row, left=bottom, label=b)
            bottom += row
        ax2.set_xlim(0, 1.0)
        ax2.set_xlabel("Share of edges")
        ax2.set_title(f"Edge distance bands by axis — {rep_sid}")
        # NOTE: older matplotlib uses 'ncol' (singular)
        ax2.legend(title="Bands", ncol=3, fontsize=8)
        fig2.tight_layout()

        out_b = ASSETS_DIR / "ffpe_rims_bands_example.png"
        fig2.savefig(out_b, dpi=150)
        plt.close(fig2)
        print(f"[WRITE] {out_b}")
else:
    print("[WARN] No sample available for band plot (axis_bands.csv is empty or lacks matching sample ids).")


[WRITE] /Users/sally/Desktop/SpatialMMKPNN-Apps2/assets/ffpe_rims_wasr_by_axis.png
[WRITE] /Users/sally/Desktop/SpatialMMKPNN-Apps2/assets/ffpe_rims_bands_example.png


In [None]:
## Interpretation (Cell 8 — minimal plots):
## The WASR-by-axis panel summarizes Cell 5 results visually: points are WASR and bars are 95% CIs.
## Axes with CIs wholly below 0 indicate interior-weighted sources (common for SPP1→ITG, TGFB1→TGFBR, VEGFA→KDR);
## axes with CIs crossing 0 are borderline/neutral; CIs above 0 indicate rim enrichment (seen on a minority for CXCL12→CXCR4).
## Stacked band plots (B1..B5) for the representative sample show where edges concentrate relative to the boundary:
## higher B1/B2 segments indicate boundary-skewed interactions; more B3–B5 indicates interior bias.
## Together, these visuals corroborate the textual calls: most slides show interior-biased signaling,
## with a few rim-enriched exceptions, and borderline cases present as points whose CIs overlap zero.
## Plots were written to assets/ (ffpe_rims_wasr_by_axis.png, ffpe_rims_bands_example.png) for README inclusion.


In [None]:
## Biological interpretation — FFPE rims (Stromal Remodeling & Therapy Resistance):

## Most slides show interior-biased signaling for SPP1→ITG, TGFB1→TGFBR*, and VEGFA→KDR,
## suggesting stromal remodeling, fibrosis, and angiogenic activity are maintained deep
## within the tumor–stroma interface rather than confined to the margin.
## CXCL12→CXCR4 shows rim enrichment only on a few slides, indicating boundary chemokine
## gradients are not universal but may mark localized immune/stromal gating.
## Overall, this points to a diffuse, therapy-resistant stroma where matrix remodeling
## and TGF-β–driven reprogramming extend inward, while margin chemokine barriers appear
## sporadically across cases.
