## 1) Configure paths
Edit these paths to point to your local files.

In [2]:

import os
import pandas as pd
from pathlib import Path

# ---------------- paths ----------------
# Your Google Drive base
BASE = Path("/Users/scottpowers/Library/CloudStorage/GoogleDrive-Scott.Powers@stonybrook.edu/My Drive/Imputed Folder")

# DepMap (update these two if you’ve moved them)
DEP_EXP   = BASE / "PDAC_celllines_expression.csv"      # genes x cell_lines
DEP_CHRON = BASE / "PDAC_gene_dependencies.csv"         # genes x cell_lines

# Use the outputs we just generated on Desktop
PRISM_LONG   = Path("~/Desktop/PRISM_AUC_PDAC_long.csv").expanduser()     # depmap_id, ccle_name, compound, auc
PRISM_MATRIX = Path("~/Desktop/PRISM_AUC_PDAC_matrix.csv").expanduser()   # rows=cell_lines, cols=compounds
PRISM_PARAMS = Path("~/Desktop/secondary-screen-dose-response-curve-parameters.csv").expanduser()  # for target/MOA



# Output dir
OUTDIR = Path("./out_modules_notebook")
OUTDIR.mkdir(parents=True, exist_ok=True)

# ---------------- helpers ----------------
def read_any(path: Path) -> pd.DataFrame:
    """Smart CSV/TSV reader with header guess; returns DataFrame."""
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Missing file: {path}")
    try:
        return pd.read_csv(path)
    except Exception:
        try:
            return pd.read_csv(path, sep="\t")
        except Exception:
            return pd.read_csv(path, sep=None, engine="python")

# ---------------- load PRISM ----------------
# Matrix we made is cell_lines x compounds; your code wants drugs x cell_lines
A = read_any(PRISM_MATRIX)
# Ensure the first column is the index if it got saved with an unnamed index
if A.columns[0].lower() in {"unnamed: 0", ""}:
    A = A.set_index(A.columns[0])

# A: rows = cell_lines, cols = compounds
# Convert to drugs x cell_lines
DEP_PRISM = A.T.copy()
DEP_PRISM.index.name = "compound"     # drugs
DEP_PRISM.columns.name = "ccle_name"  # cell lines

# Optional: build annotation from the params file (target/MOA)
annot = None
if PRISM_PARAMS.exists():
    params = read_any(PRISM_PARAMS)
    # normalize headers
    params.columns = [str(c).strip().lower() for c in params.columns]
    # In your file drug name is 'name'; align to 'compound'
    cols_to_keep = [c for c in ["name", "compound", "target", "moa"] if c in params.columns]
    if "name" in cols_to_keep and "compound" not in cols_to_keep:
        params = params.rename(columns={"name": "compound"})
    annot = params[["compound"] + [c for c in ["target", "moa"] if c in params.columns]].drop_duplicates()
    # Keep only drugs present in the matrix
    annot = annot[annot["compound"].isin(DEP_PRISM.index)].set_index("compound")
    annot.to_csv(OUTDIR / "PRISM_drug_annotations.csv")

# Save the drug x cell_lines AUC matrix for downstream code expecting that orientation
DEP_PRISM_PATH = OUTDIR / "PRISM_AUC_PDAC_drugs_by_celllines.csv"
DEP_PRISM.to_csv(DEP_PRISM_PATH)

print("PRISM (drugs x cell_lines):", DEP_PRISM.shape, "->", DEP_PRISM_PATH)

# ---------------- load expression/dependency (optional, if paths are real) ---
def try_load(label, p: Path):
    try:
        df = read_any(p)
        print(f"{label}:", df.shape, "from", p)
        return df
    except FileNotFoundError:
        print(f"{label}: MISSING -> {p}")
        return None

EXP   = try_load("Expression", DEP_EXP)
CHRON = try_load("Dependencies", DEP_CHRON)


# ---------------- quick preview ----------------
print("\nPreview AUC (first 5 drugs × 5 lines):")
display(DEP_PRISM.iloc[:5, :5])

if annot is not None:
    print("\nPreview annotations (first 10):")
    display(annot.head(10))


  return pd.read_csv(path)


PRISM (drugs x cell_lines): (1449, 33) -> out_modules_notebook/PRISM_AUC_PDAC_drugs_by_celllines.csv
Expression: (51, 19099) from /Users/scottpowers/Library/CloudStorage/GoogleDrive-Scott.Powers@stonybrook.edu/My Drive/Imputed Folder/PDAC_celllines_expression.csv
Dependencies: (45, 17917) from /Users/scottpowers/Library/CloudStorage/GoogleDrive-Scott.Powers@stonybrook.edu/My Drive/Imputed Folder/PDAC_gene_dependencies.csv

Preview AUC (first 5 drugs × 5 lines):


ccle_name,0,1,2,3,4
compound,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ccle_name,ASPC1_PANCREAS,BXPC3_PANCREAS,CAPAN2_PANCREAS,CFPAC1_PANCREAS,DANG_PANCREAS
1-azakenpaullone,1.422871,1.442328,1.1974,1.143525,0.847678
1-naphthyl-PP1,0.868061,1.004497,0.865685,0.861098,
1-phenylbiguanide,0.943366,1.345432,0.884344,0.954388,0.907823
10-deacetylbaccatin,0.887246,,0.858276,0.889449,0.747788



Preview annotations (first 10):


Unnamed: 0_level_0,target,moa
compound,Unnamed: 1_level_1,Unnamed: 2_level_1
cytarabine,"POLA1, POLB, POLD1, POLE",ribonucleotide reductase inhibitor
epinastine,"ADRA1A, ADRA2A, HRH1, HRH2, HTR2A, HTR7",histamine receptor antagonist
floxuridine,TYMS,DNA synthesis inhibitor
valrubicin,TOP2A,"DNA inhibitor, topoisomerase inhibitor"
adapalene,"RARA, RARB, RARG, RXRA, RXRB, RXRG",retinoid receptor agonist
colforsin-daproate,,adenylyl cyclase activator
sulfamethazine,,PABA antagonist
niridazole,,phosphofructokinase inhibitor
amprolium,,thiamine uptake blocker
methylphenidate,"SLC6A2, SLC6A3, SLC6A4",dopamine-norepinephrine reuptake inhibitor


## 2) Define module gene sets
Replace the placeholders with your finalized core gene lists. You can also load from CSVs.

## 3) Utilities

In [36]:

from __future__ import annotations
import numpy as np
from typing import Dict, List
from scipy import stats

# Optional packages
try:
    import gseapy as gp
except Exception:
    gp = None

def bh_fdr(p: pd.Series) -> pd.Series:
    p = pd.Series(p).astype(float)
    n = p.notna().sum()
    order = p.sort_values().index
    ranks = pd.Series(range(1, n + 1), index=order)
    q = p.copy()
    q.loc[order] = (p.loc[order] * n / ranks).cummin().clip(upper=1.0)
    return q

def zscore_mat(M: pd.DataFrame, axis: int = 0) -> pd.DataFrame:
    return M.apply(lambda x: (x - x.mean()) / (x.std(ddof=0) + 1e-9), axis=axis)

def intersect_genes(genes: List[str], ref_index: pd.Index) -> List[str]:
    s = pd.Index(genes).dropna().astype(str)
    inter = s.intersection(ref_index.astype(str))
    return inter.tolist()

def module_scores_ssgsea(expr: pd.DataFrame, modules: Dict[str, List[str]], force_mean_z: bool=False) -> pd.DataFrame:
    \"\"\"expr: genes x samples -> returns modules x samples.\"\"\"
    if force_mean_z or gp is None:
        Z = zscore_mat(expr, axis=1)
        out = {}
        for m, g in modules.items():
            keep = intersect_genes(g, Z.index)
            out[m] = Z.loc[keep].mean(axis=0)
        return pd.DataFrame(out).T

    x = expr.T
    ss = gp.ssgsea(data=x, gene_sets=modules, sample_norm_method=None, outdir=None, processes=4, format="png")
    df = pd.DataFrame({k: pd.Series(v) for k, v in ss.res2dn.items()}).T
    return df

def corr_against_matrix(module_scores: pd.Series, M: pd.DataFrame, method: str = "spearman") -> pd.DataFrame:
    common = module_scores.index.intersection(M.columns)
    if len(common) < 5:
        raise ValueError("Too few overlapping samples for correlation.")
    y = module_scores.loc[common]
    X = M.loc[:, common]

    rs, ps, feats = [], [], []
    for feat, row in X.iterrows():
        feats.append(feat)
        if method == "spearman":
            r, p = stats.spearmanr(row.values, y.values, nan_policy='omit')
        else:
            r, p = stats.pearsonr(row.values, y.values)
        rs.append(r); ps.append(p)
    out = pd.DataFrame({"rho": rs, "p": ps}, index=feats)
    out["q"] = bh_fdr(out["p"])
    return out.sort_values("rho", ascending=False)


SyntaxError: unexpected character after line continuation character (2253102615.py, line 30)

## 4) DepMap integration

In [5]:
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path
import re

# ==================== CONFIG ====================
BASE = Path("~/Desktop").expanduser()
EXPR  = BASE / "PDAC_celllines_expression.csv"     # may be genes×lines OR lines×genes
PRISM = BASE / "PRISM_AUC_PDAC_matrix.csv"         # drugs × cell lines (IDs or names)
OUTDIR = BASE / "prism_module_from_expression_out"
OUTDIR.mkdir(exist_ok=True, parents=True)

# === DepMap_ID → human-readable cell line name (PDAC panel) ===
CELLNAME_MAP = {
    "ACH-000022": "PATU8988S",
    "ACH-000023": "PATU8988T",
    "ACH-000094": "HPAFII",
    "ACH-000108": "KP3",
    "ACH-000114": "SU8686",
    "ACH-000118": "HUPT3",
    "ACH-000138": "CFPAC1",
    "ACH-000178": "HS766T",
    "ACH-000205": "PK59",
    "ACH-000213": "HUPT4",
    "ACH-000222": "ASPC1",
    "ACH-000265": "KP4",
    "ACH-000307": "PK1",
    "ACH-000332": "YAPC",
    "ACH-000354": "CAPAN1",
    "ACH-000502": "TCCPAN2",
    "ACH-000517": "SNU410",
    "ACH-000652": "SUIT2",
    "ACH-000685": "L33",
    "ACH-001376": "PACADD135",
    "ACH-001379": "PACADD161",
    "ACH-001380": "PACADD165",
    "ACH-001382": "PACADD188",
    "ACH-002039": "PK8",
    "ACH-003161": "ABMT9430",
    "ACH-003433": "CCLFPANC0019T",
}
NAME_TO_ID = {v: k for k, v in CELLNAME_MAP.items()}
ACH_RE = re.compile(r"^ACH-\d{6}$", re.I)

# --- Define module gene sets (UP/DOWN) ---
SAT_UP = [
    "ABHD8","AC004870.4","AC005920.1","AC009041.1","AC009309.1","AC011498.1","AC012447.1","AC018521.5",
    "AC018754.1","AC027237.2","AC068338.2","AC072061.1","AC079305.3","AC079807.1","AC087623.2","AC090403.1",
    "AC091271.1","AC092287.1","AC092910.3","AC093323.1","AC099778.1","AC107959.2","AC125611.3","AC144652.1",
    "AC239799.2","AC253572.2","ACBD7","AFMID","AHCY","AL021155.5","AL022069.1","AL031963.3","AL049869.2",
    "AL121574.1","AL133523.1","AL139106.1","AL139246.5","AL355075.4","AL360012.1","AL365436.2","AL592295.5",
    "AL662844.4","AP001160.1","AP002381.2","AP002813.1","ARL4D","ATP2B1-AS1","ATRIP","BAMBI","BHLHE40-AS1",
    "BOLA1","BUD23","C12orf65","C19orf48","C2CD4B","C6orf120","CABYR","CCDC9","CCNE2","CDKN2AIP","CHCHD7",
    "CITED2","CROCC","CSKMT","CTH","DALRD3","DRAIC","DUSP28","EAF2","EIF4A3","FOXA3","FOXL1","GADD45B","GLA",
    "GOT1","GRPEL1","GTF2A1","GTF2B","HEXIM1","HIST1H2AG","HIST1H2AH","HIST1H2AL","HIST1H2BJ","HIST1H2BN",
    "HIST1H3A","HIST1H3J","HIST1H4A","HIST1H4C","HIST1H4E","HIST2H2AC","HIST2H3PS2","HIST3H2A","HIST4H4","HMBS",
    "HSPA2","ID2","IDI1","ING1","KCTD5","KIF9","KLHL11","LAP3","LIFR-AS1","LINC01970","LINC02029","LINC02363",
    "LRG1","LRTOMT","MAFB","MED29","MEPCE","MIR17HG","MORF4L2-AS1","MTHFD2","MYCL","MYOSLID","NANOS1","NPW",
    "NRARP","OAT","OSER1-DT","OSGIN1","PHYH","PICART1","PIEZO1","PIK3R3","PLIN5","PLK2","PMAIP1","PMEL","PNKD",
    "POU3F1","PPP1R3C","PRMT5-AS1","PRR3","PTCH2","PTPN6","RAB26","RALY-AS1","RASL11A","RND1","RNF223","RUVBL2",
    "SAE1","SENP8","SIAH2-AS1","SIRT2","SLC7A5","SNHG12","SNHG5","SNHG8","SREBF2-AS1","SRSF7","STARD5","TBPL1",
    "TCTA","THAP9","TLCD1","TM7SF2","TMEM107","TMEM171","TMEM69","TNFRSF10D","TNK1","TRAM2-AS1","TTC33","UAP1",
    "UBAC2-AS1","UBE2D3-AS1","UBE2S","UGDH","WDR74","Z93241.1","Z99127.4","ZC3H10","ZCWPW1","ZFAS1","ZFX-AS1",
    "ZNF574","ZNF584","ZNF622","ZNF687-AS1","ZNF844","ZNF92","ZSWIM3"
]
SAT_DOWN = [
    "PAX5","AC117386.2","PRSS55","RPS16","GDF7","PAK4","AC022144.1","AC092745.5","AL670729.3",
    "DLEU2L","ELP3","KCNC2","MAP4K1","AL161729.4","SV2C","RGS11","AC005498.1","WFDC5","PSENEN",
    "LINC01956","AC115485.1","CYSLTR2","ASMTL-AS1","AP002001.3","FAM153B"
]
IGE_UP = [
    "PFN2","SET","LGALS1","PDLIM4","EIF5A","F8A1","BANF1","CTDNEP1","TRAPPC1","AMZ2","PTGES3","ST13",
    "FTH1","PCBP1","NAA10","ARPC5L","ETFB","CCDC124","SSR2","PGLS","SELENOH","GSTP1","NDUFB7","AURKAIP1",
    "GCHFR","MYL6","AP2S1","S100A13","C9ORF16","KRT8","PRKCSH","CST3","PET100","DNAJA2","VPS35","VDAC1",
    "MGST3","PRR13","OCIAD2","FUOM","MIA","RPN1","FUCA2","TMED4","ERGIC3","DDOST","NAXE","SNRPD1","SKP1",
    "CNBP","ATP5MC3","SLC25A5","LSM4","NDUFAB1","PCBD1","HINT1","ADI1","NENF","MRPL43","VKORC1","EMC4",
    "COA3","NEDD8","CHCHD2","PRDX5","WDR83OS","NDUFA4","GABARAP","PHPT1","UFC1","MDH1","ATP5F1E","MRPL41",
    "UBL5","COX5B","ELOB","ATP5ME","UQCRB","SEM1","NDUFB1","COX6C","ATP6V1F","HNRNPA1","HIGD2A","POLR2I",
    "METTL26","NDUFB4","OST4","C19ORF53","RPL36AL","PPIA","COPS9","COX8A","COX5A","RPL36","RPS15","RPS27",
    "RPS11","FTL","SNHG29","NACA","RPS17","RPL23","RPL39","RPS21","RPL9","RPL15","RPL34","RPL27A","RPL36A",
    "RPL38","RPS26","RPS14","RPL35","RPL37A","RPL37","RPL12","RPS29"
]
IGE_DOWN = [
    "STT3A","SGPP1","LINGO1","ASS1","CETN2","HNRNPH1","STN1","NRG4","IQSEC2","IYD","CHD9","APBB1IP",
    "TMEM238","REX1BD","IFI27","CYBA","METRNL","AKR7A2","SULT1A1","ARHGEF35","AC008397.1","KLK11",
    "DELE1","MISP","FAM234A","CMBL"
]
IL2_UP = [
    "ABHD15-AS1","AC007780.1","AC008105.3","AC008964.1","AC016831.1","AC016831.5","AC090772.1","AC108863.2",
    "ADARB1","ALOX5AP","ANKH","ANKRD44","APOBEC3G","ARHGDIB","ARL6IP5","ARSG","ATP10D","BCL11B","BORCS5",
    "CAMK1D","CD4","CD84","CD96","CDC42EP3","CELF2","CERS4","CLEC2D","CNOT6L","CRYBG1","CTSW","DSE","FGD3",
    "FMNL1","FOXN3","FOXO1","FYB1","GFI1","GNAO1","GPRIN3","HOPX","IGF1","IKZF3","IL18R1","INKA2","INSYN2B",
    "IQSEC1","ITPRIPL1","JAK3","KLRC2","KLRC3","KLRF1","KLRG1","LAPTM5","LCP1","LEPROTL1","LINC00513",
    "LINC01237","MAN1A1","MAPRE2","MPHOSPH9","MPP7","MVB12B","MYO5A","NIN","PARP11","PARP15","PCED1B-AS1",
    "PDE3B","PIP4K2A","PLCL1","PLEKHA2","PPP3CC","PRKCH","PRKCQ","PRKD3","PRKX","PTPN22","RAC2","RASGRF2",
    "RFX3","RIPOR2","RNF166","S1PR4","SAMD3","SENP7","SH2B3","SH2D2A","SMARCA2","SNHG26","SPOCK2","SRGN",
    "ST8SIA1","STAT5A","STAT5B","STIM1","STK17A","TMEM200A","TMX4","TRBC2","TRDC","TRIM22","TTN","VAV3",
    "WNT5A-AS1","ZNF101","ZNF471"
]
IL2_DOWN = [
    "RBP7","OVCA2","PLAC4","AC026785.3","LINC02212","LINC00605","AC246817.2","FOXCUT","VGLL2","ZIC4","FOLR3",
    "ECEL1","AC024337.1","C5orf58","AC060814.3","B4GALNT1","UCHL1","VAX1","AL451042.1","COMMD8","IFI30",
    "AL096794.1","RGS10"
]
MPC_UP = [
    "KIAA1211L","CASC4","AHR","YY1AP1","COPA","RELL1","FBXW2","CARMIL1","NUBPL","ZC3H18","AGK","HLCS",
    "TMEM241","URGCP","CADPS2","COBL","ARHGAP42","AC138305.1","FAM135A","SLC41A2","TACC2","MCU","FMN1",
    "LGR4","BAIAP2L1","SGPP2","RGS12","PTPN3","IBTK","SPRY4-AS1","UGT8","PNN","SP140L","PIK3CA","CNOT2",
    "ZFC3H1","UBE2K","STRN3","STAG2","KDM5A","RC3H2","TP53BP1","SLC9A8","ATF7IP","MLLT10","SCAPER","CPSF6",
    "KLC1","HUWE1","TAOK3","ROCK1","MAN2A1","ZNF44","ARGLU1","EHMT1","HTT","SIN3A","RSBN1L","MLLT3","RABGAP1",
    "SCFD1","NEMF","DMXL1","RCOR3","NAA35","WWP1","VPS41","PRRC2B","STX16","ANKRD11","ARHGEF7","SOS2","SLAIN2",
    "LUC7L2","SRPK2","TYW1","HMBOX1","LMBR1","TCF12","MBD5","MON2","COG5","LONP2","RAB3GAP2","LARP4B","NUMB",
    "NF1","NCOA2","ZNF710","CSNK1G1","WDR37","NUTM2B-AS1","ITFG1","UBE3C","MFSD14C","SPG11","PTK2","LPP",
    "ZBTB20","FCHO2","PTPN12","DCUN1D4","KDM7A","SLMAP","KIF13B","MIB1","DIP2C","LRCH1","TNIK","TNKS","SMC5",
    "ANKIB1","RBM33","TNPO3","OSBPL3","RAPGEF2","MED13L","FBXL20","KIAA0232","ITCH","MARCH6","ARID4B","PLEKHA6",
    "FNBP1L","KIAA1217","FARP2","MAGI3","DIAPH2","VPS13A","DGKH","GNAQ","ARHGEF12","MYO1D","CDC42BPA","TTC7A",
    "TRIM44","NSD3","NCOA3","ADNP","RUFY3","RUNX1","TRRAP","PTBP2","ZNF609","CHD7","PARP8","ARIH1","ZHX2",
    "ETV6","CUL1","CDK13","BRAF","MBTD1","AUH","STX17","POLR2J3","KLHDC10","TAF1","TOGARAM1","WRN","ADK",
    "TASOR2","NUDCD3","TBL1X","MOSMO","USP42","CRCP","GALNT11","DDI2","TTC14","UPF2","PDXDC1","ETFDH","SEL1L3",
    "HEATR5A","PSMD5","TBC1D2B","NUP214","GAPVD1","RNF19A","SMURF2","NRIP1","WDFY2","TBK1","LCORL","USP25",
    "RABGEF1","CASD1","RBPJ","AEBP2","MALAT1","MAML2","RBMS1","CRIM1","SNX29","PPFIBP1","SVIL","DDX24","G2E3",
    "RASA1","FAM160A1","RNF24","ESYT2","EPS8","SESTD1","ATP2B4","PELI1","RNF145","B4GALT5","PPP2R2A","NOS1AP",
    "EGFR","CPNE8","ARHGAP21","SMAD3","DAPK1","IGF1R","AFAP1","KLF7","DOCK5","MINDY2","ZXDC","NEDD4","METTL15",
    "FNIP2","NEAT1","CELF1","DANT2","SYT17","TMEM245","ERICH1","ADPGK","LMBRD1","MFSD11","GOLGA2","SCNN1A",
    "XDH","PLEKHA1","PPP2CB","GTF2E2","KCNK1","SPATS2L","RTF1","UACA","VAV2","MDFIC","STAM","CLIP4","KYAT1",
    "MECP2","NUP160","THOC1","LINC-PINT","MCPH1","DISC1","UBA6-AS1","AAK1","NRF1","PHF20","RNF216","PCM1","SAFB2",
    "FAM133B","NR6A1","ATL2","C1orf21","MTUS1","PARD3B","EXT1","ST5","ABI2","INTS6L","TRIM56","PTER","PRR12",
    "RSPH3","TMC5","MECOM","ARHGEF10L","HNF4G","PPM1H","AP005230.1","AC119674.1","ZSCAN18","NOL4L","PDPK1",
    "RSU1","TTC39C","COL27A1","SEC16B","AC005162.3","RASAL1","IL17RA","SPART-AS1","CDC37","MUCL3","TMEM178B",
    "LINC02614","EREG","ELK3","TMTC1","PALM2-AKAP2","RAB31","EMP3"
]
MPC_DOWN = [
    "MRPS15","TALDO1","MDH2","PSMB5","PIK3CD-AS2","DUSP9","POPDC3","AKR1C3","MRPS18A","EIF3H","UQCRC2",
    "IFT22","RAB11B","MBOAT7"
]

MODULES = {
    "SAT": (SAT_UP, SAT_DOWN),
    "IGE": (IGE_UP, IGE_DOWN),
    "IL2": (IL2_UP, IL2_DOWN),
    "MPC": (MPC_UP, MPC_DOWN),
}

# ==================== HELPERS ====================
def zscore_rows(df):
    mu = df.mean(axis=1)
    sd = df.std(axis=1, ddof=0).replace(0, np.nan)
    return df.sub(mu, axis=0).div(sd, axis=0)

def score_module(Z, up_genes, down_genes):
    up = [g for g in up_genes if g in Z.index]
    down = [g for g in down_genes if g in Z.index]
    if len(up)==0 or len(down)==0:
        return pd.Series(np.nan, index=Z.columns)
    return Z.loc[up].mean(axis=0) - Z.loc[down].mean(axis=0)

def hedges_g(x, y):
    nx, ny = len(x), len(y)
    if nx < 2 or ny < 2: return np.nan
    sp2 = ((nx-1)*np.var(x, ddof=1)+(ny-1)*np.var(y, ddof=1)) / (nx+ny-2)
    if sp2 <= 0 or np.isnan(sp2): return np.nan
    d = (x.mean()-y.mean())/np.sqrt(sp2)
    J = 1 - (3/(4*(nx+ny)-9))
    return d*J

def cliffs_delta(x,y):
    x = x.reshape(-1,1); y = y.reshape(1,-1)
    return ((x>y)-(x<y)).sum()/(x.size*y.size)

def bh_fdr(p):
    p = np.asarray(p, float); n = len(p)
    order = np.argsort(p); ranked = np.empty_like(p); prev = 1
    for i, idx in enumerate(order[::-1], start=1):
        rank = n - i + 1
        val = min(prev, p[idx] * n / rank)
        ranked[idx] = val; prev = val
    return ranked

def map_any_to_ids(labels):
    out = []
    for c in labels:
        c = str(c).strip()
        if ACH_RE.match(c):              # already an ID
            out.append(c.upper())
        elif c in NAME_TO_ID:            # name -> ID
            out.append(NAME_TO_ID[c])
        else:
            out.append(c)                # leave as-is
    return out

def ids_to_names(labels):
    return [CELLNAME_MAP.get(str(c).strip(), str(c).strip()) for c in labels]

def is_gene_like_list(vals):
    vals = [str(v) for v in vals]
    pat = re.compile(r"^[A-Za-z0-9][A-Za-z0-9\.\-]{1,19}$")
    hits = sum(bool(pat.match(v)) for v in vals)
    return hits / max(1, len(vals)) > 0.7

def is_cellline_like_list(vals):
    vals = [str(v) for v in vals]
    ach = sum(bool(ACH_RE.match(v)) for v in vals)
    names = sum(v in NAME_TO_ID for v in vals)
    return (ach + names) / max(1, len(vals)) > 0.3

# ==================== 1) LOAD EXPRESSION & FIX ORIENTATION ====================
expr_raw = pd.read_csv(EXPR, index_col=0)
idx, cols = list(expr_raw.index), list(expr_raw.columns)

rows_look_like_cells = is_cellline_like_list(idx)
cols_look_like_cells = is_cellline_like_list(cols)
rows_look_like_genes = is_gene_like_list(idx)
cols_look_like_genes = is_gene_like_list(cols)

# Decide orientation: want genes × cell lines (columns are lines)
if cols_look_like_cells and rows_look_like_genes:
    expr = expr_raw.copy()
elif rows_look_like_cells and cols_look_like_genes:
    expr = expr_raw.T.copy()   # transpose to genes × lines
else:
    # fallback: if columns count < rows count and columns look like cells, keep; else transpose
    expr = expr_raw if (len(cols) < len(idx) and cols_look_like_cells) else expr_raw.T.copy()

# Normalize cell-line columns to DepMap_IDs
expr.columns = pd.Index(map_any_to_ids(expr.columns), dtype=str)
# Collapse duplicate IDs if any (mean)
expr = (expr.T.groupby(level=0).mean(numeric_only=True)).T

# Clean genes
expr.index = pd.Index([str(g).upper().strip() for g in expr.index], dtype=str)
print(f"[EXPR] genes × lines: {expr.shape}")

Z = zscore_rows(expr)

# ==================== 2) COMPUTE MODULE SCORES ====================
module_scores = pd.DataFrame(index=expr.columns)
for mod, (up, down) in MODULES.items():
    module_scores[mod] = score_module(Z, up, down)

module_scores_named = module_scores.copy()
module_scores_named.index = ids_to_names(module_scores_named.index)
module_scores_named.to_csv(OUTDIR / "module_scores_with_names.csv")
print("[MODULE SCORES] saved module_scores_with_names.csv")

# ==================== 3) LOAD PRISM (drugs × lines) & NORMALIZE ====================
prism = pd.read_csv(PRISM, index_col=0)
prism.columns = pd.Index(map_any_to_ids(prism.columns), dtype=str)
prism = prism.loc[:, ~prism.columns.duplicated()]  # just in case

# Intersect on DepMap_IDs
shared = sorted(set(prism.columns).intersection(module_scores.index))
pd.DataFrame({"DepMap_ID": shared, "cell_line_name": ids_to_names(shared)}).to_csv(
    OUTDIR / "shared_cell_lines.csv", index=False
)

prism = prism[shared]
module_scores = module_scores.loc[shared]
print(f"[PRISM] {prism.shape[0]} drugs × {prism.shape[1]} shared PDAC lines")

# ==================== 4) ASSOCIATION TESTS ====================
def run_assoc(mod, s, prism_mat, shared_ids):
    lo, hi = s.quantile(1/3), s.quantile(2/3)
    low_ids  = s.index[s <= lo]
    high_ids = s.index[s >= hi]

    results = []
    for drug in prism_mat.index:
        y = prism_mat.loc[drug, shared_ids].astype(float)
        x = s.loc[shared_ids].astype(float)
        ok = (~y.isna()) & (~x.isna())
        if ok.sum() < 6:
            continue

        r, p_spear = stats.spearmanr(x[ok], y[ok])

        yl = prism_mat.loc[drug, low_ids].dropna().values
        yh = prism_mat.loc[drug, high_ids].dropna().values
        p_u = g_eff = d_eff = np.nan
        if (yl.size >= 4) and (yh.size >= 4):
            try:
                _, p_u = stats.mannwhitneyu(yl, yh, alternative="two-sided")
                g_eff = hedges_g(yh, yl)
                d_eff = cliffs_delta(yh, yl)
            except Exception:
                pass

        results.append([drug, r, p_spear, p_u, g_eff, d_eff])

    if not results:
        return pd.DataFrame(columns=["spearman_rho","spearman_p","spearman_q","hedges_g","u_p","u_q"])

    out = pd.DataFrame(results, columns=["drug","spearman_rho","spearman_p","u_p","hedges_g","cliffs_delta"]).set_index("drug")
    # FDRs
    out["spearman_q"] = bh_fdr(out["spearman_p"].fillna(1))
    out["u_q"]        = bh_fdr(out["u_p"].fillna(1))
    out = out.sort_values("spearman_p")
    return out

for mod in MODULES.keys():
    out = run_assoc(mod, module_scores[mod], prism, shared)
    out.to_csv(OUTDIR / f"prism_{mod}_association.csv")
    print(f"[{mod}] top 5 (Spearman):")
    print(out.head(5)[["spearman_rho","spearman_p","spearman_q","hedges_g","u_p","u_q"]])

print("✅ Done. Results written to:", OUTDIR)


[EXPR] genes × lines: (19098, 51)
[MODULE SCORES] saved module_scores_with_names.csv
[PRISM] 33 drugs × 0 shared PDAC lines
[SAT] top 5 (Spearman):
Empty DataFrame
Columns: [spearman_rho, spearman_p, spearman_q, hedges_g, u_p, u_q]
Index: []
[IGE] top 5 (Spearman):
Empty DataFrame
Columns: [spearman_rho, spearman_p, spearman_q, hedges_g, u_p, u_q]
Index: []
[IL2] top 5 (Spearman):
Empty DataFrame
Columns: [spearman_rho, spearman_p, spearman_q, hedges_g, u_p, u_q]
Index: []
[MPC] top 5 (Spearman):
Empty DataFrame
Columns: [spearman_rho, spearman_p, spearman_q, hedges_g, u_p, u_q]
Index: []
✅ Done. Results written to: /Users/scottpowers/Desktop/prism_module_from_expression_out


In [6]:
import re, pandas as pd
from pathlib import Path
BASE = Path("~/Desktop").expanduser()

EXPR  = pd.read_csv(BASE/"PDAC_celllines_expression.csv", index_col=0)
PRISM = pd.read_csv(BASE/"PRISM_AUC_PDAC_matrix.csv",    index_col=0)

ACH = re.compile(r"^ACH-\d{6}$", re.I)
expr_cols = list(map(str, EXPR.columns))
expr_idx  = list(map(str, EXPR.index))
prism_cols= list(map(str, PRISM.columns))

print("[expr] shape", EXPR.shape)
print("[prism] shape", PRISM.shape)

print("expr columns look like ACH?:", sum(bool(ACH.match(c)) for c in expr_cols))
print("expr index   look like genes?:", sum(bool(re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9.\-]{1,19}", g)) for g in expr_idx))

print("prism columns ACH?:", sum(bool(ACH.match(c)) for c in prism_cols))

shared = sorted(set(c for c in expr_cols if ACH.match(c)).intersection(c for c in prism_cols if ACH.match(c)))
print("shared ACH IDs:", len(shared), shared[:10])


[expr] shape (51, 19098)
[prism] shape (33, 1448)
expr columns look like ACH?: 0
expr index   look like genes?: 51
prism columns ACH?: 0
shared ACH IDs: 0 []


In [7]:
import pandas as pd
from pathlib import Path
from glob import glob

# --- expr already in memory (genes x ACH columns) ---
print("[expr]", expr.shape)
print("expr ACH head:", expr.columns[:5].tolist())

HOME = Path.home()

# ---- Locate long-format PRISM file ----
long_candidates = [
    Path("PRISM_AUC_PDAC_long.csv"),
    HOME/"Desktop/PRISM_AUC_PDAC_long.csv",
    HOME/"Downloads/PRISM_AUC_PDAC_long.csv",
]
if not any(p.exists() for p in long_candidates):
    found = glob(str(HOME / "**/PRISM_AUC_PDAC_long.csv"), recursive=True)
    if found: long_candidates.insert(0, Path(found[0]))
PRISM_LONG = next((p for p in long_candidates if p.exists()), None)
assert PRISM_LONG is not None, "Could not find PRISM_AUC_PDAC_long.csv."

print(f"[prism long path] {PRISM_LONG}")

# ---- Read long file ----
long = pd.read_csv(PRISM_LONG)
print("[long] shape:", long.shape)
print("[long] columns:", long.columns.tolist()[:12])

# Identify column names (robust to variants)
cols = {c.lower(): c for c in long.columns}

ccle_col = cols.get("ccle_name") or cols.get("cclename") or cols.get("ccle") or cols.get("cell_line")
assert ccle_col, f"Need CCLE name column; available: {long.columns.tolist()}"

# Drug identifier: prefer 'compound'/'name'/'drug'/'broad_id'
drug_col = (cols.get("compound") or cols.get("drug") or cols.get("name") or 
            cols.get("pert_iname") or cols.get("broad_id") or cols.get("brd_id"))
assert drug_col, f"Need a drug/compound column; available: {long.columns.tolist()}"

# AUC value column (try common names)
auc_col = (cols.get("auc") or cols.get("area_under_curve") or cols.get("auc_mean") or 
           cols.get("auc_value") or cols.get("auc_avg"))
assert auc_col, f"Need an AUC column; available: {long.columns.tolist()}"

# Keep only what's needed
keep = long[[ccle_col, drug_col, auc_col]].copy()
keep = keep.dropna(subset=[ccle_col, drug_col, auc_col])
print("[long] non-null rows:", keep.shape[0])

# Pivot to matrix (cell lines x drugs), averaging duplicates
prism = (
    keep.groupby([ccle_col, drug_col], as_index=False)[auc_col]
        .mean()
        .pivot(index=ccle_col, columns=drug_col, values=auc_col)
)
print("[prism matrix] shape:", prism.shape)
print("index head:", prism.index[:5].tolist())
print("cols head:", prism.columns[:5].tolist())

# ---- Load DepMap model info for CCLE -> ACH mapping ----
model_candidates = [
    Path("Model.csv"), Path("sample_info.csv"),
    HOME/"Desktop/Model.csv", HOME/"Desktop/sample_info.csv",
    HOME/"Downloads/Model.csv", HOME/"Downloads/sample_info.csv",
]
if not any(p.exists() for p in model_candidates):
    found = glob(str(HOME / "**/Model.csv"), recursive=True) + glob(str(HOME / "**/sample_info.csv"), recursive=True)
    if found: model_candidates.insert(0, Path(found[0]))
MODEL = next((p for p in model_candidates if p.exists()), None)
assert MODEL is not None, "Could not find Model.csv or sample_info.csv."

info = pd.read_csv(MODEL)
icol = {c.lower(): c for c in info.columns}
ach_col = icol.get("ach_id") or icol.get("modelid") or icol.get("model_id")
ccle_info_col = icol.get("ccle_name") or icol.get("cclename")
assert ach_col and ccle_info_col, f"Model file must include CCLE_Name and ACH_ID/ModelID; found: {info.columns.tolist()}"

map_ccle_to_ach = dict(zip(info[ccle_info_col], info[ach_col]))

# Some CCLEs may have tissue suffix (e.g., ASPC1_PANCREAS)
def strip_suffix(name):
    return name.rsplit("_", 1)[0] if isinstance(name, str) and "_" in name else name

# Try direct mapping, else try stripped suffix
prism1 = prism.rename(index=map_ccle_to_ach)
shared1 = expr.columns.intersection(prism1.index)

if len(shared1) < 10:
    prism2 = prism.copy()
    prism2.index = prism2.index.map(strip_suffix)
    prism2 = prism2.rename(index=map_ccle_to_ach)
    shared2 = expr.columns.intersection(prism2.index)
    if len(shared2) >= len(shared1):
        prism_mapped, shared = prism2, shared2
        used = "stripped CCLE suffix"
    else:
        prism_mapped, shared = prism1, shared1
        used = "direct CCLE"
else:
    prism_mapped, shared = prism1, shared1
    used = "direct CCLE"

print(f"[map] method: {used}")
print(f"[align] shared ACH with expr: {len(shared)}")
print("shared head:", shared[:10].tolist())

# ---- Align and save ----
expr_aln  = expr.loc[:, shared].copy()
prism_aln = prism_mapped.loc[shared, :].copy()

print("[expr_aln]", expr_aln.shape, "| [prism_aln]", prism_aln.shape)

OUTDIR = Path("out_prism_align"); OUTDIR.mkdir(exist_ok=True)
expr_aln.to_csv(OUTDIR / "expr_genesXACH_shared.csv")
prism_aln.to_csv(OUTDIR / "prism_ACHXdrugs_shared.csv")

audit = pd.DataFrame({
    "prism_rows_total":[prism.shape[0]],
    "shared_ACH":[len(shared)],
    "expr_genes":[expr.shape[0]],
    "expr_samples":[expr.shape[1]],
    "prism_drugs":[prism_aln.shape[1]],
    "mapping_method":[used],
})
audit.to_csv(OUTDIR / "alignment_audit.csv", index=False)
audit


[expr] (19098, 51)
expr ACH head: ['ACH-000022', 'ACH-000023', 'ACH-000031', 'ACH-000042', 'ACH-000060']
[prism long path] /Users/scottpowers/Desktop/PRISM_AUC_PDAC_long.csv
[long] shape: (47087, 4)
[long] columns: ['depmap_id', 'ccle_name', 'compound', 'auc']
[long] non-null rows: 47087
[prism matrix] shape: (33, 1448)
index head: ['ASPC1_PANCREAS', 'BXPC3_PANCREAS', 'CAPAN2_PANCREAS', 'CFPAC1_PANCREAS', 'DANG_PANCREAS']
cols head: ['1-azakenpaullone', '1-naphthyl-PP1', '1-phenylbiguanide', '10-deacetylbaccatin', '10-hydroxycamptothecin']
[map] method: direct CCLE
[align] shared ACH with expr: 30
shared head: ['ACH-000022', 'ACH-000023', 'ACH-000042', 'ACH-000060', 'ACH-000107', 'ACH-000118', 'ACH-000138', 'ACH-000139', 'ACH-000155', 'ACH-000164']
[expr_aln] (19098, 30) | [prism_aln] (30, 1448)


Unnamed: 0,prism_rows_total,shared_ACH,expr_genes,expr_samples,prism_drugs,mapping_method
0,33,30,19098,51,1448,direct CCLE


In [8]:
import pandas as pd, numpy as np, re
from pathlib import Path
from collections.abc import Iterable

# expr already in memory (rows=genes, cols=ACH)
print("[expr]", expr.shape)

# ==== Paste your gene lists below ====
# You can use EITHER strings (one per line/commas ok) OR Python lists like ['GATA6','KRT17',...]
SAT_UP = [
    "ABHD8","AC004870.4","AC005920.1","AC009041.1","AC009309.1","AC011498.1","AC012447.1","AC018521.5",
    "AC018754.1","AC027237.2","AC068338.2","AC072061.1","AC079305.3","AC079807.1","AC087623.2","AC090403.1",
    "AC091271.1","AC092287.1","AC092910.3","AC093323.1","AC099778.1","AC107959.2","AC125611.3","AC144652.1",
    "AC239799.2","AC253572.2","ACBD7","AFMID","AHCY","AL021155.5","AL022069.1","AL031963.3","AL049869.2",
    "AL121574.1","AL133523.1","AL139106.1","AL139246.5","AL355075.4","AL360012.1","AL365436.2","AL592295.5",
    "AL662844.4","AP001160.1","AP002381.2","AP002813.1","ARL4D","ATP2B1-AS1","ATRIP","BAMBI","BHLHE40-AS1",
    "BOLA1","BUD23","C12orf65","C19orf48","C2CD4B","C6orf120","CABYR","CCDC9","CCNE2","CDKN2AIP","CHCHD7",
    "CITED2","CROCC","CSKMT","CTH","DALRD3","DRAIC","DUSP28","EAF2","EIF4A3","FOXA3","FOXL1","GADD45B","GLA",
    "GOT1","GRPEL1","GTF2A1","GTF2B","HEXIM1","HIST1H2AG","HIST1H2AH","HIST1H2AL","HIST1H2BJ","HIST1H2BN",
    "HIST1H3A","HIST1H3J","HIST1H4A","HIST1H4C","HIST1H4E","HIST2H2AC","HIST2H3PS2","HIST3H2A","HIST4H4","HMBS",
    "HSPA2","ID2","IDI1","ING1","KCTD5","KIF9","KLHL11","LAP3","LIFR-AS1","LINC01970","LINC02029","LINC02363",
    "LRG1","LRTOMT","MAFB","MED29","MEPCE","MIR17HG","MORF4L2-AS1","MTHFD2","MYCL","MYOSLID","NANOS1","NPW",
    "NRARP","OAT","OSER1-DT","OSGIN1","PHYH","PICART1","PIEZO1","PIK3R3","PLIN5","PLK2","PMAIP1","PMEL","PNKD",
    "POU3F1","PPP1R3C","PRMT5-AS1","PRR3","PTCH2","PTPN6","RAB26","RALY-AS1","RASL11A","RND1","RNF223","RUVBL2",
    "SAE1","SENP8","SIAH2-AS1","SIRT2","SLC7A5","SNHG12","SNHG5","SNHG8","SREBF2-AS1","SRSF7","STARD5","TBPL1",
    "TCTA","THAP9","TLCD1","TM7SF2","TMEM107","TMEM171","TMEM69","TNFRSF10D","TNK1","TRAM2-AS1","TTC33","UAP1",
    "UBAC2-AS1","UBE2D3-AS1","UBE2S","UGDH","WDR74","Z93241.1","Z99127.4","ZC3H10","ZCWPW1","ZFAS1","ZFX-AS1",
    "ZNF574","ZNF584","ZNF622","ZNF687-AS1","ZNF844","ZNF92","ZSWIM3"
]
SAT_DOWN = [
    "PAX5","AC117386.2","PRSS55","RPS16","GDF7","PAK4","AC022144.1","AC092745.5","AL670729.3",
    "DLEU2L","ELP3","KCNC2","MAP4K1","AL161729.4","SV2C","RGS11","AC005498.1","WFDC5","PSENEN",
    "LINC01956","AC115485.1","CYSLTR2","ASMTL-AS1","AP002001.3","FAM153B"
]
IGE_UP = [
    "PFN2","SET","LGALS1","PDLIM4","EIF5A","F8A1","BANF1","CTDNEP1","TRAPPC1","AMZ2","PTGES3","ST13",
    "FTH1","PCBP1","NAA10","ARPC5L","ETFB","CCDC124","SSR2","PGLS","SELENOH","GSTP1","NDUFB7","AURKAIP1",
    "GCHFR","MYL6","AP2S1","S100A13","C9ORF16","KRT8","PRKCSH","CST3","PET100","DNAJA2","VPS35","VDAC1",
    "MGST3","PRR13","OCIAD2","FUOM","MIA","RPN1","FUCA2","TMED4","ERGIC3","DDOST","NAXE","SNRPD1","SKP1",
    "CNBP","ATP5MC3","SLC25A5","LSM4","NDUFAB1","PCBD1","HINT1","ADI1","NENF","MRPL43","VKORC1","EMC4",
    "COA3","NEDD8","CHCHD2","PRDX5","WDR83OS","NDUFA4","GABARAP","PHPT1","UFC1","MDH1","ATP5F1E","MRPL41",
    "UBL5","COX5B","ELOB","ATP5ME","UQCRB","SEM1","NDUFB1","COX6C","ATP6V1F","HNRNPA1","HIGD2A","POLR2I",
    "METTL26","NDUFB4","OST4","C19ORF53","RPL36AL","PPIA","COPS9","COX8A","COX5A","RPL36","RPS15","RPS27",
    "RPS11","FTL","SNHG29","NACA","RPS17","RPL23","RPL39","RPS21","RPL9","RPL15","RPL34","RPL27A","RPL36A",
    "RPL38","RPS26","RPS14","RPL35","RPL37A","RPL37","RPL12","RPS29"
]
IGE_DOWN = [
    "STT3A","SGPP1","LINGO1","ASS1","CETN2","HNRNPH1","STN1","NRG4","IQSEC2","IYD","CHD9","APBB1IP",
    "TMEM238","REX1BD","IFI27","CYBA","METRNL","AKR7A2","SULT1A1","ARHGEF35","AC008397.1","KLK11",
    "DELE1","MISP","FAM234A","CMBL"
]
IL2_UP = [
    "ABHD15-AS1","AC007780.1","AC008105.3","AC008964.1","AC016831.1","AC016831.5","AC090772.1","AC108863.2",
    "ADARB1","ALOX5AP","ANKH","ANKRD44","APOBEC3G","ARHGDIB","ARL6IP5","ARSG","ATP10D","BCL11B","BORCS5",
    "CAMK1D","CD4","CD84","CD96","CDC42EP3","CELF2","CERS4","CLEC2D","CNOT6L","CRYBG1","CTSW","DSE","FGD3",
    "FMNL1","FOXN3","FOXO1","FYB1","GFI1","GNAO1","GPRIN3","HOPX","IGF1","IKZF3","IL18R1","INKA2","INSYN2B",
    "IQSEC1","ITPRIPL1","JAK3","KLRC2","KLRC3","KLRF1","KLRG1","LAPTM5","LCP1","LEPROTL1","LINC00513",
    "LINC01237","MAN1A1","MAPRE2","MPHOSPH9","MPP7","MVB12B","MYO5A","NIN","PARP11","PARP15","PCED1B-AS1",
    "PDE3B","PIP4K2A","PLCL1","PLEKHA2","PPP3CC","PRKCH","PRKCQ","PRKD3","PRKX","PTPN22","RAC2","RASGRF2",
    "RFX3","RIPOR2","RNF166","S1PR4","SAMD3","SENP7","SH2B3","SH2D2A","SMARCA2","SNHG26","SPOCK2","SRGN",
    "ST8SIA1","STAT5A","STAT5B","STIM1","STK17A","TMEM200A","TMX4","TRBC2","TRDC","TRIM22","TTN","VAV3",
    "WNT5A-AS1","ZNF101","ZNF471"
]
IL2_DOWN = [
    "RBP7","OVCA2","PLAC4","AC026785.3","LINC02212","LINC00605","AC246817.2","FOXCUT","VGLL2","ZIC4","FOLR3",
    "ECEL1","AC024337.1","C5orf58","AC060814.3","B4GALNT1","UCHL1","VAX1","AL451042.1","COMMD8","IFI30",
    "AL096794.1","RGS10"
]
MPC_UP = [
    "KIAA1211L","CASC4","AHR","YY1AP1","COPA","RELL1","FBXW2","CARMIL1","NUBPL","ZC3H18","AGK","HLCS",
    "TMEM241","URGCP","CADPS2","COBL","ARHGAP42","AC138305.1","FAM135A","SLC41A2","TACC2","MCU","FMN1",
    "LGR4","BAIAP2L1","SGPP2","RGS12","PTPN3","IBTK","SPRY4-AS1","UGT8","PNN","SP140L","PIK3CA","CNOT2",
    "ZFC3H1","UBE2K","STRN3","STAG2","KDM5A","RC3H2","TP53BP1","SLC9A8","ATF7IP","MLLT10","SCAPER","CPSF6",
    "KLC1","HUWE1","TAOK3","ROCK1","MAN2A1","ZNF44","ARGLU1","EHMT1","HTT","SIN3A","RSBN1L","MLLT3","RABGAP1",
    "SCFD1","NEMF","DMXL1","RCOR3","NAA35","WWP1","VPS41","PRRC2B","STX16","ANKRD11","ARHGEF7","SOS2","SLAIN2",
    "LUC7L2","SRPK2","TYW1","HMBOX1","LMBR1","TCF12","MBD5","MON2","COG5","LONP2","RAB3GAP2","LARP4B","NUMB",
    "NF1","NCOA2","ZNF710","CSNK1G1","WDR37","NUTM2B-AS1","ITFG1","UBE3C","MFSD14C","SPG11","PTK2","LPP",
    "ZBTB20","FCHO2","PTPN12","DCUN1D4","KDM7A","SLMAP","KIF13B","MIB1","DIP2C","LRCH1","TNIK","TNKS","SMC5",
    "ANKIB1","RBM33","TNPO3","OSBPL3","RAPGEF2","MED13L","FBXL20","KIAA0232","ITCH","MARCH6","ARID4B","PLEKHA6",
    "FNBP1L","KIAA1217","FARP2","MAGI3","DIAPH2","VPS13A","DGKH","GNAQ","ARHGEF12","MYO1D","CDC42BPA","TTC7A",
    "TRIM44","NSD3","NCOA3","ADNP","RUFY3","RUNX1","TRRAP","PTBP2","ZNF609","CHD7","PARP8","ARIH1","ZHX2",
    "ETV6","CUL1","CDK13","BRAF","MBTD1","AUH","STX17","POLR2J3","KLHDC10","TAF1","TOGARAM1","WRN","ADK",
    "TASOR2","NUDCD3","TBL1X","MOSMO","USP42","CRCP","GALNT11","DDI2","TTC14","UPF2","PDXDC1","ETFDH","SEL1L3",
    "HEATR5A","PSMD5","TBC1D2B","NUP214","GAPVD1","RNF19A","SMURF2","NRIP1","WDFY2","TBK1","LCORL","USP25",
    "RABGEF1","CASD1","RBPJ","AEBP2","MALAT1","MAML2","RBMS1","CRIM1","SNX29","PPFIBP1","SVIL","DDX24","G2E3",
    "RASA1","FAM160A1","RNF24","ESYT2","EPS8","SESTD1","ATP2B4","PELI1","RNF145","B4GALT5","PPP2R2A","NOS1AP",
    "EGFR","CPNE8","ARHGAP21","SMAD3","DAPK1","IGF1R","AFAP1","KLF7","DOCK5","MINDY2","ZXDC","NEDD4","METTL15",
    "FNIP2","NEAT1","CELF1","DANT2","SYT17","TMEM245","ERICH1","ADPGK","LMBRD1","MFSD11","GOLGA2","SCNN1A",
    "XDH","PLEKHA1","PPP2CB","GTF2E2","KCNK1","SPATS2L","RTF1","UACA","VAV2","MDFIC","STAM","CLIP4","KYAT1",
    "MECP2","NUP160","THOC1","LINC-PINT","MCPH1","DISC1","UBA6-AS1","AAK1","NRF1","PHF20","RNF216","PCM1","SAFB2",
    "FAM133B","NR6A1","ATL2","C1orf21","MTUS1","PARD3B","EXT1","ST5","ABI2","INTS6L","TRIM56","PTER","PRR12",
    "RSPH3","TMC5","MECOM","ARHGEF10L","HNF4G","PPM1H","AP005230.1","AC119674.1","ZSCAN18","NOL4L","PDPK1",
    "RSU1","TTC39C","COL27A1","SEC16B","AC005162.3","RASAL1","IL17RA","SPART-AS1","CDC37","MUCL3","TMEM178B",
    "LINC02614","EREG","ELK3","TMTC1","PALM2-AKAP2","RAB31","EMP3"
]
MPC_DOWN = [
    "MRPS15","TALDO1","MDH2","PSMB5","PIK3CD-AS2","DUSP9","POPDC3","AKR1C3","MRPS18A","EIF3H","UQCRC2",
    "IFT22","RAB11B","MBOAT7"
]
# =====================================

def parse_genes(obj):
    """Accept str/list/tuple/set/Series; return a clean list of symbols."""
    if obj is None:
        return []
    # pandas Series/Index
    if hasattr(obj, "tolist"):
        obj = obj.tolist()
    # list-like (but not str/bytes)
    if isinstance(obj, Iterable) and not isinstance(obj, (str, bytes)):
        flat = []
        for x in obj:
            if x is None: continue
            if hasattr(x, "tolist"):
                flat.extend(x.tolist())
            elif isinstance(x, (str, bytes)):
                flat.extend(re.split(r"[,\s]+", x.strip()))
            else:
                flat.append(str(x))
        genes = [g for g in flat if g]
        return genes
    # string path: split on commas/whitespace/newlines
    s = obj.strip()
    if not s:
        return []
    s = re.sub(r"[,\s]+", "\n", s)
    return [g for g in s.splitlines() if g]

# Build modules (normalize keys to lower-case)
MODULES = {
    "SAT": {"up": parse_genes(SAT_UP), "down": parse_genes(SAT_DOWN)},
    "IGE": {"up": parse_genes(IGE_UP), "down": parse_genes(IGE_DOWN)},
    "IL2": {"up": parse_genes(IL2_UP), "down": parse_genes(IL2_DOWN)},
    "MPC": {"up": parse_genes(MPC_UP), "down": parse_genes(MPC_DOWN)},
}

def score_signed_module(expr_df, up_genes, down_genes, method="count_weighted", min_genes=3):
    up = pd.Index([g.upper() for g in up_genes])
    dn = pd.Index([g.upper() for g in down_genes])

    expr_uc = expr_df.copy()
    expr_uc.index = expr_uc.index.astype(str).str.upper()
    expr_uc = expr_uc[~expr_uc.index.duplicated(keep="first")]

    up_in = up.intersection(expr_uc.index)
    dn_in = dn.intersection(expr_uc.index)
    n_up, n_dn = len(up_in), len(dn_in)
    print(f"[COVERAGE] UP {n_up}/{len(up)} | DOWN {n_dn}/{len(dn)}")

    if n_up + n_dn == 0:
        raise ValueError("No module genes found in expression. Check symbols.")

    if method == "count_weighted":
        # (sum(up) - sum(down)) / (n_up + n_dn)  → side with more genes has more influence
        up_sum = expr_uc.loc[up_in].sum(axis=0) if n_up else 0.0
        dn_sum = expr_uc.loc[dn_in].sum(axis=0) if n_dn else 0.0
        signed = (up_sum - dn_sum) / float(n_up + n_dn)
    elif method == "equal_sides":
        up_mean = expr_uc.loc[up_in].mean(axis=0) if n_up else 0.0
        dn_mean = expr_uc.loc[dn_in].mean(axis=0) if n_dn else 0.0
        signed = up_mean - dn_mean
    else:
        raise ValueError("method must be 'count_weighted' or 'equal_sides'")

    signed = pd.Series(signed, index=expr_uc.columns).astype(float)
    z = (signed - signed.mean()) / signed.std(ddof=0)
    return signed, z, {"n_up": n_up, "n_dn": n_dn}

OUT = Path("out_modules"); OUT.mkdir(exist_ok=True)
coverage = []
for label, gd in MODULES.items():
    print(f"\n=== {label} (count_weighted) ===")
    raw, z, cov = score_signed_module(expr, gd["up"], gd["down"], method="count_weighted", min_genes=3)
    raw.rename(f"{label}_signed_score").to_csv(OUT / f"{label}_signed_score.csv")
    z.rename(f"{label}_signed_score_Z").to_csv(OUT / f"{label}_signed_score_Z.csv")
    coverage.append({"module": label, **cov})
    print(f"[saved] {OUT / f'{label}_signed_score.csv'}")
    print(f"[saved] {OUT / f'{label}_signed_score_Z.csv'}")

pd.DataFrame(coverage).to_csv(OUT / "coverage_report.csv", index=False)
print("\n[done] Wrote Z-scores to:", OUT)


[expr] (19098, 51)

=== SAT (count_weighted) ===
[COVERAGE] UP 101/185 | DOWN 14/25
[saved] out_modules/SAT_signed_score.csv
[saved] out_modules/SAT_signed_score_Z.csv

=== IGE (count_weighted) ===
[COVERAGE] UP 115/118 | DOWN 25/26
[saved] out_modules/IGE_signed_score.csv
[saved] out_modules/IGE_signed_score_Z.csv

=== IL2 (count_weighted) ===
[COVERAGE] UP 91/106 | DOWN 13/23
[saved] out_modules/IL2_signed_score.csv
[saved] out_modules/IL2_signed_score_Z.csv

=== MPC (count_weighted) ===
[COVERAGE] UP 283/304 | DOWN 13/14
[saved] out_modules/MPC_signed_score.csv
[saved] out_modules/MPC_signed_score_Z.csv

[done] Wrote Z-scores to: out_modules


In [9]:
import pandas as pd, numpy as np
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests
from pathlib import Path

OUTDIR = Path("out_prism_corr"); OUTDIR.mkdir(exist_ok=True)

# --- load aligned data ---
expr_aln  = pd.read_csv("out_prism_align/expr_genesXACH_shared.csv", index_col=0)
prism_aln = pd.read_csv("out_prism_align/prism_ACHXdrugs_shared.csv", index_col=0)

# --- load module Z-scores ---
mods = {}
for m in ["SAT","IGE","IL2","MPC"]:
    z = pd.read_csv(f"out_modules/{m}_signed_score_Z.csv", index_col=0).squeeze("columns")
    z.name = m
    mods[m] = z

shared = prism_aln.index.intersection(expr_aln.columns)
print(f"Shared ACH lines: {len(shared)}")

prism_sub = prism_aln.loc[shared, :]

# --- correlations ---
records = []
for mod, scores in mods.items():
    s = scores.loc[shared]
    for drug in prism_sub.columns:
        y = prism_sub[drug]
        if y.isna().sum() >= len(y)-3:
            continue
        rho, p = spearmanr(s, y, nan_policy="omit")
        records.append((mod, drug, rho, p))

df = pd.DataFrame(records, columns=["module","drug","rho","pval"])
df["FDR"] = np.nan
for m in df["module"].unique():
    mask = df["module"]==m
    df.loc[mask,"FDR"] = multipletests(df.loc[mask,"pval"], method="fdr_bh")[1]

# --- save results ---
df.to_csv(OUTDIR / "module_vs_PRISM_spearman.csv", index=False)

tops = (
    df.sort_values(["module","FDR"])
      .groupby("module")
      .head(15)
      .reset_index(drop=True)
)
tops.to_csv(OUTDIR / "module_vs_PRISM_top15.csv", index=False)
print(tops.head(15))


Shared ACH lines: 30
   module                           drug       rho      pval       FDR
0     IGE                    talazoparib -0.673860  0.000045  0.064579
1     IGE                       2,3-DCPE -0.627258  0.000354  0.112800
2     IGE  bis(maltolato)oxovanadium(IV) -0.634921  0.000374  0.112800
3     IGE                      indisulam -0.609344  0.000352  0.112800
4     IGE                  methocarbamol  0.729825  0.000390  0.112800
5     IGE                     golvatinib  0.594575  0.000670  0.161677
6     IGE    bephenium-hydroxynaphthoate -0.613846  0.001099  0.198861
7     IGE                    dacomitinib  0.568854  0.001037  0.198861
8     IGE                     BMS-690514  0.535039  0.002316  0.240133
9     IGE                         XL-647  0.544383  0.001871  0.240133
10    IGE                      alpelisib  0.548590  0.002060  0.240133
11    IGE               cyclophosphamide  0.664912  0.001896  0.240133
12    IGE                        lacitol -0.614907  0.00

In [10]:
import matplotlib
matplotlib.use("Agg")  # safe, non-GUI backend to avoid local display/ttf issues

import pandas as pd, numpy as np, re
import matplotlib.pyplot as plt
from pathlib import Path
from scipy.stats import spearmanr

# ---------------- IO ----------------
BASE = Path(".")
CORR_PATH   = BASE/"out_prism_corr/module_vs_PRISM_spearman.csv"
PRISM_PATH  = BASE/"out_prism_align/prism_ACHXdrugs_shared.csv"
MOD_DIR     = BASE/"out_modules"
FIG_DIR     = BASE/"out_figs"; FIG_DIR.mkdir(exist_ok=True)

MODULES = ["SAT","IGE","IL2","MPC"]
TOP_N_VOLC_ANNOT = 8
TOP_N_HEATMAP    = 10
SAVE_FORMATS = ["png"]  # was ["png","svg"] # <-- no pdf

# ---------------- Load data ----------------
df = pd.read_csv(CORR_PATH)
prism = pd.read_csv(PRISM_PATH, index_col=0)

modZ = {}
for m in MODULES:
    s = pd.read_csv(MOD_DIR/f"{m}_signed_score_Z.csv", index_col=0).squeeze("columns")
    s.name = m
    modZ[m] = s

# shared ACH lines across everything
shared = prism.index
for m in MODULES:
    shared = shared.intersection(modZ[m].index)
if len(shared) == 0:
    raise RuntimeError("No shared ACH lines between PRISM and module Z-scores.")
prism = prism.loc[shared, :]
for m in MODULES:
    modZ[m] = modZ[m].loc[shared]

def neglog10(x):
    x = np.asarray(x, dtype=float)
    nz = x[x>0]
    eps = (nz.min()/2.0) if len(nz)>0 else 1e-300
    x = np.where(x>0, x, eps)
    return -np.log10(x)

def safe_name(s):
    return re.sub(r"[^A-Za-z0-9._-]+", "_", str(s))

# ---------------- Volcano plots ----------------
for m in MODULES:
    sub = df[df["module"]==m].copy()
    sub["neglog10FDR"] = neglog10(sub["FDR"])
    lab = sub.sort_values(["FDR","rho"], ascending=[True, True]).head(TOP_N_VOLC_ANNOT)

    plt.figure(figsize=(5,4))
    plt.scatter(sub["rho"], sub["neglog10FDR"], s=10)
    for _, r in lab.iterrows():
        plt.annotate(r["drug"], (r["rho"], r["neglog10FDR"]),
                     xytext=(3,3), textcoords="offset points", fontsize=7)
    plt.axvline(0, linestyle="--", linewidth=1)
    plt.xlabel("Spearman ρ (module Z  vs  PRISM AUC)")
    plt.ylabel("-log10(FDR)")
    plt.title(f"{m}: Drug-response associations (PRISM)")
    for ext in SAVE_FORMATS:
        plt.savefig(FIG_DIR/f"{m}_volcano.{ext}", bbox_inches="tight", dpi=300)
    plt.close()

# ---------------- Heatmap ----------------
drug_set = []
for m in MODULES:
    sub = df[df["module"]==m].copy().sort_values("FDR", ascending=True).head(TOP_N_HEATMAP)
    drug_set.extend(sub["drug"].tolist())
drug_list = sorted(set(drug_set))

heat = pd.DataFrame(index=MODULES, columns=drug_list, dtype=float)
for m in MODULES:
    sub = df[df["module"]==m][["drug","rho"]].drop_duplicates().set_index("drug")["rho"]
    heat.loc[m, :] = sub.reindex(drug_list).values

H = heat.fillna(0.0).values
plt.figure(figsize=(max(8, len(drug_list)*0.22), 2.2+0.35*len(MODULES)))
im = plt.imshow(H, aspect="auto")
plt.colorbar(im, fraction=0.025, pad=0.02)
plt.xticks(range(len(drug_list)), drug_list, rotation=80, ha="right", fontsize=7)
plt.yticks(range(len(MODULES)), MODULES)
plt.title("Module × Drug correlation (ρ) — top hits union")
plt.tight_layout()
for ext in SAVE_FORMATS:
    plt.savefig(FIG_DIR/f"modules_drug_corr_heatmap.{ext}", bbox_inches="tight", dpi=300)
plt.close()

# ---------------- Exemplar scatterplots ----------------
def best_drugs_for_module(m):
    sub = df[df["module"]==m].copy()
    neg = sub.sort_values("rho", ascending=True).iloc[0]
    pos = sub.sort_values("rho", ascending=False).iloc[0]
    return neg, pos

for m in MODULES:
    s = modZ[m]
    neg, pos = best_drugs_for_module(m)

    for tag, row in [("sensitive_neg_rho", neg), ("resistant_pos_rho", pos)]:
        drug = row["drug"]
        y = prism[drug]
        valid = s.index.intersection(y.dropna().index)
        rho, p = spearmanr(s.loc[valid], y.loc[valid], nan_policy="omit")

        plt.figure(figsize=(4.2,3.6))
        plt.scatter(s.loc[valid], y.loc[valid], s=14)
        plt.xlabel(f"{m} module Z-score")
        plt.ylabel(f"{drug} PRISM AUC")
        plt.title(f"{m} vs {drug}\nρ={rho:.2f}, p={p:.2g}")
        for ext in SAVE_FORMATS:
            plt.savefig(FIG_DIR/f"{m}_{tag}_{safe_name(drug)}.{ext}", bbox_inches="tight", dpi=300)
        plt.close()

print("Saved figures to:", FIG_DIR.resolve())
for p in sorted(FIG_DIR.glob("*")):
    print(" -", p.name)


Saved figures to: /Users/scottpowers/My Drive/out_figs
 - IGE_neg_lollipop.png
 - IGE_neg_mech_meanrho.png
 - IGE_resistant_pos_rho_thioproperazine.png
 - IGE_sensitive_neg_rho_talazoparib.png
 - IGE_volcano.png
 - IL2_neg_lollipop.png
 - IL2_neg_mech_meanrho.png
 - IL2_resistant_pos_rho_k-strophanthidin.png
 - IL2_sensitive_neg_rho_acetophenazine.png
 - IL2_volcano.png
 - MPC_neg_lollipop.png
 - MPC_neg_mech_meanrho.png
 - MPC_resistant_pos_rho_SCS.png
 - MPC_sensitive_neg_rho_fendiline.png
 - MPC_volcano.png
 - SAT_neg_lollipop.png
 - SAT_neg_mech_meanrho.png
 - SAT_resistant_pos_rho_SCS.png
 - SAT_sensitive_neg_rho_BVT-948.png
 - SAT_volcano.png
 - mechanism_mean_rho_by_module.png
 - modules_drug_corr_heatmap.png


In [11]:
# ===== Focus on vulnerabilities only: rho < 0 (module-high => more sensitive) =====
# Outputs:
#  - out_prism_corr/module_negatives_top.csv  (annotated table)
#  - out_figs/{MODULE}_neg_lollipop.png       (top sensitive drugs per module)
#  - out_figs/{MODULE}_neg_mech_meanrho.png   (mean rho by mechanism)
# ================================================================================

import matplotlib
matplotlib.use("Agg")  # safe, non-GUI backend

import pandas as pd, numpy as np, re
import matplotlib.pyplot as plt
from pathlib import Path

# ---------------- Paths / settings ----------------
BASE = Path(".")
CORR_PATH = BASE / "out_prism_corr/module_vs_PRISM_spearman.csv"
OUTDIR    = BASE / "out_prism_corr"; OUTDIR.mkdir(exist_ok=True)
FIG_DIR   = BASE / "out_figs"; FIG_DIR.mkdir(exist_ok=True)

MODULES = ["SAT","IGE","IL2","MPC"]
TOP_N_PER_MODULE = 15       # top negatives to label per module
MIN_MECH_COUNT   = 2        # group sparse mechanisms into "Other (misc.)" for cleaner plots

# ---------------- Load ----------------
df = pd.read_csv(CORR_PATH)
assert {"module","drug","rho","FDR"}.issubset(df.columns)

# Keep only requested modules and NEGATIVE rho (sensitivities)
df = df[df["module"].isin(MODULES)].copy()
neg = df[df["rho"] < 0].copy()

# ---------------- Mechanism map (edit/extend freely; case-insensitive) ----------------
MECH_MAP_RAW = {
    # DNA repair / replication stress
    "talazoparib": "PARP inhibitor / DNA repair",
    "3-amino-benzamide": "PARP inhibitor (early gen)",
    "10-hydroxycamptothecin": "Topoisomerase I inhibitor",
    "5-fluorouracil": "Antimetabolite (TS inhibitor)",
    "cyclophosphamide": "Alkylating agent (DNA cross-linker)",

    # Splicing / RNA processing
    "indisulam": "Spliceosome / RNA-processing inhibitor",

    # Oxidative / ROS / redox & checkpoint stress
    "bis(maltolato)oxovanadium(iv)": "Oxidative-stress inducer (vanadium)",
    "2,3-dcpe": "DNA damage / S-phase checkpoint inducer",
    "2-methoxyestradiol": "Microtubule disruptor / HIF-1 inhibitor",

    # RTK axis (usually appears as POS rho; included for completeness if any NEG show)
    "alpelisib": "PI3Kα inhibitor",
    "dacomitinib": "ERBB/EGFR inhibitor (pan-ERBB)",
    "bms-690514": "Multi-RTK inhibitor (ErbB/FGF/VEGFR)",
    "xl-647": "EGFR/VEGFR/ERBB inhibitor",
    "golvatinib": "MET/VEGFR/RTK inhibitor",

    # GPCR/hormonal/other (some may show NEG via off-targets)
    "bephenium-hydroxynaphthoate": "Anthelmintic (nicotinic agonist)",
    "lacitol": "Osmotic laxative / carbohydrate analog",
}

def norm_name(s: str) -> str:
    s = str(s).lower().strip()
    s = s.replace("’", "'").replace("“","\"").replace("”","\"")
    s = s.replace("–","-").replace("—","-")
    s = re.sub(r"\s+", " ", s)
    return s

MECH_MAP = {k.strip().lower(): v for k, v in MECH_MAP_RAW.items() if isinstance(k, str)}
neg["drug_norm"] = neg["drug"].map(norm_name)
neg["mechanism"] = neg["drug_norm"].map(MECH_MAP).fillna("Unknown/Other")

# ---------------- Select top (FDR, then most negative rho) per module ----------------
tops = []
for m in MODULES:
    sub = neg[neg["module"]==m].copy()
    sub = sub.sort_values(["FDR", "rho"], ascending=[True, True])  # rho ascending → most negative first
    tops.append(sub.head(TOP_N_PER_MODULE))
top_neg = pd.concat(tops, ignore_index=True)

# Save annotated table
out_csv = OUTDIR / "module_negatives_top.csv"
top_neg[["module","drug","mechanism","rho","FDR"]].to_csv(out_csv, index=False)
print(f"[saved] {out_csv}")

# ---------------- Optionally collapse sparse mechanisms for plotting ----------------
mech_counts = top_neg["mechanism"].value_counts()
sparse = set(mech_counts[mech_counts < MIN_MECH_COUNT].index)
def mech_for_plot(x): return "Other (misc.)" if x in sparse else x
top_neg["mechanism_plot"] = top_neg["mechanism"].map(mech_for_plot)

# ---------------- Per-module lollipop of top sensitive drugs ----------------
for m in MODULES:
    sub = top_neg[top_neg["module"]==m].copy()
    if sub.empty:
        print(f"[warn] No negative-rho hits for {m}.")
        continue

    # sort by rho (most negative at bottom for prettier up-left stems)
    sub = sub.sort_values("rho", ascending=True)
    ylabels = sub["drug"].tolist()
    y = np.arange(len(sub))

    plt.figure(figsize=(8, max(4, 0.35*len(sub)+1)))
    # stems
    for i, rho in enumerate(sub["rho"]):
        plt.plot([rho, 0], [i, i], linewidth=1)
    # markers at rho
    plt.scatter(sub["rho"], y, s=20)
    plt.axvline(0, linestyle="--", linewidth=1)
    plt.yticks(y, ylabels)
    plt.xlabel("Spearman ρ (module Z vs PRISM AUC)  —  ρ < 0 ⇒ sensitivity")
    plt.title(f"{m} — Top sensitive drugs (negative ρ)")
    plt.tight_layout()
    outp = FIG_DIR / f"{m}_neg_lollipop.png"
    plt.savefig(outp, dpi=300)
    plt.close()
    print(f"[saved] {outp}")

# ---------------- Per-module mechanism summary (mean rho among negatives) ----------------
for m in MODULES:
    sub = top_neg[top_neg["module"]==m]
    if sub.empty:
        continue
    g = sub.groupby("mechanism_plot")["rho"].mean().sort_values()  # more negative on top
    plt.figure(figsize=(8, max(4, 0.4*len(g)+1)))
    y = np.arange(len(g))
    plt.barh(y, g.values)
    plt.axvline(0, linestyle="--", linewidth=1)
    plt.yticks(y, g.index.tolist())
    plt.xlabel("Mean Spearman ρ (more negative ⇒ stronger sensitivity)")
    plt.title(f"{m} — Mechanism classes among top sensitive hits")
    plt.tight_layout()
    outp = FIG_DIR / f"{m}_neg_mech_meanrho.png"
    plt.savefig(outp, dpi=300)
    plt.close()
    print(f"[saved] {outp}")

print("\nINTERPRETATION NOTES:")
print(" - We’re showing drugs with ρ < 0 only: module-high lines are more sensitive (lower AUC).")
print(" - Lollipop: each stem ends at the drug’s ρ (more negative = stronger sensitivity).")
print(" - Mechanism plot: average ρ per mechanism among top hits (more negative = stronger class-level vulnerability).")
print(" - Refine MECH_MAP_RAW above to replace 'Unknown/Other' with precise classes (PARP, JAK/STAT, ERBB, Complex I, etc.).")


[saved] out_prism_corr/module_negatives_top.csv
[saved] out_figs/SAT_neg_lollipop.png
[saved] out_figs/IGE_neg_lollipop.png
[saved] out_figs/IL2_neg_lollipop.png
[saved] out_figs/MPC_neg_lollipop.png
[saved] out_figs/SAT_neg_mech_meanrho.png
[saved] out_figs/IGE_neg_mech_meanrho.png
[saved] out_figs/IL2_neg_mech_meanrho.png
[saved] out_figs/MPC_neg_mech_meanrho.png

INTERPRETATION NOTES:
 - We’re showing drugs with ρ < 0 only: module-high lines are more sensitive (lower AUC).
 - Lollipop: each stem ends at the drug’s ρ (more negative = stronger sensitivity).
 - Mechanism plot: average ρ per mechanism among top hits (more negative = stronger class-level vulnerability).
 - Refine MECH_MAP_RAW above to replace 'Unknown/Other' with precise classes (PARP, JAK/STAT, ERBB, Complex I, etc.).


In [12]:
# ===== Focus on vulnerabilities only: rho < 0 (module-high => more sensitive) =====
# Outputs:
#  - out_prism_corr/module_negatives_top.csv  (annotated table)
#  - out_figs/{MODULE}_neg_lollipop.png       (top sensitive drugs per module)
#  - out_figs/{MODULE}_neg_mech_meanrho.png   (mean rho by mechanism)
# ================================================================================

import matplotlib
matplotlib.use("Agg")  # safe, non-GUI backend

import pandas as pd, numpy as np, re
import matplotlib.pyplot as plt
from pathlib import Path

# ---------------- Paths / settings ----------------
BASE = Path(".")
CORR_PATH = BASE / "out_prism_corr/module_vs_PRISM_spearman.csv"
OUTDIR    = BASE / "out_prism_corr"; OUTDIR.mkdir(exist_ok=True)
FIG_DIR   = BASE / "out_figs"; FIG_DIR.mkdir(exist_ok=True)

MODULES = ["SAT","IGE","IL2","MPC"]
TOP_N_PER_MODULE = 15       # top negatives to label per module
MIN_MECH_COUNT   = 2        # group sparse mechanisms into "Other (misc.)" for cleaner plots

# ---------------- Load ----------------
df = pd.read_csv(CORR_PATH)
assert {"module","drug","rho","FDR"}.issubset(df.columns)

# Keep only requested modules and NEGATIVE rho (sensitivities)
df = df[df["module"].isin(MODULES)].copy()
neg = df[df["rho"] < 0].copy()

# ---------------- Mechanism map (edit/extend freely; case-insensitive) ----------------
MECH_MAP_RAW = {
    # DNA repair / replication stress
    "talazoparib": "PARP inhibitor / DNA repair",
    "3-amino-benzamide": "PARP inhibitor (early gen)",
    "10-hydroxycamptothecin": "Topoisomerase I inhibitor",
    "5-fluorouracil": "Antimetabolite (TS inhibitor)",
    "cyclophosphamide": "Alkylating agent (DNA cross-linker)",

    # Splicing / RNA processing
    "indisulam": "Spliceosome / RNA-processing inhibitor",

    # Oxidative / ROS / redox & checkpoint stress
    "bis(maltolato)oxovanadium(iv)": "Oxidative-stress inducer (vanadium)",
    "2,3-dcpe": "DNA damage / S-phase checkpoint inducer",
    "2-methoxyestradiol": "Microtubule disruptor / HIF-1 inhibitor",

    # RTK axis (usually appears as POS rho; included for completeness if any NEG show)
    "alpelisib": "PI3Kα inhibitor",
    "dacomitinib": "ERBB/EGFR inhibitor (pan-ERBB)",
    "bms-690514": "Multi-RTK inhibitor (ErbB/FGF/VEGFR)",
    "xl-647": "EGFR/VEGFR/ERBB inhibitor",
    "golvatinib": "MET/VEGFR/RTK inhibitor",

    # GPCR/hormonal/other (some may show NEG via off-targets)
    "bephenium-hydroxynaphthoate": "Anthelmintic (nicotinic agonist)",
    "lacitol": "Osmotic laxative / carbohydrate analog",
}

def norm_name(s: str) -> str:
    s = str(s).lower().strip()
    s = s.replace("’", "'").replace("“","\"").replace("”","\"")
    s = s.replace("–","-").replace("—","-")
    s = re.sub(r"\s+", " ", s)
    return s

MECH_MAP = {k.strip().lower(): v for k, v in MECH_MAP_RAW.items() if isinstance(k, str)}
neg["drug_norm"] = neg["drug"].map(norm_name)
neg["mechanism"] = neg["drug_norm"].map(MECH_MAP).fillna("Unknown/Other")

# ---------------- Select top (FDR, then most negative rho) per module ----------------
tops = []
for m in MODULES:
    sub = neg[neg["module"]==m].copy()
    sub = sub.sort_values(["FDR", "rho"], ascending=[True, True])  # rho ascending → most negative first
    tops.append(sub.head(TOP_N_PER_MODULE))
top_neg = pd.concat(tops, ignore_index=True)

# Save annotated table
out_csv = OUTDIR / "module_negatives_top.csv"
top_neg[["module","drug","mechanism","rho","FDR"]].to_csv(out_csv, index=False)
print(f"[saved] {out_csv}")

# ---------------- Optionally collapse sparse mechanisms for plotting ----------------
mech_counts = top_neg["mechanism"].value_counts()
sparse = set(mech_counts[mech_counts < MIN_MECH_COUNT].index)
def mech_for_plot(x): return "Other (misc.)" if x in sparse else x
top_neg["mechanism_plot"] = top_neg["mechanism"].map(mech_for_plot)

# ---------------- Per-module lollipop of top sensitive drugs ----------------
for m in MODULES:
    sub = top_neg[top_neg["module"]==m].copy()
    if sub.empty:
        print(f"[warn] No negative-rho hits for {m}.")
        continue

    # sort by rho (most negative at bottom for prettier up-left stems)
    sub = sub.sort_values("rho", ascending=True)
    ylabels = sub["drug"].tolist()
    y = np.arange(len(sub))

    plt.figure(figsize=(8, max(4, 0.35*len(sub)+1)))
    # stems
    for i, rho in enumerate(sub["rho"]):
        plt.plot([rho, 0], [i, i], linewidth=1)
    # markers at rho
    plt.scatter(sub["rho"], y, s=20)
    plt.axvline(0, linestyle="--", linewidth=1)
    plt.yticks(y, ylabels)
    plt.xlabel("Spearman ρ (module Z vs PRISM AUC)  —  ρ < 0 ⇒ sensitivity")
    plt.title(f"{m} — Top sensitive drugs (negative ρ)")
    plt.tight_layout()
    outp = FIG_DIR / f"{m}_neg_lollipop.png"
    plt.savefig(outp, dpi=300)
    plt.close()
    print(f"[saved] {outp}")

# ---------------- Per-module mechanism summary (mean rho among negatives) ----------------
for m in MODULES:
    sub = top_neg[top_neg["module"]==m]
    if sub.empty:
        continue
    g = sub.groupby("mechanism_plot")["rho"].mean().sort_values()  # more negative on top
    plt.figure(figsize=(8, max(4, 0.4*len(g)+1)))
    y = np.arange(len(g))
    plt.barh(y, g.values)
    plt.axvline(0, linestyle="--", linewidth=1)
    plt.yticks(y, g.index.tolist())
    plt.xlabel("Mean Spearman ρ (more negative ⇒ stronger sensitivity)")
    plt.title(f"{m} — Mechanism classes among top sensitive hits")
    plt.tight_layout()
    outp = FIG_DIR / f"{m}_neg_mech_meanrho.png"
    plt.savefig(outp, dpi=300)
    plt.close()
    print(f"[saved] {outp}")

print("\nINTERPRETATION NOTES:")
print(" - We’re showing drugs with ρ < 0 only: module-high lines are more sensitive (lower AUC).")
print(" - Lollipop: each stem ends at the drug’s ρ (more negative = stronger sensitivity).")
print(" - Mechanism plot: average ρ per mechanism among top hits (more negative = stronger class-level vulnerability).")
print(" - Refine MECH_MAP_RAW above to replace 'Unknown/Other' with precise classes (PARP, JAK/STAT, ERBB, Complex I, etc.).")


[saved] out_prism_corr/module_negatives_top.csv
[saved] out_figs/SAT_neg_lollipop.png
[saved] out_figs/IGE_neg_lollipop.png
[saved] out_figs/IL2_neg_lollipop.png
[saved] out_figs/MPC_neg_lollipop.png
[saved] out_figs/SAT_neg_mech_meanrho.png
[saved] out_figs/IGE_neg_mech_meanrho.png
[saved] out_figs/IL2_neg_mech_meanrho.png
[saved] out_figs/MPC_neg_mech_meanrho.png

INTERPRETATION NOTES:
 - We’re showing drugs with ρ < 0 only: module-high lines are more sensitive (lower AUC).
 - Lollipop: each stem ends at the drug’s ρ (more negative = stronger sensitivity).
 - Mechanism plot: average ρ per mechanism among top hits (more negative = stronger class-level vulnerability).
 - Refine MECH_MAP_RAW above to replace 'Unknown/Other' with precise classes (PARP, JAK/STAT, ERBB, Complex I, etc.).


In [13]:
# ================= DepMap Chronos (rows=samples, cols=genes) → genes×ACH; correlate vs module Z =================
import pandas as pd, numpy as np, re
from pathlib import Path
from glob import glob
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

# ----- your PDAC name map -----
mapping_dict = {
    "ACH-000022": "PATU8988S", "ACH-000023": "PATU8988T", "ACH-000094": "HPAFII",
    "ACH-000108": "KP3", "ACH-000114": "SU8686", "ACH-000118": "HUPT3",
    "ACH-000138": "CFPAC1", "ACH-000178": "HS766T", "ACH-000205": "PK59",
    "ACH-000213": "HUPT4", "ACH-000222": "ASPC1", "ACH-000265": "KP4",
    "ACH-000307": "PK1", "ACH-000332": "YAPC", "ACH-000354": "CAPAN1",
    "ACH-000502": "TCCPAN2", "ACH-000517": "SNU410", "ACH-000652": "SUIT2",
    "ACH-000685": "L33", "ACH-001376": "PACADD135", "ACH-001379": "PACADD161",
    "ACH-001380": "PACADD165", "ACH-001382": "PACADD188", "ACH-002039": "PK8",
    "ACH-003161": "ABMT9430", "ACH-003433": "CCLFPANC0019T"
}
ach_to_name = {k.upper(): v for k, v in mapping_dict.items()}

# ----- paths -----
BASE       = Path(".")
ALIGN_DIR  = BASE/"out_prism_align"
MODULE_DIR = BASE/"out_modules"
OUTDIR     = BASE/"out_depmap_corr"; OUTDIR.mkdir(exist_ok=True)

# ----- shared ACH from PRISM align -----
prism_aln = pd.read_csv(ALIGN_DIR/"prism_ACHXdrugs_shared.csv", index_col=0)
shared_ach = prism_aln.index.astype(str).str.upper().tolist()
shared_set = set(shared_ach)
print(f"[align] Using {len(shared_ach)} shared ACH lines.")

# ----- find/load Chronos (the 'inverted' wide file: rows=samples, cols=genes) -----
def find_first(patterns):
    for pat in patterns:
        hits = sorted(glob(str(pat)))
        if hits: return Path(hits[0])
    return None

cand = find_first([
    BASE/"CRISPR_*Chronos*subsetted*.csv", BASE/"CRISPR_*Chronos*.csv",
    Path.home()/"Desktop/CRISPR_*Chronos*subsetted*.csv", Path.home()/"Desktop/CRISPR_*Chronos*.csv",
    Path.home()/"Downloads/CRISPR_*Chronos*subsetted*.csv", Path.home()/"Downloads/CRISPR_*Chronos*.csv",
])
assert cand is not None, "Chronos CSV not found."
dep = pd.read_csv(cand)
print("[dep raw]", dep.shape)

# Heuristic: if MANY column names look like gene symbols (A1BG, A1CF, ...),
# assume this is SAMPLES × GENES and the first column is a sample identifier.
cols_lower = [c.lower() for c in dep.columns]
# pick an ID column if present, else use first column
id_col = next((c for c in dep.columns if c.lower() in {"depmap_id","modelid","model_id","ach_id","sample","cell_line"}), dep.columns[0])
dep = dep.set_index(id_col)
print("[dep samples×genes]", dep.shape)

# ----- extract ACH IDs from the ROW index -----
def extract_ach(s):
    s = str(s)
    m = re.search(r"ACH-\d{6}", s, flags=re.IGNORECASE)
    return m.group(0).upper() if m else None

row_ach = pd.Index([extract_ach(x) for x in dep.index], name="ACH")
mask = [a in shared_set for a in row_ach]
if sum(mask) == 0:
    # sometimes the raw index IS the ACH already
    row_ach = pd.Index([x.upper() if isinstance(x,str) and x.upper().startswith("ACH-") else None for x in dep.index], name="ACH")
    mask = [a in shared_set for a in row_ach]

assert sum(mask) > 0, "Could not extract any ACH-###### from row index; show a few row names to adjust parser."

# keep only shared ACH rows & set ACH as index
dep_rows = dep.loc[mask].copy()
dep_rows.index = row_ach[mask]
dep_rows = dep_rows.loc[dep_rows.index.intersection(shared_ach)]
print("[dep filtered rows (ACH×genes)]", dep_rows.shape)

# ----- transpose to GENES × ACH (what the correlator expects) -----
wide_ach = dep_rows.T  # genes × ACH
# reorder ACH columns to shared order
wide_ach = wide_ach.reindex(columns=shared_ach)
print("[dep genes×ACH]", wide_ach.shape)

# ----- dependency strength = −Chronos (Chronos more negative = more essential) -----
dep_strength = -wide_ach

# ----- load module Z and align -----
mods = {}
for m in ["SAT","IGE","IL2","MPC"]:
    s = pd.read_csv(MODULE_DIR/f"{m}_signed_score_Z.csv", index_col=0).squeeze("columns")
    s = s.reindex(shared_ach).astype(float)
    s.name = m
    mods[m] = s

# ----- correlate (Spearman) per gene -----
records = []
for m, s in mods.items():
    D = dep_strength.loc[:, s.index]  # genes × ACH
    for gene, row in D.iterrows():
        if row.isna().sum() >= len(row) - 3:
            continue
        rho, p = spearmanr(s, row, nan_policy="omit")
        records.append((m, gene, rho, p))

res = pd.DataFrame(records, columns=["module","gene","rho","pval"])
# FDR per module
res["FDR"] = np.nan
for m in res["module"].unique():
    idx = res["module"]==m
    res.loc[idx,"FDR"] = multipletests(res.loc[idx,"pval"], method="fdr_bh")[1]

# ----- save -----
OUTDIR.mkdir(exist_ok=True)
full_path = OUTDIR/"module_vs_dependency_strength_spearman.csv"
res.to_csv(full_path, index=False)
print(f"[saved] {full_path} (rows={len(res)})")

topk = (res.sort_values(["module","FDR","rho"], ascending=[True, True, False])
          .groupby("module").head(50).reset_index(drop=True))
top_path = OUTDIR/"module_vs_dependency_strength_TOP50_per_module.csv"
topk.to_csv(top_path, index=False)
print(f"[saved] {top_path}")

# ----- small key (ACH ↔ PDAC name) for readability -----
pd.Series({ach: ach_to_name.get(ach, ach) for ach in shared_ach}, name="PDAC_line_name")\
  .to_csv(OUTDIR/"ACH_to_name_key.csv")
print(f"[saved] {OUTDIR/'ACH_to_name_key.csv'}")

# ----- quick previews -----
def preview(mod, n=12):
    sub = topk[topk["module"]==mod].head(n)
    if sub.empty:
        print(f"{mod}: (no rows)")
    else:
        print(f"\n[{mod}] top {len(sub)} (ρ>0 ⇒ module-high lines more dependent):")
        print(sub[["gene","rho","FDR"]].to_string(index=False))

for m in ["SAT","IGE","IL2","MPC"]:
    preview(m, 12)

print("\nNOTE: We detected SAMPLES×GENES input, extracted ACH IDs from the **row index**,")
print("      kept the 30 shared ACH, transposed to GENES×ACH, and used −Chronos so ρ>0 = vulnerability.")


[align] Using 30 shared ACH lines.
[dep raw] (45, 17917)
[dep samples×genes] (45, 17916)
[dep filtered rows (ACH×genes)] (27, 17916)
[dep genes×ACH] (17916, 30)
[saved] out_depmap_corr/module_vs_dependency_strength_spearman.csv (rows=71664)
[saved] out_depmap_corr/module_vs_dependency_strength_TOP50_per_module.csv
[saved] out_depmap_corr/ACH_to_name_key.csv

[SAT] top 12 (ρ>0 ⇒ module-high lines more dependent):
   gene       rho      FDR
  STRN3  0.730159 0.137782
  CNPY3 -0.745421 0.137782
  KIF4B -0.693529 0.360975
MAB21L1  0.680098 0.426108
  KRT19  0.668498 0.437118
   ASPM -0.666667 0.437118
  OR1Q1 -0.691304 0.469361
   OCRL  0.636752 0.570898
  KRT31  0.634310 0.570898
  SYDE1  0.633700 0.570898
   TLE6  0.632479 0.570898
L3MBTL1  0.631258 0.570898

[IGE] top 12 (ρ>0 ⇒ module-high lines more dependent):
  gene       rho      FDR
  TBL3  0.754579 0.048723
 VPS4B -0.768010 0.048723
  HSCB  0.731380 0.072466
COL6A6  0.720391 0.072466
RAB39A  0.718559 0.072466
 CENPW -0.724664 0.07

In [14]:
# ===== Two-sided pathway analysis on dependency correlations (robust FDR handling) =====
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import gseapy as gp
from statsmodels.stats.multitest import multipletests

BASE    = Path(".")
INFILE  = BASE/"out_depmap_corr/module_vs_dependency_strength_spearman.csv"
OUTDIR  = BASE/"out_depmap_gsea"; OUTDIR.mkdir(exist_ok=True)

TOP_PLOT = 15
TOP_LE   = 10
TOPK_ORA = 300

# --- Discover available libraries and pick defaults ---
avail = set(gp.get_library_name())
def pick_lib(preferred, fallback_any=None):
    for kw in preferred:
        for lib in avail:
            if kw.lower() in lib.lower():
                return lib
    if fallback_any:
        for lib in avail:
            if fallback_any.lower() in lib.lower():
                return lib
    for lib in ["Reactome_2023","Reactome_2022","GO_Biological_Process_2023","KEGG_2021_Human"]:
        if lib in avail: return lib
    return next(iter(avail)) if avail else None

PRERANK_LIB = pick_lib(["MSigDB_Hallmark","Hallmark"])
ORA_LIB     = pick_lib(["Reactome_2023","Reactome_2022"], fallback_any="Reactome")
if PRERANK_LIB is None:
    raise RuntimeError("No gene-set libraries detected by gseapy.")
print(f"[libs] Using PRERANK_LIB={PRERANK_LIB}  |  ORA_LIB={ORA_LIB}")

# --- Load correlations ---
df = pd.read_csv(INFILE)
assert {"module","gene","rho"}.issubset(df.columns), "Input missing module/gene/rho."
modules = df["module"].dropna().unique().tolist()

def make_ranks(df_mod: pd.DataFrame) -> pd.Series:
    r = (df_mod.groupby("gene", as_index=True)["rho"].mean()).sort_values(ascending=False)
    # tiny deterministic tie-break to avoid duplicated-ranks warning
    tb = pd.Series(np.linspace(0, 1e-9, num=len(r), endpoint=False), index=r.index)
    return r + tb

def normalize_cols(res_df: pd.DataFrame) -> pd.DataFrame:
    # unify column names to snake_case lowercase
    res = res_df.copy()
    res.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in res.columns]
    return res

def ensure_fdr(res: pd.DataFrame) -> pd.DataFrame:
    # map common variants or compute from pval
    colmap = {
        "fdr_q_val": "fdr", "fdr_qval": "fdr", "fdr_q": "fdr",
        "fdr_q_value": "fdr", "fdr_q_value_adj": "fdr",
        "fdr_q_value_(bh)": "fdr", "adj_p_value": "fdr", "adjusted_p_value": "fdr",
    }
    res = res.copy()
    for src, dst in colmap.items():
        if src in res.columns and "fdr" not in res.columns:
            res["fdr"] = res[src]
            break
    if "fdr" not in res.columns:
        if "pval" in res.columns:
            res["fdr"] = multipletests(res["pval"].astype(float), method="fdr_bh")[1]
        elif "p_value" in res.columns:
            res["fdr"] = multipletests(res["p_value"].astype(float), method="fdr_bh")[1]
        else:
            # last resort: make a neutral column so downstream code runs
            res["fdr"] = np.nan
    return res

def extract_leading_edge(res_slice: pd.DataFrame) -> pd.DataFrame:
    out = []
    for term, row in res_slice.iterrows():
        for col in ["lead_genes","ledge_genes","le_genes","leading_edge"]:
            val = row.get(col, None)
            if isinstance(val, str) and val:
                genes = [g for g in val.split(",") if g]
                out.append({"term": term, "genes": ";".join(genes)})
                break
    return pd.DataFrame(out)

def preranked_module(module: str):
    sub = df[df["module"]==module].dropna(subset=["gene","rho"]).copy()
    ranks = make_ranks(sub)
    rnk_df = ranks.reset_index()
    rnk_df.columns = ["gene", "score"]

    pre = gp.prerank(
        rnk=rnk_df,
        gene_sets=PRERANK_LIB,
        threads=4,
        permutation_num=1000,
        min_size=10, max_size=500,
        outdir=None, seed=42, verbose=False,
    )
    res = normalize_cols(pre.res2d)
    res = ensure_fdr(res)

    # some gseapy versions keep gene set name as index, others as a column like 'term' or 'name'
    if "term" in res.columns and res.index.name is None:
        res = res.set_index("term")
    # Sort by FDR safely
    res = res.sort_values("fdr", ascending=True)

    out_csv = OUTDIR/f"{module}_GSEA_{PRERANK_LIB}.csv"
    res.to_csv(out_csv)
    print(f"[saved] {out_csv}")

    pos = res[res["nes"]>0].head(TOP_PLOT)
    neg = res[res["nes"]<0].head(TOP_PLOT)

    if not pos.empty:
        plt.figure(figsize=(7,5))
        plt.barh(pos.index[::-1], pos["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{module} — GSEA ({PRERANK_LIB}) top positive NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(OUTDIR/f"{module}_GSEA_{PRERANK_LIB}_topPOS.png", dpi=300)
        plt.close()

    if not neg.empty:
        plt.figure(figsize=(7,5))
        plt.barh(neg.index[::-1], neg["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{module} — GSEA ({PRERANK_LIB}) top negative NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(OUTDIR/f"{module}_GSEA_{PRERANK_LIB}_topNEG.png", dpi=300)
        plt.close()

    if not pos.empty:
        le_pos = extract_leading_edge(res.loc[pos.index].head(TOP_LE))
        if not le_pos.empty:
            le_pos.to_csv(OUTDIR/f"{module}_GSEA_{PRERANK_LIB}_leading_edge_POS.csv", index=False)
            print(f"[saved] {OUTDIR/f'{module}_GSEA_{PRERANK_LIB}_leading_edge_POS.csv'}")
    if not neg.empty:
        le_neg = extract_leading_edge(res.loc[neg.index].head(TOP_LE))
        if not le_neg.empty:
            le_neg.to_csv(OUTDIR/f"{module}_GSEA_{PRERANK_LIB}_leading_edge_NEG.csv", index=False)
            print(f"[saved] {OUTDIR/f'{module}_GSEA_{PRERANK_LIB}_leading_edge_NEG.csv'}")

    return ranks, res

def ora_on_tails(module: str, ranks: pd.Series):
    pos_genes = ranks.head(TOPK_ORA).index.tolist()
    neg_genes = ranks.tail(TOPK_ORA).index.tolist()

    enr_pos = gp.enrichr(gene_list=pos_genes, gene_sets=ORA_LIB, outdir=None, cutoff=1.0)
    res_pos = normalize_cols(enr_pos.results)
    if "adjusted_p_value" in res_pos.columns and "adjusted_p_value" != "adj_p_value":
        res_pos = res_pos.rename(columns={"adjusted_p_value":"adj_p_value"})
    res_pos.to_csv(OUTDIR/f"{module}_ORApos_{ORA_LIB}.csv", index=False)
    print(f"[saved] {OUTDIR/f'{module}_ORApos_{ORA_LIB}.csv'}")

    enr_neg = gp.enrichr(gene_list=neg_genes, gene_sets=ORA_LIB, outdir=None, cutoff=1.0)
    res_neg = normalize_cols(enr_neg.results)
    if "adjusted_p_value" in res_neg.columns and "adjusted_p_value" != "adj_p_value":
        res_neg = res_neg.rename(columns={"adjusted_p_value":"adj_p_value"})
    res_neg.to_csv(OUTDIR/f"{module}_ORAneg_{ORA_LIB}.csv", index=False)
    print(f"[saved] {OUTDIR/f'{module}_ORAneg_{ORA_LIB}.csv'}")

for m in modules:
    print(f"\n== {m} ==")
    ranks, _res = preranked_module(m)
    ora_on_tails(m, ranks)

print("\nDone. See:", OUTDIR.resolve())
print("Interpretation:")
print("  • Positive NES: pathways enriched among genes more essential in module-high lines (vulnerability biology).")
print("  • Negative NES: pathways enriched among genes less essential in module-high lines (buffering/escape).")
print("  • ORA on tails complements GSEA by focusing on strict top ρ>0 and ρ<0 gene sets.")


[libs] Using PRERANK_LIB=MSigDB_Hallmark_2020  |  ORA_LIB=Reactome_2022

== SAT ==
[saved] out_depmap_gsea/SAT_GSEA_MSigDB_Hallmark_2020.csv
[saved] out_depmap_gsea/SAT_GSEA_MSigDB_Hallmark_2020_leading_edge_POS.csv
[saved] out_depmap_gsea/SAT_GSEA_MSigDB_Hallmark_2020_leading_edge_NEG.csv
[saved] out_depmap_gsea/SAT_ORApos_Reactome_2022.csv
[saved] out_depmap_gsea/SAT_ORAneg_Reactome_2022.csv

== IGE ==
[saved] out_depmap_gsea/IGE_GSEA_MSigDB_Hallmark_2020.csv
[saved] out_depmap_gsea/IGE_GSEA_MSigDB_Hallmark_2020_leading_edge_POS.csv
[saved] out_depmap_gsea/IGE_GSEA_MSigDB_Hallmark_2020_leading_edge_NEG.csv
[saved] out_depmap_gsea/IGE_ORApos_Reactome_2022.csv
[saved] out_depmap_gsea/IGE_ORAneg_Reactome_2022.csv

== IL2 ==
[saved] out_depmap_gsea/IL2_GSEA_MSigDB_Hallmark_2020.csv
[saved] out_depmap_gsea/IL2_GSEA_MSigDB_Hallmark_2020_leading_edge_POS.csv
[saved] out_depmap_gsea/IL2_GSEA_MSigDB_Hallmark_2020_leading_edge_NEG.csv
[saved] out_depmap_gsea/IL2_ORApos_Reactome_2022.csv
[saved

In [15]:
# ===== Compare Hallmark vs Reactome vs BioPlanet for one module (default: IGE) =====
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import gseapy as gp
from statsmodels.stats.multitest import multipletests

# ---------- CONFIG ----------
MODULE      = "IGE"          # <-- change to "SAT", "IL2", "MPC" as needed
INFILE      = Path("out_depmap_corr/module_vs_dependency_strength_spearman.csv")
OUTROOT     = Path("out_depmap_gsea_compare") / MODULE
OUTROOT.mkdir(parents=True, exist_ok=True)

TOP_PLOT    = 15
MIN_SIZE    = 10
MAX_SIZE    = 500
N_PERM      = 1000
THREADS     = 4
SEED        = 42

# ---------- Helpers ----------
def make_ranks(df_mod: pd.DataFrame) -> pd.DataFrame:
    # collapse duplicates, sort desc by rho, add tiny deterministic tie-break
    r = (df_mod.groupby("gene", as_index=True)["rho"].mean()).sort_values(ascending=False)
    tb = pd.Series(np.linspace(0, 1e-9, num=len(r), endpoint=False), index=r.index)
    r = r + tb
    rnk_df = r.reset_index()
    rnk_df.columns = ["gene", "score"]
    return rnk_df

def normalize_cols(res_df: pd.DataFrame) -> pd.DataFrame:
    res = res_df.copy()
    res.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in res.columns]
    # set index to term if present
    if "term" in res.columns and res.index.name is None:
        res = res.set_index("term")
    return res

def ensure_fdr(res: pd.DataFrame) -> pd.DataFrame:
    res = res.copy()
    # map common variants to 'fdr'
    colmap = {
        "fdr_q_val":"fdr","fdr_qval":"fdr","fdr_q":"fdr","fdr_q_value":"fdr",
        "adj_p_value":"fdr","adjusted_p_value":"fdr"
    }
    for src,dst in colmap.items():
        if src in res.columns:
            res["fdr"] = res[src]
            break
    if "fdr" not in res.columns:
        pcol = "pval" if "pval" in res.columns else ("p_value" if "p_value" in res.columns else None)
        if pcol:
            res["fdr"] = multipletests(res[pcol].astype(float), method="fdr_bh")[1]
        else:
            res["fdr"] = np.nan
    return res

def run_prerank(rnk_df: pd.DataFrame, gene_set_name: str, outdir: Path):
    pre = gp.prerank(
        rnk=rnk_df, gene_sets=gene_set_name, threads=THREADS,
        permutation_num=N_PERM, min_size=MIN_SIZE, max_size=MAX_SIZE,
        outdir=None, seed=SEED, verbose=False
    )
    res = ensure_fdr(normalize_cols(pre.res2d))
    res = res.sort_values("fdr", ascending=True)
    # save full result
    csv_path = outdir / f"GSEA_{gene_set_name}.csv"
    res.to_csv(csv_path)
    # plots
    pos = res[res["nes"]>0].head(TOP_PLOT)
    neg = res[res["nes"]<0].head(TOP_PLOT)
    if not pos.empty:
        plt.figure(figsize=(7,5))
        plt.barh(pos.index[::-1], pos["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top positive NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topPOS.png", dpi=300)
        plt.close()
    if not neg.empty:
        plt.figure(figsize=(7,5))
        plt.barh(neg.index[::-1], neg["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top negative NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topNEG.png", dpi=300)
        plt.close()
    return res

# ---------- Pick valid library names on your install ----------
avail = set(gp.get_library_name())  # requires internet for first call; then cached
def pick_like(*candidates):
    for cand in candidates:
        for lib in avail:
            if cand.lower() in lib.lower():
                return lib
    return None

LIB_HALLMARK = pick_like("MSigDB_Hallmark", "Hallmark")
LIB_REACTOME = pick_like("Reactome_2023", "Reactome_2022", "Reactome")
LIB_BIOPLANT = pick_like("BioPlanet_2019", "BioPlanet")

libs = []
if LIB_HALLMARK: libs.append(LIB_HALLMARK)
if LIB_REACTOME: libs.append(LIB_REACTOME)
if LIB_BIOPLANT: libs.append(LIB_BIOPLANT)
if not libs:
    raise RuntimeError("No Hallmark/Reactome/BioPlanet libraries found by gseapy.get_library_name().")

print("[libs]", libs)

# ---------- Load correlations and build ranks for the chosen module ----------
corr = pd.read_csv(INFILE)
assert {"module","gene","rho"}.issubset(corr.columns), "Expected columns: module, gene, rho"
sub = corr[corr["module"]==MODULE].dropna(subset=["gene","rho"]).copy()
rnk_df = make_ranks(sub)

# ---------- Run GSEA for each library and collect a compact summary ----------
summary_rows = []
for lib in libs:
    outdir = OUTROOT / lib
    outdir.mkdir(parents=True, exist_ok=True)
    res = run_prerank(rnk_df, lib, outdir)
    # capture top positives (NES>0) and top negatives (NES<0)
    top_pos = res[res["nes"]>0].head(10).assign(direction="POS", library=lib)
    top_neg = res[res["nes"]<0].head(10).assign(direction="NEG", library=lib)
    summary_rows.append(top_pos[["library","nes","fdr"]].rename_axis("term"))
    summary_rows.append(top_neg[["library","nes","fdr"]].rename_axis("term"))

summary = pd.concat(summary_rows, axis=0)
summary_path = OUTROOT / f"{MODULE}_GSEA_summary_across_libraries.csv"
summary.to_csv(summary_path)
print(f"[saved] {summary_path}")

print("\nDone. Outputs per library in:", OUTROOT.resolve())
print("Interpretation:")
print("  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).")
print("  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).")
print("  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.")


[libs] ['MSigDB_Hallmark_2020', 'Reactome_2022', 'BioPlanet_2019']


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()


[saved] out_depmap_gsea_compare/IGE/IGE_GSEA_summary_across_libraries.csv

Done. Outputs per library in: /Users/scottpowers/My Drive/out_depmap_gsea_compare/IGE
Interpretation:
  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).
  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).
  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.


In [16]:
# ===== Compare Hallmark vs Reactome vs BioPlanet for one module (default: SAT) =====
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import gseapy as gp
from statsmodels.stats.multitest import multipletests

# ---------- CONFIG ----------
MODULE      = "SAT"          # <-- change to "SAT", "IL2", "MPC" as needed
INFILE      = Path("out_depmap_corr/module_vs_dependency_strength_spearman.csv")
OUTROOT     = Path("out_depmap_gsea_compare") / MODULE
OUTROOT.mkdir(parents=True, exist_ok=True)

TOP_PLOT    = 15
MIN_SIZE    = 10
MAX_SIZE    = 500
N_PERM      = 1000
THREADS     = 4
SEED        = 42

# ---------- Helpers ----------
def make_ranks(df_mod: pd.DataFrame) -> pd.DataFrame:
    # collapse duplicates, sort desc by rho, add tiny deterministic tie-break
    r = (df_mod.groupby("gene", as_index=True)["rho"].mean()).sort_values(ascending=False)
    tb = pd.Series(np.linspace(0, 1e-9, num=len(r), endpoint=False), index=r.index)
    r = r + tb
    rnk_df = r.reset_index()
    rnk_df.columns = ["gene", "score"]
    return rnk_df

def normalize_cols(res_df: pd.DataFrame) -> pd.DataFrame:
    res = res_df.copy()
    res.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in res.columns]
    # set index to term if present
    if "term" in res.columns and res.index.name is None:
        res = res.set_index("term")
    return res

def ensure_fdr(res: pd.DataFrame) -> pd.DataFrame:
    res = res.copy()
    # map common variants to 'fdr'
    colmap = {
        "fdr_q_val":"fdr","fdr_qval":"fdr","fdr_q":"fdr","fdr_q_value":"fdr",
        "adj_p_value":"fdr","adjusted_p_value":"fdr"
    }
    for src,dst in colmap.items():
        if src in res.columns:
            res["fdr"] = res[src]
            break
    if "fdr" not in res.columns:
        pcol = "pval" if "pval" in res.columns else ("p_value" if "p_value" in res.columns else None)
        if pcol:
            res["fdr"] = multipletests(res[pcol].astype(float), method="fdr_bh")[1]
        else:
            res["fdr"] = np.nan
    return res

def run_prerank(rnk_df: pd.DataFrame, gene_set_name: str, outdir: Path):
    pre = gp.prerank(
        rnk=rnk_df, gene_sets=gene_set_name, threads=THREADS,
        permutation_num=N_PERM, min_size=MIN_SIZE, max_size=MAX_SIZE,
        outdir=None, seed=SEED, verbose=False
    )
    res = ensure_fdr(normalize_cols(pre.res2d))
    res = res.sort_values("fdr", ascending=True)
    # save full result
    csv_path = outdir / f"GSEA_{gene_set_name}.csv"
    res.to_csv(csv_path)
    # plots
    pos = res[res["nes"]>0].head(TOP_PLOT)
    neg = res[res["nes"]<0].head(TOP_PLOT)
    if not pos.empty:
        plt.figure(figsize=(7,5))
        plt.barh(pos.index[::-1], pos["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top positive NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topPOS.png", dpi=300)
        plt.close()
    if not neg.empty:
        plt.figure(figsize=(7,5))
        plt.barh(neg.index[::-1], neg["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top negative NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topNEG.png", dpi=300)
        plt.close()
    return res

# ---------- Pick valid library names on your install ----------
avail = set(gp.get_library_name())  # requires internet for first call; then cached
def pick_like(*candidates):
    for cand in candidates:
        for lib in avail:
            if cand.lower() in lib.lower():
                return lib
    return None

LIB_HALLMARK = pick_like("MSigDB_Hallmark", "Hallmark")
LIB_REACTOME = pick_like("Reactome_2023", "Reactome_2022", "Reactome")
LIB_BIOPLANT = pick_like("BioPlanet_2019", "BioPlanet")

libs = []
if LIB_HALLMARK: libs.append(LIB_HALLMARK)
if LIB_REACTOME: libs.append(LIB_REACTOME)
if LIB_BIOPLANT: libs.append(LIB_BIOPLANT)
if not libs:
    raise RuntimeError("No Hallmark/Reactome/BioPlanet libraries found by gseapy.get_library_name().")

print("[libs]", libs)

# ---------- Load correlations and build ranks for the chosen module ----------
corr = pd.read_csv(INFILE)
assert {"module","gene","rho"}.issubset(corr.columns), "Expected columns: module, gene, rho"
sub = corr[corr["module"]==MODULE].dropna(subset=["gene","rho"]).copy()
rnk_df = make_ranks(sub)

# ---------- Run GSEA for each library and collect a compact summary ----------
summary_rows = []
for lib in libs:
    outdir = OUTROOT / lib
    outdir.mkdir(parents=True, exist_ok=True)
    res = run_prerank(rnk_df, lib, outdir)
    # capture top positives (NES>0) and top negatives (NES<0)
    top_pos = res[res["nes"]>0].head(10).assign(direction="POS", library=lib)
    top_neg = res[res["nes"]<0].head(10).assign(direction="NEG", library=lib)
    summary_rows.append(top_pos[["library","nes","fdr"]].rename_axis("term"))
    summary_rows.append(top_neg[["library","nes","fdr"]].rename_axis("term"))

summary = pd.concat(summary_rows, axis=0)
summary_path = OUTROOT / f"{MODULE}_GSEA_summary_across_libraries.csv"
summary.to_csv(summary_path)
print(f"[saved] {summary_path}")

print("\nDone. Outputs per library in:", OUTROOT.resolve())
print("Interpretation:")
print("  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).")
print("  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).")
print("  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.")


[libs] ['MSigDB_Hallmark_2020', 'Reactome_2022', 'BioPlanet_2019']
[saved] out_depmap_gsea_compare/SAT/SAT_GSEA_summary_across_libraries.csv

Done. Outputs per library in: /Users/scottpowers/My Drive/out_depmap_gsea_compare/SAT
Interpretation:
  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).
  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).
  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.


In [17]:
# ===== Compare Hallmark vs Reactome vs BioPlanet for one module (default: IL2) =====
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import gseapy as gp
from statsmodels.stats.multitest import multipletests

# ---------- CONFIG ----------
MODULE      = "IL2"          # <-- change to "IL2", "IL2", "MPC" as needed
INFILE      = Path("out_depmap_corr/module_vs_dependency_strength_spearman.csv")
OUTROOT     = Path("out_depmap_gsea_compare") / MODULE
OUTROOT.mkdir(parents=True, exist_ok=True)

TOP_PLOT    = 15
MIN_SIZE    = 10
MAX_SIZE    = 500
N_PERM      = 1000
THREADS     = 4
SEED        = 42

# ---------- Helpers ----------
def make_ranks(df_mod: pd.DataFrame) -> pd.DataFrame:
    # collapse duplicates, sort desc by rho, add tiny deterministic tie-break
    r = (df_mod.groupby("gene", as_index=True)["rho"].mean()).sort_values(ascending=False)
    tb = pd.Series(np.linspace(0, 1e-9, num=len(r), endpoint=False), index=r.index)
    r = r + tb
    rnk_df = r.reset_index()
    rnk_df.columns = ["gene", "score"]
    return rnk_df

def normalize_cols(res_df: pd.DataFrame) -> pd.DataFrame:
    res = res_df.copy()
    res.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in res.columns]
    # set index to term if present
    if "term" in res.columns and res.index.name is None:
        res = res.set_index("term")
    return res

def ensure_fdr(res: pd.DataFrame) -> pd.DataFrame:
    res = res.copy()
    # map common variants to 'fdr'
    colmap = {
        "fdr_q_val":"fdr","fdr_qval":"fdr","fdr_q":"fdr","fdr_q_value":"fdr",
        "adj_p_value":"fdr","adjusted_p_value":"fdr"
    }
    for src,dst in colmap.items():
        if src in res.columns:
            res["fdr"] = res[src]
            break
    if "fdr" not in res.columns:
        pcol = "pval" if "pval" in res.columns else ("p_value" if "p_value" in res.columns else None)
        if pcol:
            res["fdr"] = multipletests(res[pcol].astype(float), method="fdr_bh")[1]
        else:
            res["fdr"] = np.nan
    return res

def run_prerank(rnk_df: pd.DataFrame, gene_set_name: str, outdir: Path):
    pre = gp.prerank(
        rnk=rnk_df, gene_sets=gene_set_name, threads=THREADS,
        permutation_num=N_PERM, min_size=MIN_SIZE, max_size=MAX_SIZE,
        outdir=None, seed=SEED, verbose=False
    )
    res = ensure_fdr(normalize_cols(pre.res2d))
    res = res.sort_values("fdr", ascending=True)
    # save full result
    csv_path = outdir / f"GSEA_{gene_set_name}.csv"
    res.to_csv(csv_path)
    # plots
    pos = res[res["nes"]>0].head(TOP_PLOT)
    neg = res[res["nes"]<0].head(TOP_PLOT)
    if not pos.empty:
        plt.figure(figsize=(7,5))
        plt.barh(pos.index[::-1], pos["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top positive NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topPOS.png", dpi=300)
        plt.close()
    if not neg.empty:
        plt.figure(figsize=(7,5))
        plt.barh(neg.index[::-1], neg["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top negative NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topNEG.png", dpi=300)
        plt.close()
    return res

# ---------- Pick valid library names on your install ----------
avail = set(gp.get_library_name())  # requires internet for first call; then cached
def pick_like(*candidates):
    for cand in candidates:
        for lib in avail:
            if cand.lower() in lib.lower():
                return lib
    return None

LIB_HALLMARK = pick_like("MSigDB_Hallmark", "Hallmark")
LIB_REACTOME = pick_like("Reactome_2023", "Reactome_2022", "Reactome")
LIB_BIOPLANT = pick_like("BioPlanet_2019", "BioPlanet")

libs = []
if LIB_HALLMARK: libs.append(LIB_HALLMARK)
if LIB_REACTOME: libs.append(LIB_REACTOME)
if LIB_BIOPLANT: libs.append(LIB_BIOPLANT)
if not libs:
    raise RuntimeError("No Hallmark/Reactome/BioPlanet libraries found by gseapy.get_library_name().")

print("[libs]", libs)

# ---------- Load correlations and build ranks for the chosen module ----------
corr = pd.read_csv(INFILE)
assert {"module","gene","rho"}.issubset(corr.columns), "Expected columns: module, gene, rho"
sub = corr[corr["module"]==MODULE].dropna(subset=["gene","rho"]).copy()
rnk_df = make_ranks(sub)

# ---------- Run GSEA for each library and collect a compact summary ----------
summary_rows = []
for lib in libs:
    outdir = OUTROOT / lib
    outdir.mkdir(parents=True, exist_ok=True)
    res = run_prerank(rnk_df, lib, outdir)
    # capture top positives (NES>0) and top negatives (NES<0)
    top_pos = res[res["nes"]>0].head(10).assign(direction="POS", library=lib)
    top_neg = res[res["nes"]<0].head(10).assign(direction="NEG", library=lib)
    summary_rows.append(top_pos[["library","nes","fdr"]].rename_axis("term"))
    summary_rows.append(top_neg[["library","nes","fdr"]].rename_axis("term"))

summary = pd.concat(summary_rows, axis=0)
summary_path = OUTROOT / f"{MODULE}_GSEA_summary_across_libraries.csv"
summary.to_csv(summary_path)
print(f"[saved] {summary_path}")

print("\nDone. Outputs per library in:", OUTROOT.resolve())
print("Interpretation:")
print("  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).")
print("  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).")
print("  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.")


[libs] ['MSigDB_Hallmark_2020', 'Reactome_2022', 'BioPlanet_2019']


  plt.tight_layout()
  plt.tight_layout()


[saved] out_depmap_gsea_compare/IL2/IL2_GSEA_summary_across_libraries.csv

Done. Outputs per library in: /Users/scottpowers/My Drive/out_depmap_gsea_compare/IL2
Interpretation:
  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).
  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).
  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.


In [18]:
# ===== Compare Hallmark vs Reactome vs BioPlanet for one module (default: MPC) =====
import pandas as pd, numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import gseapy as gp
from statsmodels.stats.multitest import multipletests

# ---------- CONFIG ----------
MODULE      = "MPC"          # <-- change to "MPC", "IL2", "MPC" as needed
INFILE      = Path("out_depmap_corr/module_vs_dependency_strength_spearman.csv")
OUTROOT     = Path("out_depmap_gsea_compare") / MODULE
OUTROOT.mkdir(parents=True, exist_ok=True)

TOP_PLOT    = 15
MIN_SIZE    = 10
MAX_SIZE    = 500
N_PERM      = 1000
THREADS     = 4
SEED        = 42

# ---------- Helpers ----------
def make_ranks(df_mod: pd.DataFrame) -> pd.DataFrame:
    # collapse duplicates, sort desc by rho, add tiny deterministic tie-break
    r = (df_mod.groupby("gene", as_index=True)["rho"].mean()).sort_values(ascending=False)
    tb = pd.Series(np.linspace(0, 1e-9, num=len(r), endpoint=False), index=r.index)
    r = r + tb
    rnk_df = r.reset_index()
    rnk_df.columns = ["gene", "score"]
    return rnk_df

def normalize_cols(res_df: pd.DataFrame) -> pd.DataFrame:
    res = res_df.copy()
    res.columns = [c.strip().lower().replace(" ", "_").replace("-", "_") for c in res.columns]
    # set index to term if present
    if "term" in res.columns and res.index.name is None:
        res = res.set_index("term")
    return res

def ensure_fdr(res: pd.DataFrame) -> pd.DataFrame:
    res = res.copy()
    # map common variants to 'fdr'
    colmap = {
        "fdr_q_val":"fdr","fdr_qval":"fdr","fdr_q":"fdr","fdr_q_value":"fdr",
        "adj_p_value":"fdr","adjusted_p_value":"fdr"
    }
    for src,dst in colmap.items():
        if src in res.columns:
            res["fdr"] = res[src]
            break
    if "fdr" not in res.columns:
        pcol = "pval" if "pval" in res.columns else ("p_value" if "p_value" in res.columns else None)
        if pcol:
            res["fdr"] = multipletests(res[pcol].astype(float), method="fdr_bh")[1]
        else:
            res["fdr"] = np.nan
    return res

def run_prerank(rnk_df: pd.DataFrame, gene_set_name: str, outdir: Path):
    pre = gp.prerank(
        rnk=rnk_df, gene_sets=gene_set_name, threads=THREADS,
        permutation_num=N_PERM, min_size=MIN_SIZE, max_size=MAX_SIZE,
        outdir=None, seed=SEED, verbose=False
    )
    res = ensure_fdr(normalize_cols(pre.res2d))
    res = res.sort_values("fdr", ascending=True)
    # save full result
    csv_path = outdir / f"GSEA_{gene_set_name}.csv"
    res.to_csv(csv_path)
    # plots
    pos = res[res["nes"]>0].head(TOP_PLOT)
    neg = res[res["nes"]<0].head(TOP_PLOT)
    if not pos.empty:
        plt.figure(figsize=(7,5))
        plt.barh(pos.index[::-1], pos["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top positive NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topPOS.png", dpi=300)
        plt.close()
    if not neg.empty:
        plt.figure(figsize=(7,5))
        plt.barh(neg.index[::-1], neg["nes"][::-1])
        plt.axvline(0, ls="--", lw=1)
        plt.title(f"{MODULE} — {gene_set_name} top negative NES")
        plt.xlabel("Normalized Enrichment Score (NES)")
        plt.tight_layout()
        plt.savefig(outdir / f"{gene_set_name}_topNEG.png", dpi=300)
        plt.close()
    return res

# ---------- Pick valid library names on your install ----------
avail = set(gp.get_library_name())  # requires internet for first call; then cached
def pick_like(*candidates):
    for cand in candidates:
        for lib in avail:
            if cand.lower() in lib.lower():
                return lib
    return None

LIB_HALLMARK = pick_like("MSigDB_Hallmark", "Hallmark")
LIB_REACTOME = pick_like("Reactome_2023", "Reactome_2022", "Reactome")
LIB_BIOPLANT = pick_like("BioPlanet_2019", "BioPlanet")

libs = []
if LIB_HALLMARK: libs.append(LIB_HALLMARK)
if LIB_REACTOME: libs.append(LIB_REACTOME)
if LIB_BIOPLANT: libs.append(LIB_BIOPLANT)
if not libs:
    raise RuntimeError("No Hallmark/Reactome/BioPlanet libraries found by gseapy.get_library_name().")

print("[libs]", libs)

# ---------- Load correlations and build ranks for the chosen module ----------
corr = pd.read_csv(INFILE)
assert {"module","gene","rho"}.issubset(corr.columns), "Expected columns: module, gene, rho"
sub = corr[corr["module"]==MODULE].dropna(subset=["gene","rho"]).copy()
rnk_df = make_ranks(sub)

# ---------- Run GSEA for each library and collect a compact summary ----------
summary_rows = []
for lib in libs:
    outdir = OUTROOT / lib
    outdir.mkdir(parents=True, exist_ok=True)
    res = run_prerank(rnk_df, lib, outdir)
    # capture top positives (NES>0) and top negatives (NES<0)
    top_pos = res[res["nes"]>0].head(10).assign(direction="POS", library=lib)
    top_neg = res[res["nes"]<0].head(10).assign(direction="NEG", library=lib)
    summary_rows.append(top_pos[["library","nes","fdr"]].rename_axis("term"))
    summary_rows.append(top_neg[["library","nes","fdr"]].rename_axis("term"))

summary = pd.concat(summary_rows, axis=0)
summary_path = OUTROOT / f"{MODULE}_GSEA_summary_across_libraries.csv"
summary.to_csv(summary_path)
print(f"[saved] {summary_path}")

print("\nDone. Outputs per library in:", OUTROOT.resolve())
print("Interpretation:")
print("  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).")
print("  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).")
print("  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.")


[libs] ['MSigDB_Hallmark_2020', 'Reactome_2022', 'BioPlanet_2019']


  plt.tight_layout()
  plt.tight_layout()


[saved] out_depmap_gsea_compare/MPC/MPC_GSEA_summary_across_libraries.csv

Done. Outputs per library in: /Users/scottpowers/My Drive/out_depmap_gsea_compare/MPC
Interpretation:
  • Positive NES ⇒ pathways enriched among genes more essential in module-high lines (putative vulnerabilities).
  • Negative NES ⇒ pathways enriched among genes less essential in module-high lines (buffering/escape).
  • Hallmark gives clean overview; Reactome and BioPlanet add targetable, mechanistic detail.


In [19]:
import pandas as pd
from itertools import combinations

os.chdir("/Users/scottpowers/Desktop")

# --- load & clean ---
df = pd.read_csv("module_correlations.csv", skipinitialspace=True)  # comma-separated
df.columns = df.columns.str.strip().str.lower()

# keep only rows that satisfy your dependency criterion
crit = (df['rho'] < 0) & (df['pval'] < 0.05)
neg = df.loc[crit, ['module','gene']].drop_duplicates()

# set of genes per module
mods = ['IGE','SAT','IL2','MPC']
S = {m: set(neg.loc[neg.module==m, 'gene']) for m in mods}

# --- pairwise overlaps & Jaccard ---
pair_rows = []
for a,b in combinations(mods, 2):
    inter = S[a] & S[b]
    union = S[a] | S[b]
    pair_rows.append({
        'pair': f'{a}∩{b}',
        'overlap_n': len(inter),
        'size_'+a: len(S[a]),
        'size_'+b: len(S[b]),
        'jaccard': (len(inter)/len(union)) if union else 0.0
    })
pair_df = pd.DataFrame(pair_rows).sort_values('overlap_n', ascending=False)

# --- triple overlaps ---
tri_rows = []
for a,b,c in combinations(mods, 3):
    inter = S[a] & S[b] & S[c]
    tri_rows.append({'triple': f'{a}∩{b}∩{c}', 'overlap_n': len(inter)})
tri_df = pd.DataFrame(tri_rows).sort_values('overlap_n', ascending=False)

# --- genes shared by >= 2 modules ---
counts = (neg.assign(val=1)
            .pivot_table(index='gene', columns='module', values='val', fill_value=0)
            .reindex(columns=mods, fill_value=0))
counts['n_modules'] = counts.sum(1)
multi = counts[counts['n_modules']>=2].sort_values(['n_modules']+mods, ascending=False)

# save detailed outputs
counts.to_csv("dependency_flags_by_gene.csv")  # 1/0 flags per module + n_modules
pair_df.to_csv("pairwise_dependency_overlap.csv", index=False)
tri_df.to_csv("triple_dependency_overlap.csv", index=False)
multi.to_csv("genes_shared_in_2plus_modules.csv")

# quick text previews
print("Pairwise overlaps:\n", pair_df.head(10), "\n")
print("Triple overlaps:\n", tri_df, "\n")
print("Top genes present in >=2 modules:\n", multi.head(25))


Pairwise overlaps:
       pair  overlap_n  size_IGE  size_SAT   jaccard  size_IL2  size_MPC
4  SAT∩MPC         74       NaN     458.0  0.081140       NaN     528.0
3  SAT∩IL2         66       NaN     458.0  0.084942     385.0       NaN
5  IL2∩MPC         48       NaN       NaN  0.055491     385.0     528.0
0  IGE∩SAT          8     622.0     458.0  0.007463       NaN       NaN
1  IGE∩IL2          3     622.0       NaN  0.002988     385.0       NaN
2  IGE∩MPC          0     622.0       NaN  0.000000       NaN     528.0 

Triple overlaps:
         triple  overlap_n
3  SAT∩IL2∩MPC         10
0  IGE∩SAT∩IL2          1
1  IGE∩SAT∩MPC          0
2  IGE∩IL2∩MPC          0 

Top genes present in >=2 modules:
 module    IGE  SAT  IL2  MPC  n_modules
gene                                   
TBC1D7    1.0  1.0  1.0  0.0        3.0
ACO2      0.0  1.0  1.0  1.0        3.0
ARFGAP1   0.0  1.0  1.0  1.0        3.0
ASB11     0.0  1.0  1.0  1.0        3.0
ASPM      0.0  1.0  1.0  1.0        3.0
C11orf16 

In [20]:
# ==== Fig 6A — Overlap of dependency genes from shared_dependencies.csv (LONG format) ====
# CONFIG
PATH = "shared_dependencies.csv"          # long table with columns: module,gene,rho,pval,FDR
MODULES = ["SAT","IGE","IL2","MPC"]       # order will be preserved in the plot

HIT_METHOD = "fdr"    # "fdr" or "topn"
Q_MAX   = 0.10        # used if HIT_METHOD == "fdr"
ABS_R   = 0.20        # used if HIT_METHOD == "fdr"
TOP_N   = 100         # used if HIT_METHOD == "topn" or as fallback when no FDR hits

OUT = "Fig6A_UpSet"
TOP_INTERSECTIONS = 15

# ---------------------------------------------------
import pandas as pd, numpy as np, itertools, re

# Use a headless backend to avoid GUI/freetype issues
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

def _clean_symbol(s):
    s = str(s)
    s = s.replace("\u00A0"," ").replace("\u2009"," ").strip()
    s = re.sub(r"\s+", "", s)  # drop internal spaces: e.g., "SOS 1" -> "SOS1"
    return s.upper()

# ---- load & standardize ----
df = pd.read_csv(PATH)
need = ["module","gene","rho","pval","FDR"]
missing = [c for c in need if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns {missing}; found {list(df.columns)}")

df["module"] = df["module"].astype(str).str.upper().str.strip()
df["gene"]   = df["gene"].map(_clean_symbol)
for c in ["rho","pval","FDR"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df = df[df["module"].isin([m.upper() for m in MODULES])].dropna(subset=["gene","rho"])

# ---- call hits per module ----
hits = {}
for mod in MODULES:
    sub = df[df["module"] == mod].dropna(subset=["rho"])
    if HIT_METHOD.lower() == "fdr":
        kept = sub[(sub["FDR"] <= Q_MAX) & (sub["rho"].abs() >= ABS_R)]
        if kept.empty:
            kept = sub.sort_values("rho", key=lambda s: s.abs(), ascending=False).head(TOP_N)
    elif HIT_METHOD.lower() == "topn":
        kept = sub.sort_values("rho", key=lambda s: s.abs(), ascending=False).head(TOP_N)
    else:
        raise ValueError("HIT_METHOD must be 'fdr' or 'topn'")
    genes = sorted(set(kept["gene"]))
    hits[mod] = set(genes)
    pd.Series(genes).to_csv(f"{mod}_hits.txt", index=False, header=False)

# ---- wide 0/1 table ----
all_genes = sorted(set().union(*hits.values())) if hits else []
wide = pd.DataFrame({"gene": all_genes})
for mod in MODULES:
    wide[mod] = wide["gene"].isin(hits.get(mod, set())).astype(int)
wide.to_csv("dependency_hits.csv", index=False)

# ---- membership + exclusive intersections (UpSet-style) ----
mdf = wide.set_index("gene")[MODULES].astype(bool)

def exclusive_intersections(mdf_bool):
    cols = list(mdf_bool.columns)
    rows = []
    for r in range(1, len(cols)+1):
        for comb in itertools.combinations(cols, r):
            mask = mdf_bool[list(comb)].all(axis=1)
            others = [c for c in cols if c not in comb]
            if others:
                mask &= ~mdf_bool[others].any(axis=1)
            rows.append({"intersection":"+".join(comb), "k":r, "count":int(mask.sum())})
    return pd.DataFrame(rows).sort_values(["count","k","intersection"], ascending=[False,True,True])

inter_df = exclusive_intersections(mdf)
inter_df.to_csv(f"{OUT}_intersections_all.tsv", sep="\t", index=False)
inter_top = inter_df.head(TOP_INTERSECTIONS)
inter_top.to_csv(f"{OUT}_intersections_top{TOP_INTERSECTIONS}.tsv", sep="\t", index=False)

# ---- plot (bars + dot matrix + per-set sizes) ----
set_sizes = mdf.sum(axis=0).astype(int).sort_values(ascending=False)

fig = plt.figure(figsize=(8,6), constrained_layout=False)
gs = fig.add_gridspec(nrows=4, ncols=5,
                      height_ratios=[5,0.4,1.8,0.2],
                      width_ratios=[5,0.6,0.6,0.6,2.0])

ax_bar  = fig.add_subplot(gs[0, 0:4])
ax_dots = fig.add_subplot(gs[2, 0:4])
ax_set  = fig.add_subplot(gs[0:3, 4])

# Bars
x = np.arange(len(inter_top))
ax_bar.bar(x, inter_top["count"].values)
ax_bar.set_ylabel("Genes in intersection")
ax_bar.set_xticks([])
ax_bar.yaxis.set_major_locator(MaxNLocator(integer=True))
ax_bar.set_title("Dependency Gene Overlap — PDAC Modules", pad=10)
ymax = inter_top["count"].max() if len(inter_top) else 0
for xi, val in zip(x, inter_top["count"].values):
    ax_bar.text(xi, val + max(1, 0.02*max(1, ymax)), str(val),
                ha="center", va="bottom", fontsize=9)

# Dot matrix
sets = MODULES[:]  # preserve order
k = len(sets)
mat = np.zeros((k, len(inter_top)), dtype=int)
for col_i, inter_name in enumerate(inter_top["intersection"].tolist()):
    active = inter_name.split("+")
    for row_i, s in enumerate(sets):
        mat[row_i, col_i] = 1 if s in active else 0
for row_i in range(k):
    where = mat[row_i]==1
    ax_dots.scatter(x[where], np.full(where.sum(), k-1-row_i), s=60)
    for col_i in range(len(x)):
        if mat[row_i, col_i]==1:
            active_rows = [ri for ri in range(k) if mat[ri, col_i]==1]
            if active_rows:
                ax_dots.plot([col_i,col_i], [k-1-active_rows[0], k-1-active_rows[-1]], linewidth=1)
ax_dots.set_yticks(range(k)); ax_dots.set_yticklabels(list(reversed(sets)))
ax_dots.set_xlabel("Intersections (top by size)")
ax_dots.set_xlim(-0.5, len(x)-0.5)
ax_dots.set_xticks(range(len(x)))
ax_dots.set_xticklabels(inter_top["intersection"].tolist(), rotation=45, ha="right", fontsize=8)
ax_dots.set_ylim(-0.5, k-0.5)

# Per-set sizes
sizes_sorted = set_sizes
y = np.arange(len(sizes_sorted))
ax_set.barh(y, sizes_sorted.values)
ax_set.set_yticks(y); ax_set.set_yticklabels(sizes_sorted.index.tolist())
ax_set.invert_yaxis()
ax_set.set_xlabel("Genes per module")
ax_set.xaxis.set_major_locator(MaxNLocator(integer=True))

plt.tight_layout()
# Always save PNG; try PDF but don't crash if PDF backend is broken
fig.savefig(f"{OUT}.png", dpi=300, bbox_inches="tight")
try:
    fig.savefig(f"{OUT}.pdf", dpi=300, bbox_inches="tight")
except Exception as e:
    print(f"⚠️ PDF save failed; PNG written. Reason: {e}")
plt.close(fig)

print("✅ Fig 6A built")
print(f"  - {OUT}.png (+ PDF if available)")
print("  - dependency_hits.csv + per-module *_hits.txt")
print(f"  - {OUT}_intersections_all.tsv + _top{TOP_INTERSECTIONS}.tsv")
print("Hit method:", HIT_METHOD, "| Q_MAX:", Q_MAX, "ABS_R:", ABS_R, "TOP_N:", TOP_N)


✅ Fig 6A built
  - Fig6A_UpSet.png (+ PDF if available)
  - dependency_hits.csv + per-module *_hits.txt
  - Fig6A_UpSet_intersections_all.tsv + _top15.tsv
Hit method: fdr | Q_MAX: 0.1 ABS_R: 0.2 TOP_N: 100


In [21]:
# ==== Fig 6B — Pairwise overlap (Jaccard + Fisher exact), diagonal grayed & excluded from scales ====
IN   = "dependency_hits.csv"
OUT  = "Fig6B_Overlap"
MODULES = ["SAT","IGE","IL2","MPC"]  # keep same order as Fig 6A

import pandas as pd, numpy as np
# Headless backend
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ---------- load ----------
wide = pd.read_csv(IN)
wide["gene"] = wide["gene"].astype(str)
for m in MODULES:
    if m not in wide.columns:
        wide[m] = 0
wide = wide[["gene"] + MODULES].copy()
wide[MODULES] = wide[MODULES].astype(int)

# ---------- helpers ----------
def jaccard(a, b):
    inter = int(((a==1)&(b==1)).sum())
    union = int(((a==1)|(b==1)).sum())
    return (inter / union) if union > 0 else 0.0, inter

def fishers_exact(a, b):
    a1 = int((a==1).sum()); b1 = int((b==1).sum())
    a0 = len(a) - a1;       b0 = len(b) - b1
    a1b1 = int(((a==1)&(b==1)).sum())
    a1b0 = a1 - a1b1
    a0b1 = b1 - a1b1
    a0b0 = len(a) - (a1b1 + a1b0 + a0b1)
    try:
        from scipy.stats import fisher_exact
        _, p = fisher_exact([[a1b1, a1b0],[a0b1, a0b0]], alternative="two-sided")
    except Exception:
        # hypergeometric tail fallback
        N = len(a)
        from math import comb
        def logC(n,k): return np.log(comb(n,k))
        def pmf(x):
            return np.exp(logC(b1,x)+logC(N-b1,a1-x)-logC(N,a1))
        x_obs = a1b1
        support = range(max(0, a1+b1-N), min(a1,b1)+1)
        p_obs = pmf(x_obs)
        p = sum(pmf(x) for x in support if pmf(x) <= p_obs)
        p = min(1.0, float(p))
    return p, a1b1

# ---------- compute matrices ----------
cols = MODULES
k = len(cols)
J    = np.zeros((k,k), dtype=float)   # Jaccard
Nint = np.zeros((k,k), dtype=int)     # |A∩B|
Q    = np.ones((k,k), dtype=float)    # BH-FDR q

# raw p for BH
pairs = []
for i, mi in enumerate(cols):
    ai = wide[mi].values
    for j, mj in enumerate(cols):
        aj = wide[mj].values
        jv, inter = jaccard(ai, aj)
        J[i,j] = jv
        Nint[i,j] = inter
        if i < j:
            p, _ = fishers_exact(ai, aj)
            pairs.append(((i,j), p))

# BH-FDR on upper triangle
if pairs:
    idx, pvals = zip(*pairs)
    pvals = np.array(pvals, float)
    order = np.argsort(pvals)
    m = len(pvals)
    ranks = np.arange(1, m+1)[np.argsort(order)]
    q_sorted = np.minimum.accumulate((pvals[order] * m / np.arange(1, m+1))[::-1])[::-1]
    q = np.empty_like(pvals); q[order] = q_sorted
    for (i,j), qv in zip(idx, q):
        Q[i,j] = Q[j,i] = qv
np.fill_diagonal(Q, np.nan)  # diagonal ignored

# ---------- plotting ----------
fig, axes = plt.subplots(1, 2, figsize=(8,4.8))
(ax1, ax2) = axes

# Panel 1: Jaccard (diagonal gray & excluded from vmin/vmax)
J_plot = J.copy().astype(float)
np.fill_diagonal(J_plot, np.nan)
offdiag_J = J_plot[~np.isnan(J_plot)]
vmin1, vmax1 = 0.0, (offdiag_J.max() if offdiag_J.size else 1.0)

cmap1 = plt.cm.viridis.copy()
cmap1.set_bad(color="lightgray")  # diagonal
im1 = ax1.imshow(J_plot, vmin=vmin1, vmax=vmax1, cmap=cmap1)
ax1.set_xticks(range(k)); ax1.set_yticks(range(k))
ax1.set_xticklabels(cols); ax1.set_yticklabels(cols)
ax1.set_title("Jaccard overlap")
# annotate with |∩| counts except diagonal
for i in range(k):
    for j in range(k):
        if i != j:
            ax1.text(j, i, f"{Nint[i,j]}", ha="center", va="center", fontsize=9)

cbar1 = fig.colorbar(im1, ax=ax1, fraction=0.046, pad=0.04)
cbar1.set_label("Jaccard index")

# Panel 2: Fisher −log10(q) (diagonal gray & excluded from scale)
neglogQ = -np.log10(np.clip(Q, 1e-300, 1.0))
np.fill_diagonal(neglogQ, np.nan)
offdiag_neglog = neglogQ[~np.isnan(neglogQ)]
vmin2, vmax2 = 0.0, (offdiag_neglog.max() if offdiag_neglog.size else 1.0)

cmap2 = plt.cm.magma.copy()
cmap2.set_bad(color="lightgray")  # diagonal
im2 = ax2.imshow(neglogQ, vmin=vmin2, vmax=vmax2, cmap=cmap2)
ax2.set_xticks(range(k)); ax2.set_yticks(range(k))
ax2.set_xticklabels(cols); ax2.set_yticklabels(cols)
ax2.set_title("Fisher overlap (−log10 q)")

# significance stars (skip diagonal)
def stars(q):
    if np.isnan(q): return ""
    if q < 0.001: return "***"
    if q < 0.01:  return "**"
    if q < 0.05:  return "*"
    return ""

for i in range(k):
    for j in range(k):
        s = stars(Q[i,j])
        if s:
            ax2.text(j, i, s, ha="center", va="center", fontsize=12, fontweight="bold")

cbar2 = fig.colorbar(im2, ax=ax2, fraction=0.046, pad=0.04)
cbar2.set_label("−log10(q)")

plt.suptitle("Dependency Gene Overlap: Pairwise Metrics", y=0.98)
plt.tight_layout(rect=[0,0,1,0.95])

fig.savefig(f"{OUT}.png", dpi=300, bbox_inches="tight")
try:
    fig.savefig(f"{OUT}.pdf", dpi=300, bbox_inches="tight")
except Exception as e:
    print(f"⚠️ PDF save failed; PNG written. Reason: {e}")
plt.close(fig)

print("✅ Fig 6B saved as", f"{OUT}.png (+ PDF if available)")


✅ Fig 6B saved as Fig6B_Overlap.png (+ PDF if available)


In [22]:
# ==== Fig 6B (Clustered) — robust dendrograms (seaborn first, matplotlib fallback) ====
# Input: shared_dependencies.csv with columns: module,gene,rho,pval,FDR
# Output: Fig6B_DependencyCorr_CLUSTERED.png (+ PDF if possible)

PATH = "shared_dependencies.csv"
OUT  = "Fig6B_DependencyCorr_CLUSTERED"
MODULES = ["SAT","IGE","IL2","MPC"]

import pandas as pd, numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ---- Font: Arial (fallback to DejaVu) ----
try:
    from matplotlib import font_manager
    if any("arial" in f.name.lower() for f in font_manager.fontManager.ttflist):
        plt.rcParams["font.family"] = "Arial"
    else:
        plt.rcParams["font.family"] = "DejaVu Sans"
except Exception:
    plt.rcParams["font.family"] = "DejaVu Sans"

# ---- Load & build module×module Spearman corr across genes ----
df = pd.read_csv(PATH)
df["module"] = df["module"].astype(str).str.upper().str.strip()
df = df[df["module"].isin(MODULES)]
mat = (df.pivot_table(index="gene", columns="module", values="rho")
         .reindex(columns=MODULES))
corr = mat.corr(method="spearman").astype(float)
labels = corr.index.tolist()

# Gray out diagonal for plotting
C = corr.copy()
np.fill_diagonal(C.values, np.nan)
vmin, vmax = -1.0, 1.0
cmap = plt.cm.get_cmap("coolwarm").copy()
cmap.set_bad(color="lightgray")  # diagonal

# Try seaborn.clustermap first (most reliable dendrograms)
try:
    import seaborn as sns
    from scipy.spatial.distance import squareform
    from scipy.cluster.hierarchy import linkage

    # Precompute average-linkage on 1 - corr (condensed)
    D_condensed = squareform((1.0 - corr).clip(0, 2).values, checks=False)
    Z = linkage(D_condensed, method="average")

    # Build clustermap. Use our precomputed linkages for both rows/cols.
    # Place a wider colorbar (x, y, width, height) in figure coords.
    cg = sns.clustermap(
        C, row_linkage=Z, col_linkage=Z, cmap=cmap, vmin=vmin, vmax=vmax,
        figsize=(8.2, 8.2), xticklabels=True, yticklabels=True,
        dendrogram_ratio=(0.22, 0.22), cbar_pos=(0.86, 0.25, 0.04, 0.5)
    )

    # Remove cbar outline
    try: cg.cax.outline.set_visible(False)
    except Exception: pass

    # Title
    cg.ax_heatmap.set_title("Pairwise Dependency Correlations (Spearman, clustered)")

    # Black numeric annotations for off-diagonals in the reordered matrix
    data2d = cg.data2d  # reordered DataFrame
    for i, row in enumerate(data2d.index):
        for j, col in enumerate(data2d.columns):
            val = data2d.iat[i, j]
            if not np.isnan(val):
                cg.ax_heatmap.text(j, i, f"{val:.2f}", ha="center", va="center",
                                   color="black", fontsize=9)

    # Save
    cg.fig.savefig(f"{OUT}.png", dpi=300, bbox_inches="tight")
    try:
        cg.fig.savefig(f"{OUT}.pdf", dpi=300, bbox_inches="tight")
    except Exception as e:
        print(f"⚠️ PDF save failed; PNG written. Reason: {e}")
    plt.close(cg.fig)
    print(f"✅ Saved {OUT}.png (+ PDF if available) [seaborn.clustermap]")
except Exception as seaborn_err:
    # ---- Matplotlib/SciPy fallback with explicit dendrogram axes ----
    from matplotlib.gridspec import GridSpec
    try:
        from scipy.spatial.distance import squareform
        from scipy.cluster.hierarchy import linkage, dendrogram
        D_condensed = squareform((1.0 - corr).clip(0, 2).values, checks=False)
        Z = linkage(D_condensed, method="average")
    except Exception:
        Z = None

    fig = plt.figure(figsize=(8.0, 8.0))
    gs = GridSpec(nrows=2, ncols=2, height_ratios=[0.30, 1.0], width_ratios=[0.30, 1.0], figure=fig)
    ax_dtop  = fig.add_subplot(gs[0, 1])
    ax_dleft = fig.add_subplot(gs[1, 0])
    ax_hm    = fig.add_subplot(gs[1, 1])

    # Order from Z (or eigen fallback)
    if Z is not None:
        leaves = dendrogram(Z, no_plot=True)["leaves"]
    else:
        w, v = np.linalg.eig(corr.values)
        leaves = list(np.argsort(np.real(v[:, np.argmax(np.real(w))])))

    labels_ord = [labels[i] for i in leaves]
    C_ord = C.values[np.ix_(leaves, leaves)]

    # Draw dendrograms with safe defaults (and then force line color/width)
    if Z is not None:
        dtop = dendrogram(Z, ax=ax_dtop, no_labels=True, color_threshold=None)
        ax_dtop.set_xticks([]); ax_dtop.set_yticks([])
        for spine in ax_dtop.spines.values(): spine.set_visible(False)
        for line in ax_dtop.get_lines():
            line.set_color("black"); line.set_linewidth(1.6)

        dleft = dendrogram(Z, ax=ax_dleft, orientation="right", no_labels=True, color_threshold=None)
        ax_dleft.set_xticks([]); ax_dleft.set_yticks([])
        for spine in ax_dleft.spines.values(): spine.set_visible(False)
        for line in ax_dleft.get_lines():
            line.set_color("black"); line.set_linewidth(1.6)
    else:
        ax_dtop.axis("off")
        ax_dleft.axis("off")

    # Heatmap
    im = ax_hm.imshow(C_ord, vmin=vmin, vmax=vmax, cmap=cmap)
    ax_hm.set_xticks(range(len(labels_ord))); ax_hm.set_yticks(range(len(labels_ord)))
    ax_hm.set_xticklabels(labels_ord); ax_hm.set_yticklabels(labels_ord)
    ax_hm.set_title("Pairwise Dependency Correlations (Spearman, clustered)")

    # Black numeric annotations (skip diagonal NaNs)
    for i in range(len(labels_ord)):
        for j in range(len(labels_ord)):
            if not np.isnan(C_ord[i, j]):
                ax_hm.text(j, i, f"{C_ord[i,j]:.2f}", ha="center", va="center",
                           color="black", fontsize=10)

    # Wider colorbar, no outline
    cbar = fig.colorbar(im, ax=ax_hm, fraction=0.06, pad=0.02)
    try: cbar.outline.set_visible(False)
    except Exception: pass
    cbar.set_label("Spearman correlation", rotation=90)

    plt.tight_layout()
    fig.savefig(f"{OUT}.png", dpi=300, bbox_inches="tight")
    try:
        fig.savefig(f"{OUT}.pdf", dpi=300, bbox_inches="tight")
    except Exception as e:
        print(f"⚠️ PDF save failed; PNG written. Reason: {e}")
    plt.close(fig)
    print(f"✅ Saved {OUT}.png (+ PDF if available) [matplotlib fallback]")
    if 'seaborn_err' in locals():
        print(f"(Seaborn path failed with: {seaborn_err})")


  cmap = plt.cm.get_cmap("coolwarm").copy()


✅ Saved Fig6B_DependencyCorr_CLUSTERED.png (+ PDF if available) [matplotlib fallback]
(Seaborn path failed with: module 'matplotlib.cm' has no attribute 'register_cmap')


In [23]:
#!/usr/bin/env python3
# Fig 6B — Pairwise dependency correlations (side dendrogram only, spaced colorbar)

import pandas as pd, numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

plt.rcParams.update({
    "font.size": 10, "axes.titlesize": 12, "xtick.labelsize": 11, "ytick.labelsize": 11,
    "figure.dpi": 300, "savefig.dpi": 300, "figure.facecolor": "white", "savefig.facecolor": "white",
})
try:
    from matplotlib import font_manager
    if any("arial" in f.name.lower() for f in font_manager.fontManager.ttflist):
        plt.rcParams["font.family"] = "Arial"
    else:
        plt.rcParams["font.family"] = "DejaVu Sans"
except Exception:
    plt.rcParams["font.family"] = "DejaVu Sans"

PATH = "shared_dependencies.csv"
OUT  = "Fig6B_DependencyCorr_SIDEonly_final"
MODULES = ["SAT","IGE","IL2","MPC"]

df = pd.read_csv(PATH)
df["module"] = df["module"].astype(str).str.upper().str.strip()
df = df[df["module"].isin(MODULES)]
mat = df.pivot_table(index="gene", columns="module", values="rho").reindex(columns=MODULES)
corr = mat.corr(method="spearman").astype(float)

C = corr.copy()
np.fill_diagonal(C.values, np.nan)
cmap = plt.cm.get_cmap("coolwarm").copy()
cmap.set_bad("lightgray")

D = squareform((1.0 - corr).clip(0, 2).values, checks=False)
Z = linkage(D, method="average")

sns.set_context("notebook")
cg = sns.clustermap(
    C,
    row_linkage=Z,
    col_linkage=Z,
    cmap=cmap, vmin=-1, vmax=1,
    figsize=(5.2, 5.2),
    xticklabels=True, yticklabels=True,
    dendrogram_ratio=(0.18, 0.001),      # only side dendrogram
    cbar_pos=(0.95, 0.3, 0.03, 0.5),     # more gap & simple 0.5 height
)

# Hide top dendrogram
cg.ax_col_dendrogram.set_visible(False)

# Title and cleanup
cg.ax_heatmap.set_title("Pairwise Module Dependency Correlations", pad=10)
cg.ax_heatmap.set_xlabel(""); cg.ax_heatmap.set_ylabel("")

# Style side dendrogram
for line in cg.ax_row_dendrogram.collections:
    line.set_color("black"); line.set_linewidth(1.1)
cg.ax_row_dendrogram.set_xticks([]); cg.ax_row_dendrogram.set_yticks([])
for s in cg.ax_row_dendrogram.spines.values(): s.set_visible(False)

# Colorbar with adjusted label
cbar = cg.ax_heatmap.collections[0].colorbar
cbar.set_label("Correlation", rotation=90)
try: cbar.outline.set_visible(False)
except Exception: pass

cg.fig.tight_layout()
cg.fig.savefig(f"{OUT}.png", bbox_inches="tight")
try: cg.fig.savefig(f"{OUT}.pdf", bbox_inches="tight")
except Exception as e: print(f"⚠️ PDF save failed; {e}")
try: cg.fig.savefig(f"{OUT}.svg", bbox_inches="tight")
except Exception: pass

print(f"✅ Saved {OUT}.png (+ PDF/SVG if available)")


AttributeError: module 'matplotlib.cm' has no attribute 'register_cmap'

In [30]:
# === Fig 6A — Module-specific dependency signatures (no seaborn) ===
# Run as a single cell or script in the directory containing your dependency CSV.

import os
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")  # safe in headless / notebooks
import matplotlib.pyplot as plt

# ---------------- USER SETTINGS ----------------
# If PATH_IN is None, will try these in order:
CANDIDATE_FILES = ["all_dependencies.csv", "shared_dependencies.csv"]
OUT_PREFIX      = "Fig6A_ModuleDependencyHeatmap"

MODULES = ["IGE", "SAT", "IL2", "MPC"]

# Curated genes per module (you can tweak this list if needed)
CURATED = {
    "IGE": ["VPS4B","UTP23","HSCB","MRPL13","MRPS18B","NDUFAF3","NDUFS6",
            "POLR1A","POLR1C","EIF3C","EIF4A1","DHX37"],
    "SAT": ["HSPA5","HSP90B1","DNAJB11","PSMC3","PSMD2","VCP","ATF4","DDIT3"],
    "IL2": ["JAK1","STAT5B","IL6ST","SOCS1","SOCS3","IRF1","IRF7","PSME1","SEC61A1"],
    "MPC": ["RHOA","RAC1","CDC42","PAK4","ARF6","RAB11FIP1","VIM","CTNNB1","PTK2"],
}

# ---------------- LOAD DATA ----------------
path_used = None
for fname in CANDIDATE_FILES:
    if os.path.exists(fname):
        path_used = fname
        break

if path_used is None:
    raise FileNotFoundError(
        f"Could not find any of: {', '.join(CANDIDATE_FILES)} "
        "in the current directory."
    )

print(f"[INFO] Using dependency file: {path_used}")
df = pd.read_csv(path_used)

# Normalize columns
for col in ["module", "gene"]:
    df[col] = df[col].astype(str).str.upper().str.strip()

df = df.drop_duplicates(subset=["module", "gene"])
df = df[df["module"].isin(MODULES)].copy()

if "rho" not in df.columns:
    raise ValueError("Dependency file must have a 'rho' column (correlation / effect).")

# ---------------- BUILD MATRIX (genes x modules) ----------------
# Flatten curated gene order
display_genes = []
for m in MODULES:
    for g in CURATED[m]:
        g_up = g.upper()
        if g_up not in display_genes:
            display_genes.append(g_up)

# Pivot to matrix
mat = df.pivot_table(index="gene", columns="module", values="rho", aggfunc="mean")

# Ensure curated genes are present as rows (NaN if not in data)
for g in display_genes:
    if g not in mat.index:
        mat.loc[g] = np.nan

# Reorder rows and columns
mat = mat.loc[display_genes, MODULES]

# Drop genes with no data in any module
mat = mat.loc[mat.notna().any(axis=1)]

print(f"[INFO] Matrix shape (genes x modules): {mat.shape}")

if mat.empty:
    raise ValueError("No data available for curated genes/modules after filtering.")

# ---------------- PLOT HEATMAP ----------------
fig_height = max(3.0, 0.3 * mat.shape[0] + 1.5)  # scale height with #genes
fig, ax = plt.subplots(figsize=(4.5, fig_height), dpi=300)

# imshow expects a 2D numpy array
im = ax.imshow(mat.values, aspect="auto", interpolation="nearest")

# Axis labels / ticks
ax.set_xticks(range(len(MODULES)))
ax.set_xticklabels(MODULES, rotation=0)
ax.set_yticks(range(mat.shape[0]))
ax.set_yticklabels(mat.index)

ax.set_xlabel("Module")
ax.set_ylabel("Gene")
ax.set_title("Fig 6A — Module-specific dependency signatures")

# Colorbar
cbar = fig.colorbar(im, ax=ax)
cbar.set_label("Dependency (ρ)")

fig.tight_layout()

# Save
png_path = f"{OUT_PREFIX}.png"
pdf_path = f"{OUT_PREFIX}.pdf"
fig.savefig(png_path, bbox_inches="tight")
fig.savefig(pdf_path, bbox_inches="tight")
plt.close(fig)

print(f"[DONE] Saved heatmap to:\n  {png_path}\n  {pdf_path}")
print(f"        Source file: {path_used}")


[INFO] Using dependency file: all_dependencies.csv
[INFO] Matrix shape (genes x modules): (37, 4)
[DONE] Saved heatmap to:
  Fig6A_ModuleDependencyHeatmap.png
  Fig6A_ModuleDependencyHeatmap.pdf
        Source file: all_dependencies.csv


In [44]:
# === 4×4 Module Relatedness Heatmap (Gene Dependencies) ===
# Uses in-memory variables only: R or wide, MODULE_ORDER_HINT, CMAP_NAME, FONT_FAMILY

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm

# ----------------- SETUP -----------------
# Module order
modules = MODULE_ORDER_HINT if 'MODULE_ORDER_HINT' in globals() else ["IGE", "SAT", "IL2", "MPC"]

# Choose font family (Arial if available / requested)
if 'FONT_FAMILY' in globals():
    family = FONT_FAMILY
else:
    family = "Arial"

def _has_font(name: str) -> bool:
    try:
        return any(name.lower() in f.name.lower() for f in fm.fontManager.ttflist)
    except Exception:
        return False

if not _has_font(family):
    family = "DejaVu Sans"

plt.rcParams.update({
    "font.family": family,
    "font.size": 8,
    "axes.titlesize": 9,
    "axes.labelsize": 9,
})

# ----------------- BUILD CORRELATION MATRIX -----------------
# Prefer existing R (already 4×4), otherwise compute from `wide`
if 'R' in globals():
    R_use = R.copy()
    # ensure right order and symmetry
    R_use = R_use.loc[modules, modules]
else:
    # `wide` should be a genes × modules matrix of dependency strengths
    if 'wide' not in globals():
        raise RuntimeError("Need either R or wide in the namespace to build the 4×4 correlation heatmap.")
    R_use = wide[modules].corr(method="spearman")

# ----------------- PLOT -----------------
fig, ax = plt.subplots(figsize=(3.2, 3.0), dpi=300)

# Symmetric color scale
vmax = float(np.nanmax(np.abs(R_use.values)))
vmin = -vmax

cmap_name = CMAP_NAME if 'CMAP_NAME' in globals() else "bwr"
im = ax.imshow(R_use.values, vmin=vmin, vmax=vmax, cmap=cmap_name)

# Axes and labels
ax.set_xticks(range(len(modules)))
ax.set_yticks(range(len(modules)))
ax.set_xticklabels(modules)
ax.set_yticklabels(modules)
plt.setp(ax.get_xticklabels(), rotation=0)

ax.set_xlabel("Module")
ax.set_ylabel("Module")
ax.set_title("Module relatedness by dependency profile")

# Annotate each cell with ρ
for i in range(len(modules)):
    for j in range(len(modules)):
        val = R_use.values[i, j]
        if np.isnan(val):
            txt = "NA"
        else:
            txt = f"{val:.2f}"
        ax.text(j, i, txt, ha="center", va="center", fontsize=7, color="black")

# Colorbar
cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("Spearman ρ")

fig.tight_layout()
out_png = "Fig6_ModuleDependencyRelatedness_4x4.png"
fig.savefig(out_png, bbox_inches="tight", dpi=300)
plt.close(fig)

print(f"✅ Saved 4×4 module dependency relatedness heatmap → {out_png}")
print("    Matrix used:\n", R_use)


✅ Saved 4×4 module dependency relatedness heatmap → Fig6_ModuleDependencyRelatedness_4x4.png
    Matrix used:
 module       IGE       SAT       IL2       MPC
module                                        
IGE     1.000000  0.015206 -0.149216 -0.321214
SAT     0.015206  1.000000 -0.442228 -0.355593
IL2    -0.149216 -0.442228  1.000000 -0.456094
MPC    -0.321214 -0.355593 -0.456094  1.000000


In [43]:
whos

Variable                  Type                       Data/Info
--------------------------------------------------------------
A                         DataFrame                               ccle_name  1<...>n[33 rows x 1449 columns]
ABS_R                     float                      0.2
ABS_R_MIN                 float                      0.15
ACH                       Pattern                    re.compile('^ACH-\\d{6}$', re.IGNORECASE)
ACH_RE                    Pattern                    re.compile('^ACH-\\d{6}$', re.IGNORECASE)
ALIGN_DIR                 PosixPath                  out_prism_align
BASE                      PosixPath                  .
C                         DataFrame                  module       SAT       IG<...>49642  0.767857       NaN
CANDIDATE_FILES           list                       n=2
CBAR_LABEL                str                        Spearman ρ
CELLNAME_MAP              dict                       n=26
CHRON                     DataFrame             

In [83]:
# === Top-10 dependency gene matrix, ordered by module (pos→neg) with top dendrogram ===
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
from scipy.cluster.hierarchy import linkage, dendrogram

# -----------------------------
# 1. Locate dependency file
# -----------------------------
candidates = ["all_dependencies.csv", "strong_dependency_signatures_by_module.csv"]
PATH = next((f for f in candidates if os.path.exists(f)), None)
if PATH is None:
    raise FileNotFoundError("No dependency file found.")

print(f"Using dependency file: {PATH}")
df = pd.read_csv(PATH)
df.columns = [c.strip().lower() for c in df.columns]

required = {"module", "gene", "rho"}
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

df["module"] = df["module"].astype(str).str.upper().str.strip()
df["gene"]   = df["gene"].astype(str).str.upper().str.strip()

modules = ["IGE", "SAT", "IL2", "MPC"]
topN = 10

# -----------------------------
# 2. Select top dependencies per module
#    (store the top-10 gene list per module)
# -----------------------------
top_genes = {}
selected = []

for m in modules:
    sub = df[df["module"] == m].copy()
    if sub.empty:
        print(f"⚠️ no rows for module {m}")
        continue

    sub["absrho"] = sub["rho"].abs()
    # Top-N by |rho|
    picks = (
        sub.sort_values("absrho", ascending=False)
           .head(topN)["gene"]
           .tolist()
    )
    top_genes[m] = picks
    selected.extend(picks)

# unique gene universe across all modules (for matrix)
selected = sorted(set(selected))
print(f"Unique genes in union of module cores: {len(selected)}")

# -----------------------------
# 3. Build (genes × modules) matrix
# -----------------------------
mat = pd.DataFrame(index=selected, columns=modules, dtype=float)
for m in modules:
    sub = df[df["module"] == m].set_index("gene")["rho"]
    # reindex on global gene set
    mat[m] = sub.reindex(selected)

mat = mat.fillna(0.0)

# -----------------------------
# 4. Cluster columns (modules) only
# -----------------------------
Z_cols = linkage(mat.T.values, method="average")
col_leaves = dendrogram(Z_cols, no_plot=True)["leaves"]
ordered_modules = [mat.columns[i] for i in col_leaves]

# -----------------------------
# 5. Row order: by module blocks, pos→neg within each block
# -----------------------------
row_order = []

for m in modules:  # ensures IGE block first, then SAT, IL2, MPC
    genes_m = top_genes.get(m, [])
    if not genes_m:
        continue

    # restrict to that module's top-10 genes
    sub = df[(df["module"] == m) & (df["gene"].isin(genes_m))].copy()
    # in case of duplicate rows per gene, keep one
    sub = sub.drop_duplicates(subset=["gene"])

    # sort by rho descending → positive core first, negative last
    sub = sub.sort_values("rho", ascending=False)

    # extend row order by gene names
    row_order.extend(sub["gene"].tolist())

print(f"Total rows in heatmap order: {len(row_order)}")

# reorder matrix
mat_ord = mat.loc[row_order, ordered_modules]

# -----------------------------
# 6. Figure with top dendrogram and ordered heatmap
# -----------------------------
fig = plt.figure(figsize=(8, 11))

from matplotlib.gridspec import GridSpec
gs = fig.add_gridspec(
    nrows=2, ncols=2,
    height_ratios=[0.8, 6.5],
    width_ratios=[1.2, 6.0],
    hspace=0.02, wspace=0.02
)

# ---- top dendrogram (modules) ----
ax_top = fig.add_subplot(gs[0, 1])
dendrogram(Z_cols, ax=ax_top, color_threshold=0, no_labels=True)
ax_top.set_xticks([])
ax_top.set_yticks([])
for spine in ax_top.spines.values():
    spine.set_visible(False)

# ---- heatmap ----
ax_hm = fig.add_subplot(gs[1, 1])

norm = TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
im = ax_hm.imshow(mat_ord.values, aspect="auto", cmap="coolwarm", norm=norm)

# x-axis: clustered modules
ax_hm.set_xticks(range(len(ordered_modules)))
ax_hm.set_xticklabels(ordered_modules, rotation=45, ha="right", fontsize=14)

# y-axis: gene labels in module-block order
ax_hm.set_yticks(range(len(row_order)))
ax_hm.set_yticklabels(row_order, fontsize=8)

ax_hm.set_ylabel("Top dependency genes by module\n(pos core → neg core)", fontsize=13)

# colorbar
cbar = fig.colorbar(im, ax=ax_hm, fraction=0.03, pad=0.02)
cbar.set_label("Dependency (ρ)", fontsize=16)

plt.tight_layout()
plt.savefig("Fig_Module_Top10_ModuleBlocks_PosToNeg.png", dpi=300)
plt.close()

print("Saved → Fig_Module_Top10_ModuleBlocks_PosToNeg.png")


Using dependency file: all_dependencies.csv
Unique genes in union of module cores: 40
Total rows in heatmap order: 40


  plt.tight_layout()


Saved → Fig_Module_Top10_ModuleBlocks_PosToNeg.png


In [79]:
# === Top-10 dependency gene matrix with top dendrogram and top module labels ===
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
from scipy.cluster.hierarchy import linkage, dendrogram

# -----------------------------
# 0. Layout knobs you can tweak
# -----------------------------
TOP_HEIGHT   = 0.8    # height for dendrogram row
BOTTOM_HEIGHT = 6.5   # height for heatmap row
LEFT_WIDTH   = 1.2    # (unused col) for potential side dendrogram
RIGHT_WIDTH  = 6.0    # width for heatmap
HSPACE       = 0.1   # vertical space between dendrogram + heatmap
WSPACE       = 0.02   # horizontal space
X_TICK_PAD   = 4      # distance of module labels from heatmap axis
Y_LABEL_FSIZE = 18    # font size for y-axis label
X_TICK_FSIZE  = 16    # font size for module labels

# -----------------------------
# 1. Locate dependency file
# -----------------------------
candidates = ["all_dependencies.csv", "strong_dependency_signatures_by_module.csv"]
PATH = next((f for f in candidates if os.path.exists(f)), None)
if PATH is None:
    raise FileNotFoundError("No dependency file found.")

print(f"Using dependency file: {PATH}")
df = pd.read_csv(PATH)
df.columns = [c.strip().lower() for c in df.columns]

required = {"module","gene","rho"}
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

df["module"] = df["module"].astype(str).str.upper().str.strip()
df["gene"]   = df["gene"].astype(str).str.upper().str.strip()

modules = ["IGE","SAT","IL2","MPC"]
topN = 10

# -----------------------------
# 2. Select top dependencies per module
# -----------------------------
selected = []
for m in modules:
    sub = df[df["module"] == m].copy()
    if sub.empty:
        print(f"⚠️ no rows for module {m}")
        continue
    sub["absrho"] = sub["rho"].abs()
    picks = sub.sort_values("absrho", ascending=False).head(topN)["gene"].tolist()
    selected.extend(picks)

selected = sorted(set(selected))
print(f"Unique genes: {len(selected)}")

# -----------------------------
# 3. Build (genes × modules) matrix
# -----------------------------
mat = pd.DataFrame(index=selected, columns=modules, dtype=float)
for m in modules:
    sub = df[df["module"] == m].set_index("gene")["rho"]
    mat[m] = sub.reindex(selected)
mat = mat.fillna(0.0)

# -----------------------------
# 4. Cluster rows and columns
# -----------------------------
# columns (modules)
Z_cols = linkage(mat.T.values, method="average")
col_leaves = dendrogram(Z_cols, no_plot=True)["leaves"]

# rows (genes)
Z_rows = linkage(mat.values, method="average")
row_leaves = dendrogram(Z_rows, no_plot=True)["leaves"]

# reorder matrix
mat_ord = mat.iloc[row_leaves, col_leaves]
ordered_modules = mat_ord.columns.tolist()
ordered_genes   = mat_ord.index.tolist()

# -----------------------------
# 5. Figure with top dendrogram + top labels
# -----------------------------
fig = plt.figure(figsize=(8, 11))

gs = fig.add_gridspec(
    nrows=2, ncols=2,
    height_ratios=[TOP_HEIGHT, BOTTOM_HEIGHT],
    width_ratios=[LEFT_WIDTH, RIGHT_WIDTH],
    hspace=HSPACE, wspace=WSPACE
)


    
# ---- top dendrogram (modules) ----
ax_top = fig.add_subplot(gs[0, 1])
dendrogram(Z_cols, ax=ax_top, color_threshold=0, no_labels=True)

# === Shrink dendrogram width (NEW) ===
box = ax_top.get_position()

ax_top.set_position([
    box.x0 + 0.007,   # move it right (10% of figure width)
    box.y0,
    box.width * 0.93,   # shrink width to 60% (adjustable)
    box.height
])

# clean spines/ticks
ax_top.set_xticks([])
ax_top.set_yticks([])
for s in ax_top.spines.values():
    s.set_visible(False)


# ---- heatmap ----
ax_hm = fig.add_subplot(gs[1, 1])

norm = TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
im = ax_hm.imshow(mat_ord.values, aspect="auto", cmap="coolwarm", norm=norm)

# Put module labels at TOP (under dendrogram)
ax_hm.xaxis.tick_top()
ax_hm.set_xticks(range(len(ordered_modules)))
ax_hm.set_xticklabels(ordered_modules, rotation=0, fontsize=X_TICK_FSIZE)
ax_hm.tick_params(axis="x", pad=X_TICK_PAD)  # move labels up/down

# no row labels
ax_hm.set_yticks([])

# BIGGER y-axis label
ax_hm.set_ylabel("Top dependency genes", fontsize=16)

cbar = fig.colorbar(im, ax=ax_hm, fraction=0.03, pad=0.02)
cbar.set_label("Dependency (ρ)", fontsize=16)

plt.tight_layout()
plt.savefig("Fig_Module_Top10_with_TopDendrogram_TOPLABELS.png", dpi=300)
plt.close()

print("Saved → Fig_Module_Top10_with_TopDendrogram_TOPLABELS.png")


Using dependency file: all_dependencies.csv
Unique genes: 40


  plt.tight_layout()


Saved → Fig_Module_Top10_with_TopDendrogram_TOPLABELS.png


In [77]:
# === Top-10 dependency gene matrix with top + side dendrograms + improved axis label ===
import os, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm
from scipy.cluster.hierarchy import linkage, dendrogram

# -----------------------------
# 1. Locate dependency file
# -----------------------------
candidates = ["all_dependencies.csv", "strong_dependency_signatures_by_module.csv"]
PATH = next((f for f in candidates if os.path.exists(f)), None)
if PATH is None:
    raise FileNotFoundError("No dependency file found.")

print(f"Using dependency file: {PATH}")
df = pd.read_csv(PATH)
df.columns = [c.strip().lower() for c in df.columns]

required = {"module","gene","rho"}
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

df["module"] = df["module"].astype(str).str.upper().str.strip()
df["gene"]   = df["gene"].astype(str).str.upper().str.strip()

modules = ["IGE","SAT","IL2","MPC"]
topN = 10

# -----------------------------
# 2. Select top dependencies per module
# -----------------------------
selected = []
for m in modules:
    sub = df[df["module"] == m].copy()
    if sub.empty:
        print(f"⚠️ no rows for module {m}")
        continue
    sub["absrho"] = sub["rho"].abs()
    picks = sub.sort_values("absrho", ascending=False).head(topN)["gene"].tolist()
    selected.extend(picks)

selected = sorted(set(selected))
print(f"Unique genes: {len(selected)}")

# -----------------------------
# 3. Build (genes × modules) matrix
# -----------------------------
mat = pd.DataFrame(index=selected, columns=modules, dtype=float)
for m in modules:
    sub = df[df["module"] == m].set_index("gene")["rho"]
    mat[m] = sub.reindex(selected)
mat = mat.fillna(0.0)

# -----------------------------
# 4. Cluster rows and columns
# -----------------------------
Z_cols  = linkage(mat.T.values, method="average")
col_leaves = dendrogram(Z_cols, no_plot=True)["leaves"]

Z_rows  = linkage(mat.values, method="average")
row_leaves = dendrogram(Z_rows, no_plot=True)["leaves"]

mat_ord = mat.iloc[row_leaves, col_leaves]
ordered_modules = mat_ord.columns.tolist()
ordered_genes   = mat_ord.index.tolist()

# -----------------------------
# 5. Figure with top dendrogram + labels + heatmap
# -----------------------------
fig = plt.figure(figsize=(8, 11))

gs = fig.add_gridspec(
    nrows=3, ncols=2,
    height_ratios=[0.8, 0.35, 6.5],
    width_ratios=[1.8, 6.0],
    hspace=0.05, wspace=0.05
)

# ---- top dendrogram ----
ax_top = fig.add_subplot(gs[0, 1])
dendrogram(Z_cols, ax=ax_top, color_threshold=0, no_labels=True)
ax_top.set_xticks([]); ax_top.set_yticks([])
for s in ax_top.spines.values():
    s.set_visible(False)

# ---- module labels under dendrogram ----
ax_labels = fig.add_subplot(gs[1, 1])
ax_labels.set_xticks(range(len(ordered_modules)))
ax_labels.set_xticklabels(ordered_modules, rotation=45, ha="right", fontsize=16)
ax_labels.set_yticks([])
for s in ax_labels.spines.values():
    s.set_visible(False)

# ---- side dendrogram (genes) ----
#ax_side = fig.add_subplot(gs[2, 0])
#dendrogram(Z_rows, ax=ax_side, orientation="right", color_threshold=0, no_labels=True)
#ax_side.set_xticks([]); ax_side.set_yticks([])
#ax_side.invert_yaxis()
#for s in ax_side.spines.values():
#   s.set_visible(False)

# ---- heatmap ----
ax_hm = fig.add_subplot(gs[2, 1])
norm = TwoSlopeNorm(vmin=-1, vcenter=0, vmax=1)
im = ax_hm.imshow(mat_ord.values, aspect="auto", cmap="coolwarm", norm=norm)

# NO row labels
ax_hm.set_yticks([])

# NO column labels here (we already show them above)
ax_hm.set_xticks([])

# **Bigger y-axis label**
ax_hm.set_ylabel("Top dependency genes", fontsize=16, labelpad=10)

cbar = fig.colorbar(im, ax=ax_hm, fraction=0.03, pad=0.02)
cbar.set_label("Dependency (ρ)", fontsize=16)

plt.tight_layout()
plt.savefig("Fig_Module_Top10_with_Top_SideDendrogram_BIGLABEL.png", dpi=300)
plt.close()

print("Saved → Fig_Module_Top10_with_Top_SideDendrogram_BIGLABEL.png")


Using dependency file: all_dependencies.csv
Unique genes: 40


  plt.tight_layout()


Saved → Fig_Module_Top10_with_Top_SideDendrogram_BIGLABEL.png


In [57]:
# === Fig 6A — 4×4 module dependency correlation with top dendrogram, no title ===
# Assumes you already have:
#   R  -> DataFrame, index = modules, columns = modules, Spearman ρ

import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.gridspec as gridspec

# ----------------- prepare correlation matrix -----------------
corr = R.copy()  # R is your existing 4×4 module–dependency correlation matrix
modules = list(corr.index)

# enforce symmetry and unit diagonal (just in case of tiny numerical asymmetries)
corr.values[np.diag_indices_from(corr)] = 1.0
corr = (corr + corr.T) / 2.0

# ----------------- hierarchical clustering (modules) -----------------
# distance = 1 − ρ
dist = 1.0 - corr.values
# convert to condensed form for linkage
dist_cond = squareform(dist, checks=False)

Z = linkage(dist_cond, method="average")
den = dendrogram(Z, no_plot=True, labels=modules)
order = den["leaves"]
modules_ord = [modules[i] for i in order]

# re-order rows AND columns with the same order
corr_ord = corr.loc[modules_ord, modules_ord]

# ----------------- plotting -----------------
plt.rcParams.update({
    "font.family": "Arial",   # will fall back to DejaVu Sans if Arial not installed
    "font.size": 10
})

fig = plt.figure(figsize=(3.5, 4.0), dpi=300)
gs = gridspec.GridSpec(
    2, 1,
    height_ratios=[0.6, 3.4],  # shallow dendrogram on top
    hspace=0.05
)

# Top: dendrogram (hierarchical tree), no title, no ticks
ax_den = fig.add_subplot(gs[0])
dendrogram(
    Z,
    labels=modules_ord,
    ax=ax_den,
    color_threshold=None,
)
ax_den.set_xticks([])
ax_den.set_yticks([])
for spine in ax_den.spines.values():
    spine.set_visible(False)

# Bottom: clustered heatmap
ax_hm = fig.add_subplot(gs[1])
im = ax_hm.imshow(
    corr_ord.values,
    vmin=-1.0,
    vmax=1.0,
    cmap="coolwarm",
    aspect="equal"
)

ax_hm.set_xticks(range(len(modules_ord)))
ax_hm.set_yticks(range(len(modules_ord)))
ax_hm.set_xticklabels(modules_ord, rotation=0)
ax_hm.set_yticklabels(modules_ord)

# no title
# optional axis labels (comment out if you want completely clean)
ax_hm.set_xlabel("Module")
ax_hm.set_ylabel("Module")

# colorbar
cbar = fig.colorbar(im, ax=ax_hm, fraction=0.046, pad=0.04)
cbar.set_label("Spearman\u00a0\u03c1")

fig.tight_layout()
fig.savefig("Fig6A_ModuleDependency_4x4_dendrogramTop.png", bbox_inches="tight", dpi=300)
fig.savefig("Fig6A_ModuleDependency_4x4_dendrogramTop.pdf", bbox_inches="tight")
plt.close(fig)


  fig.tight_layout()


In [89]:
# === Fig 6A: Strong Dependency Signatures by Module (dependencies only) ===
# Inputs:  all_dependencies.csv  with columns: module,gene,rho,pval,FDR,kind
# Output:  Fig6A_StrongDependencies.png
#
# Notes:
# - We keep only "kind == dependency"
# - We include ONLY negative correlations (dependency) by default (KEEP_NEG=True)
# - We select top N by absolute rho per module, after thresholds.

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# ------------------- CONFIG -------------------
CSV_IN = "all_dependencies.csv"
OUT_PNG = "Fig6A_StrongDependencies.png"

MODULES = ["IGE", "SAT", "IL2", "MPC"]   # display order
FDR_MAX   = 0.9   # allow slightly higher q-values
ABS_R_MIN = 0.05   # accept moderately strong dependencies
TOP_N_PER_MODULE = 6

KEEP_NEG = True   # True = keep rho <= -ABS_R_MIN (dependencies); False = keep rho >= +ABS_R_MIN (resistance)

# Fonts / style
matplotlib.rcParams.update({
    "font.family": "Arial",
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Left color strip for module groups (choose any you like)
MODULE_COLORS = {
    "IGE": "#2ca25f",   # green
    "SAT": "#ef8a62",   # salmon
    "IL2": "#67a9cf",   # blue
    "MPC": "#984ea3",   # purple
}

# ------------------- LOAD -------------------
df = pd.read_csv(CSV_IN)
# Exclude spurious or irrelevant genes
EXCLUDE_GENES = {"OR1Q1","LHFPL1","KIF4B",'ASPM','C16orf89'}       # add more if needed
df = df[~df["gene"].isin(EXCLUDE_GENES)]


required = {"module","gene","rho","FDR"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in {CSV_IN}: {sorted(missing)}")

# If a 'kind' column exists, keep only 'dependency'
if "kind" in df.columns:
    df = df[df["kind"].str.lower() == "dependency"]

# Clean gene names
df["gene"] = df["gene"].astype(str).str.strip()
df["module"] = df["module"].astype(str).str.strip()

# ------------------- FILTER PER MODULE -------------------
display_rows = []   # (module, gene)
for mod in MODULES:
    sub = df[df["module"] == mod].copy()

    # Keep only strong correlation in the dependency direction
    if KEEP_NEG:
        sub = sub[(sub["rho"] <= -ABS_R_MIN) & (sub["FDR"] <= FDR_MAX)]
        # sort by most negative first
        sub = sub.sort_values("rho")  # more negative on top
    else:
        sub = sub[(sub["rho"] >= +ABS_R_MIN) & (sub["FDR"] <= FDR_MAX)]
        sub = sub.sort_values("rho", ascending=False)

    if len(sub) == 0:
        # no hits; leave empty block for this module
        continue

    # top-N for visual balance
    sub = sub.head(TOP_N_PER_MODULE)
    display_rows.extend(list(zip(sub["module"], sub["gene"])))

# If nothing passed, bail gracefully
if not display_rows:
    raise SystemExit("No genes passed the dependency-only filters. Try relaxing FDR_MAX or ABS_R_MIN.")

# Create ordered Gene index grouped by module
row_labels = []
row_modules = []
for mod in MODULES:
    genes = [g for (m,g) in display_rows if m == mod]
    for g in genes:
        row_labels.append(g)
        row_modules.append(mod)

# Build full matrix (genes x modules) from rho (can include +/- across columns for context)
mat = pd.DataFrame(index=row_labels, columns=MODULES, dtype=float)
mat[:] = np.nan
for _, r in df[df["gene"].isin(row_labels)].iterrows():
    if r["module"] in MODULES:
        # note: rows are genes; columns are *display* modules; we keep the rho for each module column where available
        # If your CSV is one row per (module,gene), this assignment fills the matching column only.
        mat.loc[r["gene"], r["module"]] = r["rho"]

# Replace remaining NaNs with 0 for display (neutral color)
mat = mat.fillna(0.0)

# ------------------- PLOT -------------------
fig = plt.figure(figsize=(7.0, 10.0), dpi=300)
# layout: [left color band | heatmap | colorbar]
gs = fig.add_gridspec(nrows=1, ncols=3, width_ratios=[0.18, 0.92, 0.12], wspace=0.28)

# Left module color band
ax_band = fig.add_subplot(gs[0,0])
ax_band.set_xlim(0,1)
ax_band.set_ylim(0, len(row_labels))
y = 0
tick_positions = []
tick_labels = []
for mod in MODULES:
    n = sum(np.array(row_modules) == mod)
    if n == 0:
        continue
    ax_band.add_patch(plt.Rectangle((0, y), 1, n, color=MODULE_COLORS.get(mod, "#888888"), lw=0))
    tick_positions.append(y + n/2)
    tick_labels.append(mod)
    y += n

ax_band.set_xticks([])
ax_band.set_yticks(tick_positions, tick_labels)
for spine in ax_band.spines.values():
    spine.set_visible(False)
ax_band.invert_yaxis()
ax_band.set_title("")  # no title on the band

# Heatmap
ax_hm = fig.add_subplot(gs[0,1])
cmap = matplotlib.colormaps.get_cmap("coolwarm")
# symmetric around 0
norm = TwoSlopeNorm(vmin=-1.0, vcenter=0.0, vmax=1.0)
im = ax_hm.imshow(mat.values, aspect="auto", cmap=cmap, norm=norm, interpolation="nearest")

ax_hm.set_xticks(np.arange(len(MODULES)))
ax_hm.set_xticklabels(MODULES)
ax_hm.set_yticks(np.arange(len(row_labels)))
ax_hm.set_yticklabels(row_labels)
ax_hm.tick_params(axis="both", length=0)

# gridlines to separate modules vertically (optional)
y0 = 0
for mod in MODULES:
    n = sum(np.array(row_modules) == mod)
    if n == 0:
        continue
    ax_hm.hlines(y0-0.5, -0.5, len(MODULES)-0.5, colors="white", linewidth=1.2)
    y0 += n
ax_hm.hlines(y0-0.5, -0.5, len(MODULES)-0.5, colors="white", linewidth=1.2)

ax_hm.set_xlabel("Module")
ax_hm.set_ylabel("Gene")
ax_hm.set_title("Strong Dependency Signatures by Module", pad=10)

# Colorbar (with 0.5 tick spacing as you preferred earlier)
ax_cb = fig.add_subplot(gs[0,2])
cbar = fig.colorbar(im, cax=ax_cb)
cbar.set_label("Correlation", rotation=90)
cbar.set_ticks([-1.0, -0.5, 0.0, 0.5, 1.0])

fig.tight_layout()
fig.savefig(OUT_PNG, bbox_inches="tight")
print(f"✅ Wrote {OUT_PNG}\n  filters: dependency-only (ρ {'≤ -' if KEEP_NEG else '≥ +'}{ABS_R_MIN}), FDR ≤ {FDR_MAX}, top {TOP_N_PER_MODULE}/module")


  fig.tight_layout()


✅ Wrote Fig6A_StrongDependencies.png
  filters: dependency-only (ρ ≤ -0.05), FDR ≤ 0.9, top 6/module


In [93]:
# === Fig 6A: Strong Dependency Signatures by Module (dependencies only) ===
# Inputs:  all_dependencies.csv  with columns: module,gene,rho,pval,FDR,kind
# Output:  Fig6A_StrongDependencies.png
#
# Notes:
# - We keep only "kind == dependency"
# - We include ONLY negative correlations (dependency) by default (KEEP_NEG=True)
# - We select top N by absolute rho per module, after thresholds.

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import TwoSlopeNorm

# ------------------- CONFIG -------------------
CSV_IN = "all_dependencies.csv"
OUT_PNG = "Fig6A_StrongDependencies.png"

MODULES = ["IGE", "SAT", "IL2", "MPC"]   # display order
FDR_MAX   = 0.9   # allow slightly higher q-values
ABS_R_MIN = 0.05   # accept moderately strong dependencies
TOP_N_PER_MODULE = 5

KEEP_NEG = True   # True = keep rho <= -ABS_R_MIN (dependencies); False = keep rho >= +ABS_R_MIN (resistance)

# Fonts / style
matplotlib.rcParams.update({
    "font.family": "Arial",
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

# Left color strip for module groups (choose any you like)
MODULE_COLORS = {
    "IGE": "#2ca25f",   # green
    "SAT": "#ef8a62",   # salmon
    "IL2": "#67a9cf",   # blue
    "MPC": "#984ea3",   # purple
}

# ------------------- LOAD -------------------
df = pd.read_csv(CSV_IN)
# Exclude spurious or irrelevant genes
EXCLUDE_GENES = {"OR1Q1","LHFPL1","KIF4B",'ASPM','C16orf89'}       # add more if needed
df = df[~df["gene"].isin(EXCLUDE_GENES)]


required = {"module","gene","rho","FDR"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in {CSV_IN}: {sorted(missing)}")

# If a 'kind' column exists, keep only 'dependency'
if "kind" in df.columns:
    df = df[df["kind"].str.lower() == "dependency"]

# Clean gene names
df["gene"] = df["gene"].astype(str).str.strip()
df["module"] = df["module"].astype(str).str.strip()

# ------------------- FILTER PER MODULE -------------------
display_rows = []   # (module, gene)
for mod in MODULES:
    sub = df[df["module"] == mod].copy()

    # Keep only strong correlation in the dependency direction
    if KEEP_NEG:
        sub = sub[(sub["rho"] <= -ABS_R_MIN) & (sub["FDR"] <= FDR_MAX)]
        # sort by most negative first
        sub = sub.sort_values("rho")  # more negative on top
    else:
        sub = sub[(sub["rho"] >= +ABS_R_MIN) & (sub["FDR"] <= FDR_MAX)]
        sub = sub.sort_values("rho", ascending=False)

    if len(sub) == 0:
        # no hits; leave empty block for this module
        continue

    # top-N for visual balance
    sub = sub.head(TOP_N_PER_MODULE)
    display_rows.extend(list(zip(sub["module"], sub["gene"])))

# If nothing passed, bail gracefully
if not display_rows:
    raise SystemExit("No genes passed the dependency-only filters. Try relaxing FDR_MAX or ABS_R_MIN.")

# Create ordered Gene index grouped by module
row_labels = []
row_modules = []
for mod in MODULES:
    genes = [g for (m,g) in display_rows if m == mod]
    for g in genes:
        row_labels.append(g)
        row_modules.append(mod)

# Build full matrix (genes x modules) from rho (can include +/- across columns for context)
mat = pd.DataFrame(index=row_labels, columns=MODULES, dtype=float)
mat[:] = np.nan
for _, r in df[df["gene"].isin(row_labels)].iterrows():
    if r["module"] in MODULES:
        # note: rows are genes; columns are *display* modules; we keep the rho for each module column where available
        # If your CSV is one row per (module,gene), this assignment fills the matching column only.
        mat.loc[r["gene"], r["module"]] = r["rho"]

# Replace remaining NaNs with 0 for display (neutral color)
mat = mat.fillna(0.0)

# ------------------- PLOT -------------------
fig = plt.figure(figsize=(7.0, 10.0), dpi=300)
# layout: [left color band | heatmap | colorbar]
gs = fig.add_gridspec(nrows=1, ncols=3, width_ratios=[0.18, 0.92, 0.12], wspace=0.28)



# Heatmap
ax_hm = fig.add_subplot(gs[0,1])
cmap = matplotlib.colormaps.get_cmap("coolwarm")
# symmetric around 0
norm = TwoSlopeNorm(vmin=-1.0, vcenter=0.0, vmax=1.0)
im = ax_hm.imshow(mat.values, aspect="auto", cmap=cmap, norm=norm, interpolation="nearest")

ax_hm.set_xticks(np.arange(len(MODULES)))
ax_hm.set_xticklabels(MODULES)
ax_hm.set_yticks(np.arange(len(row_labels)))
ax_hm.set_yticklabels(row_labels)
ax_hm.tick_params(axis="both", length=0)

# gridlines to separate modules vertically (optional)
y0 = 0
for mod in MODULES:
    n = sum(np.array(row_modules) == mod)
    if n == 0:
        continue
    ax_hm.hlines(y0-0.5, -0.5, len(MODULES)-0.5, colors="white", linewidth=1.2)
    y0 += n
ax_hm.hlines(y0-0.5, -0.5, len(MODULES)-0.5, colors="white", linewidth=1.2)

ax_hm.set_xlabel("Module")
ax_hm.set_ylabel("Gene")
ax_hm.set_title("Strong Dependency Signatures by Module", pad=10)

# Colorbar (with 0.5 tick spacing as you preferred earlier)
ax_cb = fig.add_subplot(gs[0,2])
cbar = fig.colorbar(im, cax=ax_cb)
cbar.set_label("Correlation", rotation=90)
cbar.set_ticks([-1.0, -0.5, 0.0, 0.5, 1.0])

fig.tight_layout()
fig.savefig(OUT_PNG, bbox_inches="tight")
print(f"✅ Wrote {OUT_PNG}\n  filters: dependency-only (ρ {'≤ -' if KEEP_NEG else '≥ +'}{ABS_R_MIN}), FDR ≤ {FDR_MAX}, top {TOP_N_PER_MODULE}/module")


  fig.tight_layout()


✅ Wrote Fig6A_StrongDependencies.png
  filters: dependency-only (ρ ≤ -0.05), FDR ≤ 0.9, top 5/module


In [32]:
whos


Variable                  Type                       Data/Info
--------------------------------------------------------------
A                         DataFrame                               ccle_name  1<...>n[33 rows x 1449 columns]
ABS_R                     float                      0.2
ABS_R_MIN                 float                      0.05
ACH                       Pattern                    re.compile('^ACH-\\d{6}$', re.IGNORECASE)
ACH_RE                    Pattern                    re.compile('^ACH-\\d{6}$', re.IGNORECASE)
ALIGN_DIR                 PosixPath                  out_prism_align
BASE                      PosixPath                  .
C                         DataFrame                  module       SAT       IG<...>49642  0.767857       NaN
CANDIDATE_FILES           list                       n=2
CBAR_LABEL                str                        Correlation
CELLNAME_MAP              dict                       n=26
CHRON                     DataFrame            

In [34]:
# === Figure 6B — PRISM module–module correlation (clustered) ===
# Assumes you already have `multi` with columns ["IGE","SAT","IL2","MPC"]

import numpy as np
import matplotlib
matplotlib.use("Agg")   # safe in scripts / notebooks
import matplotlib.pyplot as plt
from matplotlib import colors, font_manager as fm, text as mtext
from scipy.cluster.hierarchy import linkage, leaves_list

# ---------------- settings ----------------
MODULES    = ["IGE", "SAT", "IL2", "MPC"]
OUT_BASENAME = "Fig6B_PRISM_ModuleCorr_CLUSTERED"
TITLE      = "PRISM Drug-Response Correlation Between Modules"
CBAR_LABEL = "Spearman ρ"
FONT_FAMILY = "Arial"   # will fall back if not installed
FIGSIZE    = (4.2, 4.2)

# ---------------- data: module–module correlation from PRISM ----------------
R = multi[MODULES].corr(method="spearman")   # 4 x 4 DataFrame

# hierarchical clustering: distance = 1 - correlation
dist = 1.0 - R.values
Z = linkage(dist, method="average")
order = leaves_list(Z)

R_ord = R.values[order][:, order]
labels_ord = [MODULES[i] for i in order]

# ---------------- plotting ----------------
norm = colors.TwoSlopeNorm(vmin=-1.0, vcenter=0.0, vmax=1.0)
cmap = plt.cm.coolwarm

fig, ax = plt.subplots(figsize=FIGSIZE, dpi=300)
im = ax.imshow(R_ord, cmap=cmap, norm=norm)

# ticks / labels
ax.set_xticks(np.arange(len(labels_ord)))
ax.set_yticks(np.arange(len(labels_ord)))
ax.set_xticklabels(labels_ord)
ax.set_yticklabels(labels_ord)
ax.set_xlabel("Module")
ax.set_ylabel("Module")
ax.set_title(TITLE, pad=10)

# rotate x labels
for lbl in ax.get_xticklabels():
    lbl.set_rotation(45)
    lbl.set_ha("right")

# annotate correlations
for i in range(len(labels_ord)):
    for j in range(len(labels_ord)):
        val = R_ord[i, j]
        ax.text(
            j, i, f"{val:.2f}",
            ha="center", va="center",
            fontsize=7, color="black"
        )

# colorbar
cbar = fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label(CBAR_LABEL)

# try to enforce Arial (fallback if unavailable)
def _has_font(name: str) -> bool:
    try:
        return any(name.lower() in f.name.lower() for f in fm.fontManager.ttflist)
    except Exception:
        return False

family = FONT_FAMILY if _has_font(FONT_FAMILY) else "DejaVu Sans"

for txt in fig.findobj(mtext.Text):
    if txt.get_text():
        try:
            txt.set_fontfamily(family)
        except Exception:
            pass

# tidy & save
fig.tight_layout()
fig.savefig(f"{OUT_BASENAME}.png", bbox_inches="tight")
fig.savefig(f"{OUT_BASENAME}.pdf", bbox_inches="tight")

print(f"✅ Saved {OUT_BASENAME}.png and .pdf")
print("Module order (clustered):", labels_ord)


  Z = linkage(dist, method="average")


✅ Saved Fig6B_PRISM_ModuleCorr_CLUSTERED.png and .pdf
Module order (clustered): ['MPC', 'IL2', 'IGE', 'SAT']


In [33]:
# === Fig 6B — PRISM module–drug correlations (no seaborn) ===

import numpy as np
import matplotlib.pyplot as plt

# Make sure rows are in a consistent order
modules_order = ["IGE", "SAT", "IL2", "MPC"]
mat = heat.copy()

# If module names are in a column instead of the index, fix that
if "module" in mat.columns:
    mat = mat.set_index("module")

mat = mat.loc[modules_order]  # re-order rows if all 4 are present

plt.rcParams["font.family"] = "Arial"  # will fall back silently if Arial not installed

fig, ax = plt.subplots(figsize=(7, 3.8), dpi=300)

# matrix values
vals = mat.values.astype(float)

im = ax.imshow(
    vals,
    aspect="auto",
    vmin=-1.0,
    vmax=1.0,
    cmap="coolwarm",
)

# ticks & labels
ax.set_yticks(np.arange(mat.shape[0]))
ax.set_yticklabels(mat.index, fontsize=9)

ax.set_xticks(np.arange(mat.shape[1]))
ax.set_xticklabels(mat.columns, rotation=90, ha="center", fontsize=6)

ax.set_xlabel("PRISM compounds", fontsize=10)
ax.set_ylabel("Module", fontsize=10)
ax.set_title("Module–Drug Response Correlations (PRISM)", fontsize=12, pad=10)

# colorbar
cbar = fig.colorbar(im, ax=ax)
cbar.set_label("Spearman ρ (module score vs PRISM AUC)", fontsize=9)
cbar.ax.tick_params(labelsize=8)

fig.tight_layout()
fig.savefig("Fig6B_PRISM_ModuleDrugHeatmap.png", bbox_inches="tight")
fig.savefig("Fig6B_PRISM_ModuleDrugHeatmap.pdf", bbox_inches="tight")

print("✅ Saved Fig 6B as PNG + PDF")


✅ Saved Fig 6B as PNG + PDF


In [35]:
# === Fig 6B — PRISM drugs vs modules (rows clustered, columns = modules) ===
# Assumes you already have:
#   heat : DataFrame of shape (4, n_drugs) with index = [IGE, SAT, IL2, MPC]
#          and columns = PRISM drug names.
# Also assumes numpy, pandas, matplotlib, scipy are installed.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
from scipy.cluster.hierarchy import linkage, leaves_list

# ---------- basic checks ----------
print("[DEBUG] heat shape:", heat.shape)
print("index:", list(heat.index))

# transpose so rows = drugs, cols = modules
data = heat.T.copy()   # shape (n_drugs, 4)
modules = list(data.columns)

# ---------- cluster drugs (rows) ----------
# use Euclidean distance on the 4D module-correlation vector per drug
row_link = linkage(data.values, method="average", metric="euclidean")
row_order = leaves_list(row_link)

data_ord = data.iloc[row_order]

# ---------- font: prefer Arial ----------
def _has_font(name: str) -> bool:
    return any(name.lower() in f.name.lower() for f in fm.fontManager.ttflist)

font_family = "Arial" if _has_font("Arial") else "DejaVu Sans"
plt.rcParams["font.family"] = font_family

# ---------- plot ----------
fig, ax = plt.subplots(figsize=(6, 8), dpi=300)

im = ax.imshow(
    data_ord.values,
    aspect="auto",
    cmap="bwr",
    vmin=-1.0,
    vmax=1.0,
)

# axes / labels
ax.set_yticks(np.arange(data_ord.shape[0]))
ax.set_yticklabels(data_ord.index, fontsize=6)  # each drug
ax.set_xticks(np.arange(len(modules)))
ax.set_xticklabels(modules, fontsize=9)

ax.set_ylabel("PRISM drug", fontsize=10)
ax.set_xlabel("Module", fontsize=10)

ax.set_title("PRISM drug sensitivity vs modules\n(drugs clustered by pattern)", fontsize=12, pad=8)

# colorbar
cbar = fig.colorbar(im, ax=ax)
cbar.set_label("Correlation (ρ)", fontsize=10)

fig.tight_layout()

OUT = "Fig6B_PRISM_ModuleDrugCorr_CLUSTERED"
fig.savefig(OUT + ".png", bbox_inches="tight")
fig.savefig(OUT + ".pdf", bbox_inches="tight")
plt.close(fig)

print(f"✅ Saved {OUT}.png and {OUT}.pdf")
print("   Rows (drugs):", data_ord.shape[0], " | Columns (modules):", data_ord.shape[1])


[DEBUG] heat shape: (4, 40)
index: ['SAT', 'IGE', 'IL2', 'MPC']
✅ Saved Fig6B_PRISM_ModuleDrugCorr_CLUSTERED.png and Fig6B_PRISM_ModuleDrugCorr_CLUSTERED.pdf
   Rows (drugs): 40  | Columns (modules): 4


In [37]:
# === Figure 6B — PRISM module–drug correlation heatmap with clustering ===
# Assumes you already have `heat` in memory:
#   - rows: modules (SAT, IGE, IL2, MPC)
#   - cols: drugs (e.g. "2,3-DCPE", "CHIR-98014", ...)

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm, colors, font_manager
from matplotlib import gridspec
from scipy.cluster.hierarchy import linkage, dendrogram, leaves_list
import pandas as pd

# ---------- 1. Prepare matrix (drugs x modules) ----------
if not isinstance(heat, pd.DataFrame):
    raise TypeError("This cell expects a pandas DataFrame `heat` already defined.")

# transpose so rows = drugs, cols = modules
mat = heat.T.copy()

# ensure column order is consistent (if you want a specific order pre-cluster)
modules = ["SAT", "IGE", "IL2", "MPC"]
modules = [m for m in modules if m in mat.columns]
mat = mat[modules]

# drop rows that are completely NA
mat = mat.loc[mat.notna().any(axis=1)]

# fill any remaining NAs with 0 (or you can use mat.mean().mean())
mat_filled = mat.fillna(0.0)

# ---------- 2. Hierarchical clustering ----------
# Row clustering (drugs)
row_link = linkage(mat_filled.values, method="average", metric="correlation")
row_order = leaves_list(row_link)

# Column clustering (modules)
col_link = linkage(mat_filled.values.T, method="average", metric="correlation")
col_order = leaves_list(col_link)

# Reorder matrix
mat_ord = mat_filled.iloc[row_order, :]
mat_ord = mat_ord.iloc[:, col_order]

row_labels = mat_ord.index.to_list()
col_labels = mat_ord.columns.to_list()

# ---------- 3. Set up figure layout ----------
fig = plt.figure(figsize=(7.5, 9), dpi=300)
gs = gridspec.GridSpec(
    2, 2,
    width_ratios=[0.3, 0.7],
    height_ratios=[0.25, 0.75],
    wspace=0.0,
    hspace=0.0,
)

ax_dtop = fig.add_subplot(gs[0, 1])   # top dendrogram (modules)
ax_dleft = fig.add_subplot(gs[1, 0])  # left dendrogram (drugs)
ax_hm = fig.add_subplot(gs[1, 1])     # main heatmap

# ---------- 4. Dendrograms ----------
# Top dendrogram (modules)
dendrogram(
    col_link,
    ax=ax_dtop,
    orientation="top",
    no_labels=True,
    color_threshold=None
)
ax_dtop.set_xticks([])
ax_dtop.set_yticks([])
ax_dtop.set_frame_on(False)

# Left dendrogram (drugs)
dendrogram(
    row_link,
    ax=ax_dleft,
    orientation="left",
    no_labels=True,
    color_threshold=None
)
ax_dleft.set_xticks([])
ax_dleft.set_yticks([])
ax_dleft.set_frame_on(False)

# ---------- 5. Heatmap ----------
vmin = np.nanmin(mat_ord.values)
vmax = np.nanmax(mat_ord.values)
# symmetric around 0 for correlations
lim = max(abs(vmin), abs(vmax))
norm = colors.TwoSlopeNorm(vmin=-lim, vcenter=0.0, vmax=lim)

im = ax_hm.imshow(
    mat_ord.values,
    aspect="auto",
    interpolation="none",
    cmap=cm.get_cmap("bwr"),
    norm=norm,
)

ax_hm.set_xticks(np.arange(len(col_labels)))
ax_hm.set_xticklabels(col_labels, rotation=0)
ax_hm.set_yticks(np.arange(len(row_labels)))
ax_hm.set_yticklabels(row_labels)

ax_hm.set_xlabel("Module")
ax_hm.set_ylabel("Drug (PRISM)")
ax_hm.set_title("PRISM module–drug correlation\n(clustered drugs and modules)", pad=8)

# ---------- 6. Colorbar ----------
cax = fig.add_axes([0.92, 0.15, 0.02, 0.7])  # [left, bottom, width, height]
cbar = fig.colorbar(im, cax=cax)
cbar.set_label("Spearman ρ (module vs PRISM AUC)")

# ---------- 7. Fonts (try Arial, fall back gracefully) ----------
def _has_font(name: str) -> bool:
    try:
        return any(name.lower() in f.name.lower() for f in font_manager.fontManager.ttflist)
    except Exception:
        return False

family = "Arial" if _has_font("Arial") else "DejaVu Sans"

for ax in [ax_hm, ax_dleft, ax_dtop]:
    for item in ([ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        try:
            item.set_fontfamily(family)
        except Exception:
            pass

try:
    ax_hm.title.set_fontfamily(family)
    cbar.ax.yaxis.label.set_fontfamily(family)
except Exception:
    pass

fig.tight_layout(rect=[0.0, 0.0, 0.9, 1.0])

out_base = "Fig6B_PRISM_ModuleDrugCorr_CLUSTERED"
fig.savefig(f"{out_base}.png", dpi=300, bbox_inches="tight")
try:
    fig.savefig(f"{out_base}.pdf", bbox_inches="tight")
except Exception:
    pass

plt.close(fig)
print(f"✅ Saved {out_base}.png (and .pdf if possible)")
print("   Rows (drugs):", len(row_labels), "Cols (modules):", len(col_labels))


  cmap=cm.get_cmap("bwr"),
  fig.tight_layout(rect=[0.0, 0.0, 0.9, 1.0])


✅ Saved Fig6B_PRISM_ModuleDrugCorr_CLUSTERED.png (and .pdf if possible)
   Rows (drugs): 40 Cols (modules): 4


In [42]:
# ----------------------------------------------------
# 3) Plot heatmap + bottom dendrogram (balanced layout)
# ----------------------------------------------------

fig = plt.figure(figsize=(8.4, 5.6), dpi=300)
gs  = gridspec.GridSpec(
    2, 2,
    height_ratios=[10, 2],   # HEATMAP gets room; dendrogram compressed
    width_ratios=[4, 0.25],
    hspace=0.02, wspace=0.05
)

# ----------------------- HEATMAP -----------------------
ax_hm = fig.add_subplot(gs[0, 0])
im = ax_hm.imshow(
    mat.values,
    aspect="auto",
    cmap="coolwarm",
    vmin=-1.0, vmax=1.0
)

ax_hm.set_yticks(np.arange(len(modules)))
ax_hm.set_yticklabels(modules, fontsize=9)

# Keep x-ticks hidden (dendrogram labels will serve this)
ax_hm.set_xticks([])

ax_hm.set_ylabel("Module", fontsize=11)
ax_hm.set_title("PRISM drug sensitivity vs. module activity", fontsize=12, pad=12)

# ----------------------- DENDROGRAM ---------------------
ax_den = fig.add_subplot(gs[1, 0])

dendrogram(
    col_link,
    labels=drugs,
    ax=ax_den,
    leaf_rotation=90,
    leaf_font_size=5,     # small = compact = no more giant white space
    color_threshold=None
)

# Clean dendrogram axis
ax_den.set_yticks([])
ax_den.set_ylabel("Drug\nhierarchy", fontsize=8, labelpad=4)

# Compress vertical whitespace under dendrogram
ymin, ymax = ax_den.get_ylim()
ax_den.set_ylim(ymin, ymax * 0.35)   # squash bottom 65%

# ----------------------- COLORBAR ------------------------
ax_cb = fig.add_subplot(gs[:, 1])
cbar = fig.colorbar(im, cax=ax_cb)
cbar.set_label("Correlation (ρ)", fontsize=10)

# ----------------------- FONTS ---------------------------
for ax in [ax_hm, ax_den, ax_cb]:
    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                 ax.get_xticklabels() + ax.get_yticklabels()):
        try:
            item.set_fontfamily("Arial")
        except Exception:
            pass

# ----------------------- SAVE ----------------------------
fig.tight_layout()
fig.savefig("Fig6B_PRISM_ModuleDrugHeatmap.fixed.png", bbox_inches="tight")
fig.savefig("Fig6B_PRISM_ModuleDrugHeatmap.fixed.pdf", bbox_inches="tight")
plt.close(fig)

print("✨ Saved fixed PRISM heatmap")


  fig.tight_layout()


✨ Saved fixed PRISM heatmap


In [27]:
# --- Figure 6C bubble-strip with clear MODULE encoding ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap

# ---------- CONFIG ----------
CSV_PATH   = "module_vs_PRISM_top15.csv"
LABEL_MODE = "truncate"   # "truncate" or "wrap"
WRAP_WIDTH = 18
TRUNC_W    = 22
Y_FONTSZ   = 7
X_FONTSZ   = 10
FIGSIZE    = (9.5, 8.2)
OUT_PNG    = "Figure6C_bubble_strip_byMODULE.png"
OUT_PDF    = "Figure6C_bubble_strip_byMODULE.pdf"

module_order  = ['IGE','SAT','IL2','MPC']
module_colors = {'IGE':'#3B6BA5', 'SAT':'#E49C3C', 'IL2':'#CA4949', 'MPC':'#60AD7D'}
# light ribbons behind each module (same hues, high alpha)
ribbon_alpha  = 0.08
# --------------------------------

def shorten(s, width=22):
    return s if len(s) <= width else s[:width-1] + "…"

def wrap(s, width=18):
    return "\n".join(textwrap.wrap(s, width=width)) if len(s) > width else s

# Load + prep
df = pd.read_csv(CSV_PATH)
df['logp'] = -np.log10(df['pval'].clip(lower=1e-300))
df['module'] = pd.Categorical(df['module'], module_order, ordered=True)

# label compactness
if LABEL_MODE == "wrap":
    df['drug_label'] = df['drug'].apply(lambda s: wrap(s, WRAP_WIDTH))
else:
    df['drug_label'] = df['drug'].apply(lambda s: shorten(s, TRUNC_W))

# Sensitivity/resistance marker
df['effect'] = np.where(df['rho'] < 0, 'sensitive (ρ<0)', 'resistant (ρ≥0)')

# sort so modules appear as contiguous blocks
df = df.sort_values(['module','rho'], ascending=[True, True])

plt.figure(figsize=FIGSIZE)
ax = sns.scatterplot(
    data=df, x='rho', y='drug_label',
    hue='module', palette=module_colors,
    style='effect', style_order=['sensitive (ρ<0)','resistant (ρ≥0)'],
    sizes=(20, 220), size='logp',
    edgecolor='k', linewidth=0.3
)

# neutral zone + axis styling
ax.axvline(0, color='gray', lw=0.8)
ax.axvspan(-0.1, 0.1, color='lightgray', alpha=0.18, zorder=-1)
ax.set_xlim(-1.1, 1.1)
ax.set_xlabel('Spearman ρ (drug–module correlation, negative = ↑ sensitivity)', fontsize=X_FONTSZ)
ax.set_ylabel('', fontsize=Y_FONTSZ)
ax.set_title('Module-specific PRISM drug–response correlations', fontsize=14, pad=10)

# Smaller y labels
ax.tick_params(axis='y', labelsize=Y_FONTSZ, pad=1)
ax.tick_params(axis='x', labelsize=X_FONTSZ)

# OPTIONAL: faint ribbons behind each module block
# Need the y positions used by Matplotlib for each categorical label
y_labels = [t.get_text() for t in ax.get_yticklabels()]
ypos = {lab: i for i, lab in enumerate(y_labels)}  # 0 at bottom

# Determine continuous ranges for each module
for m in module_order:
    labs = df.loc[df['module']==m, 'drug_label'].tolist()
    if not labs:
        continue
    ys = sorted([ypos[l] for l in labs])
    y0, y1 = ys[0]-0.5, ys[-1]+0.5
    ax.axhspan(y0, y1, color=module_colors[m], alpha=ribbon_alpha, zorder=-2)
    # module tag on right margin at band mid
    ym = (y0 + y1)/2
    ax.text(1.06, (ym - (-0.5))/(len(y_labels)), m, color=module_colors[m],
            fontsize=10, fontweight='bold', rotation=0,
            transform=ax.get_yaxis_transform())

# Layout: room for labels + legend outside
plt.subplots_adjust(left=0.36, right=0.77, top=0.92, bottom=0.08)
leg = ax.legend(bbox_to_anchor=(1.02, 1.0), loc='upper left', frameon=True, borderpad=0.6, title=None)

# Save (non-interactive safe)
plt.savefig(OUT_PNG, dpi=300, bbox_inches='tight')
plt.savefig(OUT_PDF, dpi=300, bbox_inches='tight')
print(f"Saved: {OUT_PNG} and {OUT_PDF}")


AttributeError: module 'matplotlib.cm' has no attribute 'register_cmap'

In [None]:
import networkx as nx
G = nx.Graph()
modules = ['IGE','SAT','IL2','MPC']
for m in modules:
    G.add_node(m)
for i,a in enumerate(modules):
    for b in modules[i+1:]:
        shared = set(df[df['module']==a]['drug']) & set(df[df['module']==b]['drug'])
        if shared:
            G.add_edge(a,b,weight=len(shared))
pos = nx.circular_layout(G)
w = [d['weight'] for (_,_,d) in G.edges(data=True)]
nx.draw(G,pos,width=[0.3*x for x in w],with_labels=True,node_color='white',edge_color='gray',node_size=1500)
plt.title('Shared drug associations between modules')
plt.show()


In [None]:
import plotly.graph_objects as go
modules = df['module'].unique()
moa_map = {'talazoparib':'PARP','alpelisib':'PI3K','HSP90 inhibitor':'HSP90', ...}
df['MOA'] = df['drug'].map(moa_map)
links = df.groupby(['module','MOA']).size().reset_index(name='count')
labels = list(modules) + list(links['MOA'].dropna().unique())
src = [labels.index(x) for x in links['module']]
tgt = [labels.index(x) for x in links['MOA']]
val = links['count']
go.Figure(go.Sankey(node=dict(label=labels),
                    link=dict(source=src,target=tgt,value=val))
          ).update_layout(title='Module-to-MOA drug-response mapping').show()
