
# Learning Series #5 — Multi-Omics Mini-Pipeline (Expression + Mutations + Survival)

Merge expression, mutations (MAF-like), and clinical data; build combined prognostic models and plots.


In [None]:

# !pip install pandas numpy lifelines matplotlib scipy statsmodels
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test
import os

# Matplotlib defaults for this notebook: figure DPI of 140 and a tight
# bounding box whenever a figure is saved.
plt.rcParams.update({"figure.dpi": 140, "savefig.bbox": "tight"})

# Directory that receives every table and figure written by the cells below.
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)
print("Saving outputs to:", OUT_DIR)


In [None]:

# Input tables, given as paths relative to the working directory.
clinical_path = "example_clinical.csv"
expr_path = "example_expression.csv"
maf_path = "example_maf.csv"

clinical, expr, maf = (
    pd.read_csv(path) for path in (clinical_path, expr_path, maf_path)
)

# Fail fast when a table lacks the columns the later cells rely on.
assert {"sample_id", "OS_time", "OS_event"} <= set(clinical.columns)
assert "sample_id" in expr.columns
assert {"Tumor_Sample_Barcode", "Hugo_Symbol"} <= set(maf.columns)

# Use the same sample key name in all three tables.
maf = maf.rename(columns={"Tumor_Sample_Barcode": "sample_id"})

# Keep only the samples that appear in every table.
# NOTE(review): a sample with no rows in the MAF table is excluded here, so
# samples without any recorded mutation never reach the analysis — confirm
# this is intended.
common = set(clinical["sample_id"]) & set(expr["sample_id"]) & set(maf["sample_id"])
clinical, expr, maf = (
    table.loc[table["sample_id"].isin(common)].reset_index(drop=True)
    for table in (clinical, expr, maf)
)

print("N samples:", len(common))
clinical.head()


In [None]:

# Feature engineering
# Builds the per-sample feature table `df` (clinical + mutation + expression
# features), adds the combined PD-L1 x burden grouping, and saves it as CSV.

# Mutation burden = number of distinct mutated genes per sample.
mutated = maf.groupby(["sample_id"])["Hugo_Symbol"].nunique().rename("mut_burden").reset_index()

# Binary mutated / not-mutated flags for a fixed driver panel. Drivers that
# never occur in the MAF table still get an (all-zero) column via reindex.
drivers = ["TP53","KRAS","EGFR","PIK3CA","BRAF","PTEN"]
driver_flags = (
    maf.assign(flag=1)
       .pivot_table(index="sample_id", columns="Hugo_Symbol", values="flag", aggfunc="max", fill_value=0)
       .reindex(columns=drivers, fill_value=0)
       .reset_index()
)
driver_flags.columns = ["sample_id"] + [f"{g}_mut" for g in drivers]

# Z-score every expression column in one vectorized step; inserting the
# "<gene>_z" columns one at a time fragments the frame on wide matrices.
# The small epsilon avoids division by zero for constant columns.
gene_cols = [c for c in expr.columns if c != "sample_id"]
expr_z = (expr[gene_cols] - expr[gene_cols].mean()) / (expr[gene_cols].std() + 1e-8)
df_expr = pd.concat([expr, expr_z.add_suffix("_z")], axis=1)

# Immune signature = mean z-score of CD8A and GZMB, when both are available.
if {"CD8A_z","GZMB_z"}.issubset(set(df_expr.columns)):
    df_expr["immune_sig"] = df_expr[["CD8A_z","GZMB_z"]].mean(axis=1)
else:
    df_expr["immune_sig"] = np.nan

# CD274 and MKI67 are used unconditionally below; report a clear error if absent.
missing_genes = [g for g in ("CD274", "MKI67") if g not in df_expr.columns]
if missing_genes:
    raise KeyError(f"Expression table is missing required gene column(s): {missing_genes}")

expr_feats = df_expr[["sample_id","CD274","MKI67","immune_sig"]].copy()

df = (
    clinical.merge(mutated, on="sample_id", how="left")
            .merge(driver_flags, on="sample_id", how="left")
            .merge(expr_feats, on="sample_id", how="left")
)

# Only mutation-derived features have a meaningful "missing means zero"
# reading (no MAF rows -> no mutations). A blanket fillna(0) would also
# zero-impute survival times, event indicators and clinical covariates.
mut_cols = ["mut_burden"] + [f"{g}_mut" for g in drivers]
df[mut_cols] = df[mut_cols].fillna(0)

# Samples without a survival outcome cannot enter the KM / Cox analyses;
# drop them explicitly instead of pretending their time/event is 0.
has_outcome = df[["OS_time", "OS_event"]].notna().all(axis=1)
if not has_outcome.all():
    print(f"Dropping {int((~has_outcome).sum())} sample(s) with missing OS_time/OS_event")
    df = df.loc[has_outcome].reset_index(drop=True)

# Median splits. A missing CD274 value compares False and therefore lands
# in the "LowPDL1" group.
cd274_cut = df["CD274"].median()
burden_cut = df["mut_burden"].median()

# Concatenate through pandas Series: adding NumPy unicode arrays with "+"
# raises UFuncTypeError on NumPy 1.x.
pdl1_level = pd.Series(np.where(df["CD274"] >= cd274_cut, "HighPDL1", "LowPDL1"), index=df.index)
burden_level = pd.Series(np.where(df["mut_burden"] >= burden_cut, "HighBurden", "LowBurden"), index=df.index)
df["combo_group"] = pdl1_level + "_" + burden_level

df.to_csv(os.path.join(OUT_DIR, "feature_table.csv"), index=False)
df.head()


In [None]:

# Exploratory plots
# One point per sample: mutation burden on x, CD274 expression on y.
fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(df["mut_burden"], df["CD274"])
ax.set_xlabel("Mutation burden (# genes mutated)")
ax.set_ylabel("CD274 expression")
ax.set_title("Mutation burden vs CD274")
ax.grid(True, alpha=0.3)
fig.tight_layout()

# Raster copy at 300 dpi plus a vector copy.
scatter_base = os.path.join(OUT_DIR, "scatter_burden_vs_cd274")
fig.savefig(scatter_base + ".png", dpi=300)
fig.savefig(scatter_base + ".svg")
plt.show()


In [None]:

# Oncoplot-like heatmap
# Binary sample x gene matrix (1 = at least one mutation record) for the
# 20 most frequently recorded genes, samples sorted by mutation burden.
top_genes = maf["Hugo_Symbol"].value_counts().head(20).index.tolist()

# Row order: highest mutation burden first.
order = df.sort_values("mut_burden", ascending=False)["sample_id"]

# Vectorized replacement for a row-by-row iterrows()/.loc fill: count the
# (sample, gene) pairs, clip counts to 0/1, then align to the wanted rows and
# columns. Samples or genes without any pair are filled with 0.
top_maf = maf[maf["Hugo_Symbol"].isin(top_genes)]
mat = (
    pd.crosstab(top_maf["sample_id"], top_maf["Hugo_Symbol"])
      .clip(upper=1)
      .reindex(index=order, columns=top_genes, fill_value=0)
)

# Figure size scales with the matrix. The height is capped so that the
# 300-dpi PNG stays below the 2^16-pixel-per-side limit of the Agg renderer
# (200 in * 300 dpi = 60000 px); sizes below the cap are unchanged.
MAX_FIG_HEIGHT_IN = 200
fig_width = max(6, 0.35*mat.shape[1])
fig_height = min(max(4, 0.2*mat.shape[0]), MAX_FIG_HEIGHT_IN)

fig = plt.figure(figsize=(fig_width, fig_height))
plt.imshow(mat.values, aspect="auto")
plt.xlabel("Genes")
plt.ylabel("Samples")
plt.title("Oncoplot-like heatmap (1=mutated)")
plt.xticks(ticks=np.arange(mat.shape[1]), labels=mat.columns, rotation=90)
plt.yticks([])  # one row per sample; tick labels would be unreadable
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "oncoplot_like_heatmap.png"), dpi=300)
plt.savefig(os.path.join(OUT_DIR, "oncoplot_like_heatmap.svg"))
plt.show()


In [None]:

# Survival — KM by combined groups
# One Kaplan-Meier curve per combo_group level, all drawn on the same axes.
time_col, event_col = "OS_time", "OS_event"
kmf = KaplanMeierFitter()

fig, ax = plt.subplots(figsize=(6, 4))
for label in sorted(df["combo_group"].unique()):
    members = df[df["combo_group"] == label]
    # Groups with fewer than 3 samples are not plotted.
    if len(members) >= 3:
        kmf.fit(members[time_col], members[event_col], label=label)
        kmf.plot_survival_function(ax=ax)

ax.set_title("KM: Combined PD-L1 × Mutation Burden groups")
ax.set_xlabel("Time")
ax.set_ylabel("Survival probability")
ax.grid(True, alpha=0.3)
fig.tight_layout()

km_base = os.path.join(OUT_DIR, "km_combo_group")
fig.savefig(km_base + ".png", dpi=300)
fig.savefig(km_base + ".svg")
plt.show()


In [None]:

# Cox PH — Multi-omics
# Multivariable Cox proportional-hazards model on mutation, expression and
# (when present) clinical covariates; the hazard-ratio table is saved as CSV.
cox = CoxPHFitter()

# Clinical covariates are optional: use whichever of them the table provides.
# NOTE(review): the model needs numeric covariates; if grade/stage are stored
# as text they must be encoded before fitting — confirm against the clinical table.
covariates = [c for c in ["age","grade","stage"] if c in df.columns]
model_vars = ["mut_burden","CD274","MKI67","TP53_mut","KRAS_mut","EGFR_mut"] + covariates

cox_df = df[["OS_time","OS_event"] + model_vars].copy()
# Complete-case analysis: rows with any missing model variable are removed.
cox_df = cox_df.rename(columns={"OS_time":"T","OS_event":"E"}).dropna()

# A covariate with a single distinct value (e.g. a driver gene that is never
# mutated in this cohort, whose flag column is all zeros) carries no
# information and prevents the fit from converging, so it is left out.
constant_vars = [c for c in model_vars if cox_df[c].nunique() <= 1]
if constant_vars:
    print("Dropping constant covariates (no variation, cannot be estimated):", constant_vars)
    cox_df = cox_df.drop(columns=constant_vars)

cox.fit(cox_df, duration_col="T", event_col="E")
cox.print_summary()

# Keep the coefficient, hazard ratio (exp(coef)), p-value and 95% CI of the HR.
summary = cox.summary[["coef","exp(coef)","p","exp(coef) lower 95%","exp(coef) upper 95%"]]
summary.to_csv(os.path.join(OUT_DIR, "cox_summary_multiomics.csv"))
summary.head()


In [None]:

# Forest-like plot
# One point per model term at its hazard ratio, with the 95% CI as a
# horizontal error bar and a dashed reference line at HR = 1.

# Name the index explicitly before resetting it: the summary index is not
# guaranteed to be unnamed (lifelines labels it "covariate"), in which case
# reset_index() produces no "index" column and a rename from "index" to
# "term" would silently do nothing.
S = (
    summary.rename_axis(index="term")
           .reset_index()
           .rename(columns={"exp(coef)":"HR","exp(coef) lower 95%":"HR_lower","exp(coef) upper 95%":"HR_upper"})
)
fig = plt.figure(figsize=(6, max(2, 0.5*len(S)+1)))
y = np.arange(len(S))
# xerr takes distances from the point estimate: [left lengths, right lengths].
plt.errorbar(S["HR"], y, xerr=[S["HR"]-S["HR_lower"], S["HR_upper"]-S["HR"]], fmt="o")
plt.axvline(1.0, linestyle="--")
plt.yticks(y, S["term"])
plt.xlabel("Hazard Ratio (HR)")
plt.title("Multivariable Cox — Multi-Omics")
plt.grid(True, axis="x", alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "forest_cox_multiomics.png"), dpi=300)
plt.savefig(os.path.join(OUT_DIR, "forest_cox_multiomics.svg"))
plt.show()


In [None]:

# Save a top mutated genes summary too
# Per gene:
#   mut_count - number of mutation records (rows) in the MAF table
#   n_samples - number of distinct samples carrying at least one record
#   freq      - fraction of analysed samples with the gene mutated
# freq is computed from n_samples, not mut_count: a gene hit several times in
# the same sample would otherwise inflate the frequency (possibly above 1).
top_table = (
    maf.groupby("Hugo_Symbol")["sample_id"]
       .agg(mut_count="size", n_samples="nunique")
       .sort_values("mut_count", ascending=False, kind="stable")
       .reset_index()
       .rename(columns={"Hugo_Symbol": "gene"})
)
top_table["freq"] = top_table["n_samples"] / df.shape[0]
# Original column order first; the new n_samples column is appended last.
top_table = top_table[["gene", "mut_count", "freq", "n_samples"]]
top_table.to_csv(os.path.join(OUT_DIR, "top_mutated_genes.csv"), index=False)
top_table.head(10)


In [None]:

# Template: TCGA reformat notes
# (See README notes for guidance)
# Placeholder cell: it contains no logic and does nothing when executed.
pass
