# 3. Analysis of ecDNA regions for the sample B4

In [None]:
import sys
import warnings

# helper_functions and plotting_settings are resolved through this extra search
# path, so it has to be registered BEFORE those modules are imported.
sys.path.insert(1, "../../helper_functions")

import decoupler as dc
import gseapy as gp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scikit_posthocs as sp
import seaborn as sns
import statsmodels.api as sm
from adjustText import adjust_text
from gseapy import Msigdb, dotplot
from helper_functions import select_slide
from matplotlib.patches import Patch
from plotting_settings import PLOTTING_PARAMS
from scipy.stats import fisher_exact, kruskal, mannwhitneyu, pearsonr
from sklearn.mixture import GaussianMixture as GMM
from statannotations.Annotator import Annotator
from statsmodels.stats.multitest import multipletests

# Silence library warnings for the whole notebook session.
warnings.filterwarnings("ignore")

In [None]:
def read_genes(sample):
    """Load the ecDNA gene list of one sample.

    Reads ``../../data/{sample}_ecDNA_genes.txt`` (relative to the current
    working directory) and returns one entry per line, in file order, with
    trailing whitespace removed. Blank lines are returned as empty strings.
    """
    gene_file = f"../../data/{sample}_ecDNA_genes.txt"
    with open(gene_file, "r") as handle:
        return [row.rstrip() for row in handle]

In [None]:
def highly_expressed_genes(genes, target_sample, adata):
    """Return the genes whose mean expression peaks in ``target_sample``.

    For every sample found in ``adata.obs["sample"]`` the mean expression of
    each gene is computed over that sample's observations. A gene is kept when
    the sample with the highest mean is ``target_sample``.

    Parameters
    ----------
    genes : iterable of str
        Gene names to evaluate; they are used to index the variables of
        ``adata``.
    target_sample : str
        Sample label that must have the highest mean expression.
    adata : AnnData-like
        Object exposing ``obs["sample"]`` and ``adata[mask, genes].X``.

    Returns
    -------
    list of str
        Alphabetically sorted names of the selected genes (empty when
        ``genes`` is empty).
    """
    # Freeze the order once so the table index and the column slice of adata
    # are guaranteed to line up, even when `genes` is a set.
    gene_list = list(genes)
    if not gene_list:
        return []

    samples = adata.obs["sample"].unique()

    # Allocate the table as float up front: an untyped empty frame is
    # object-dtype, and idxmax needs numeric columns.
    mean_expr = pd.DataFrame(index=gene_list, columns=samples, dtype=float)

    for sample in samples:
        sample_mask = adata.obs["sample"] == sample
        sample_expr = adata[sample_mask, gene_list].X
        # .mean on a sparse matrix yields a 2-D np.matrix; flatten it to 1-D.
        mean_expr.loc[:, sample] = np.asarray(sample_expr.mean(axis=0)).ravel()

    peaks_in_target = (mean_expr.idxmax(axis=1) == target_sample).to_numpy()
    return sorted(mean_expr.index[peaks_in_target].tolist())

In [None]:
# Load the merged expression object and the copykat metadata table
# (first CSV column is used as the index).
adata = sc.read_h5ad("../../data/merged_samples.h5ad")
metadata = pd.read_csv(
    "../../../spatial_transcriptomics/CNVs/copykat/metadata_all.csv", index_col=0
)

# Attach the metadata columns to the observations (index-aligned join) and keep
# only the observations whose "copykat.pred" value is "aneuploid".
adata.obs = adata.obs.join(metadata)
adata = adata[adata.obs["copykat.pred"] == "aneuploid", :].copy()

In [None]:
ecDNA_samples = ["B4", "B42", "B123"]

In [None]:
b42_signature = read_genes("B42")
b4_signature = read_genes("B4")
b123_signature = read_genes("B123")

In [None]:
b42_signature = set(b42_signature).intersection(adata.var.index)
b4_signature = set(b4_signature).intersection(adata.var.index)
b123_signature = set(b123_signature).intersection(adata.var.index)

In [None]:
B42_genes = highly_expressed_genes(b42_signature, "B42", adata)
B4_genes = highly_expressed_genes(b4_signature, "B4", adata)
B123_genes = highly_expressed_genes(b123_signature, "B123", adata)

In [None]:
# Long-format gene-set table for AUCell: one row per (gene, signature) pair,
# all rows tagged with the same "custom" collection.
signature_genes = {"B42": B42_genes, "B4": B4_genes, "B123": B123_genes}
net = pd.DataFrame(
    [
        (gene, "custom", signature)
        for signature, members in signature_genes.items()
        for gene in members
    ],
    columns=["genesymbol", "collection", "geneset"],
)
net

In [None]:
dc.run_aucell(adata, net=net, source="geneset", target="genesymbol")

In [None]:
adata.obs = adata.obs.join(adata.obsm["aucell_estimate"])

In [None]:
adata.write_h5ad("../../aucell_adata.h5ad", compression="gzip", compression_opts=9)

## Sample B4

In [None]:
adata = sc.read_h5ad("../../aucell_adata.h5ad")
adata.obs["sample"] = pd.Categorical(
    adata.obs["sample"],
    categories=[
        "B22",
        "B24",
        "B60",
        "B154",
        "B156",
        "B175",
        "B178",
        "B4",
        "B42",
        "B123",
    ],
    ordered=True,
)
b4 = select_slide(adata, "B4")

spatial = sc.read_h5ad("../../../spatial_transcriptomics/SpatialDE/h5ad/B4.h5ad")
b4.obs = b4.obs.join(spatial.obs[["segmentation_labels"]])

cnv_scores = pd.read_csv(
    "../../../spatial_transcriptomics/CNVs/copykat/CNV_scores.csv", index_col=0
)
b4.obs = b4.obs.join(cnv_scores)

In [None]:
data = [
    adata.obs.loc[ids, "B4"].values
    for ids in adata.obs.groupby("sample").groups.values()
]

In [None]:
H, p = kruskal(*data)
p

In [None]:
sp.posthoc_dunn(adata.obs, val_col="B4", group_col="sample", p_adjust="fdr_bh")

In [None]:
sns.set_theme(style="white", rc=PLOTTING_PARAMS)

x = "sample"
y = "B4"
# Category order of the sample column. The same x / y / order are handed to both
# the boxplot and the Annotator so the significance brackets are guaranteed to
# sit over the boxes they refer to.
order = adata.obs["sample"].unique().categories
# Compare B4 against every other sample present in the data.
pairs = [("B4", s) for s in adata.obs["sample"].unique() if s != "B4"]

ax = sns.boxplot(adata.obs, x=x, y=y, order=list(order))
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xlabel("Sample")
plt.ylabel("Enrichment of B4-specific\nsignature (AUC)")

annotator = Annotator(ax, pairs, data=adata.obs, x=x, y=y, order=order)
# only used to plot significance
annotator.configure(
    test="Mann-Whitney", text_format="star", loc="inside", comparisons_correction="BH"
)
annotator.apply_and_annotate()


plt.savefig("fig4A_boxplot_B4_signature_cohort.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
df = pd.read_csv(
    "../../../spatial_transcriptomics/CNVs/copykat/B4_filtered/B4_copykat_bins.csv",
    index_col=0,
)
df

In [None]:
# Pool every value of the copykat smoothed-expression table into a single
# column vector and fit a three-component Gaussian mixture to it.
pooled_values = df.to_numpy().reshape(-1, 1)
gmm = GMM(n_components=3, covariance_type="full", max_iter=1000, random_state=10)
gmm.fit(pooled_values)

In [None]:
mean = gmm.means_
covs = gmm.covariances_
weights = gmm.weights_

In [None]:
mean

In [None]:
# extract positions with ecDNA
# chr12: 63920267-69690820
# also part of the amplicon: chr19: 19412515-20369480, chr19: 20374358-22958358, chr19: 22959124-23734013
# but driver genes are on chr12

ecDNA = df.loc[:, df.columns.str.startswith("chr12:6")]
# capturing a bit less than entire chr12 amplicon, just to be sure
sub = ecDNA.loc[:, ecDNA.columns[60:]]

In [None]:
sub

In [None]:
sub.columns

In [None]:
# To be conservative, a spot is called amplified only when its mean log ratio
# over the selected chr12 bins exceeds the mean of the highest GMM component
# (the one for gain). The order of mixture components is not guaranteed, so the
# largest mean is looked up explicitly instead of assuming it is component 2.
gain_mean = float(np.max(mean))
amplifications = sub[sub.mean(axis=1) > gain_mean].index
amplifications

In [None]:
amplifications = [f"{code}_B4" for code in amplifications]
b4.obs["amplification"] = b4.obs.index.isin(amplifications)

#### Run `../../scripts/spatial/plot_heatmap_chr12_B4.R` to plot the amplification from figure 4A

In [None]:
clones = pd.read_csv(
    "../../../spatial_transcriptomics/CNVs/copykat/B4_filtered/B4_leiden_subclones.csv",
    index_col=0,
)
clones.index = clones.index + "_B4"
clones

In [None]:
# Attach the subclone assignment of every spot (index-aligned join).
b4.obs = b4.obs.join(clones)
# removing spots that were classified as diploid with copykat
# .copy() turns the subset into an independent object (as done for `adata`
# above); later cells add columns to b4.obs, which should not happen on a view.
b4 = b4[b4.obs["subclones"] != "diploid", :].copy()

In [None]:
sc.pl.spatial(b4, color=["amplification", "subclones", "segmentation_labels"], size=1.5)

In [None]:
# Regressing out CNV score, which is inversely correlated to ESTIMATE score
# Ordinary least squares of the B4 signature on cnv_score (plus an intercept
# added by add_constant).
X = sm.add_constant(b4.obs["cnv_score"])
model = sm.OLS(b4.obs["B4"], X).fit()

# The residuals are the part of the B4 signature not explained linearly by
# cnv_score; they are stored as the "regressed" signature.
residuals = model.resid

b4.obs["B4_regressed"] = residuals

In [None]:
# Use set_theme (as the other plotting cells do) so that every entry of
# PLOTTING_PARAMS is applied; set_style only honours style-related rc keys.
sns.set_theme(style="white", rc=PLOTTING_PARAMS)
fig, axes = plt.subplots(2, 1, figsize=(6, 8), sharex=True)
# Top panel: raw signature vs CNV score; bottom panel: regressed signature.
sns.scatterplot(b4.obs, x="cnv_score", y="B4", ax=axes[0])
sns.despine(top=True, right=True, left=False, bottom=False)
sns.scatterplot(b4.obs, x="cnv_score", y="B4_regressed", ax=axes[1])
sns.despine(top=True, right=True, left=False, bottom=False)
axes[0].set_xlabel("CNV score")
axes[1].set_xlabel("CNV score")
axes[0].set_ylabel("Enrichment of B4\necDNA signature")
axes[1].set_ylabel("Regressed enrichment of\nB4 ecDNA signature")

plt.tight_layout()
plt.savefig("suppfigure_11A_B4_ecDNA_signature.svg", dpi=300)
plt.show()
plt.close()

In [None]:
print(pearsonr(b4.obs["cnv_score"], b4.obs["B4"]))
print(pearsonr(b4.obs["cnv_score"], b4.obs["B4_regressed"]))

In [None]:
fig, axes = plt.subplots(2, 1, figsize=(6, 8), sharex=True)
sns.boxplot(
    b4.obs,
    x="amplification",
    y="B4",
    ax=axes[0],
    hue="amplification",
    palette=["#949494", "#78D3D3"],
    legend=False,
)
sns.despine(top=True, right=True, left=False, bottom=False)
sns.boxplot(
    b4.obs,
    x="amplification",
    y="B4_regressed",
    ax=axes[1],
    hue="amplification",
    palette=["#949494", "#78D3D3"],
    legend=False,
)
sns.despine(top=True, right=True, left=False, bottom=False)
axes[0].set_xlabel("Amplification")
axes[1].set_xlabel("Amplification")
axes[0].set_ylabel("Enrichment of B4\necDNA signature")
axes[1].set_ylabel("Regressed enrichment of\nB4 ecDNA signature")

plt.tight_layout()
# plt.savefig("suppfigure_11B_B4_ecDNA_signature_amplification.svg", dpi=300)
plt.show()
plt.close()

In [None]:
# One-sided Mann-Whitney U tests: is the signature (raw, then regressed)
# greater in spots flagged as amplified than in the remaining spots?
amplified_spots = b4.obs[b4.obs["amplification"] == True]
other_spots = b4.obs[b4.obs["amplification"] == False]
for score in ("B4", "B4_regressed"):
    test_result = mannwhitneyu(
        amplified_spots[score], other_spots[score], alternative="greater"
    )
    print(test_result)

In [None]:
# Median split of the regressed B4 signature: lower half "low", upper half "high".
signature_bin = pd.qcut(b4.obs["B4_regressed"], [0, 0.5, 1.0], labels=["low", "high"])
b4.obs["B4_bin"] = signature_bin
# A spot is ecDNA-positive only when both criteria hold: the regressed signature
# is in the "high" bin AND an amplification was detected for the genomic bins
# used above. Every other spot is ecDNA-negative.
is_high = b4.obs["B4_bin"] == "high"
is_amplified = b4.obs["amplification"] == True
b4.obs["ecDNA_status"] = np.where(
    is_high & is_amplified, "ecDNA-positive", "ecDNA-negative"
)

In [None]:
# sns.reset_defaults()
sc.pl.spatial(
    b4,
    color=["B4_regressed", "amplification", "B4_bin", "ecDNA_status"],
    title=[
        "B4 regressed",
        "Amplification",
        "Binarized B4 signature",
        "Final ecDNA status",
    ],
    palette=["#949494", "#78D3D3"],
    cmap="viridis",
    size=1.5,
    ncols=2,
    vmax="p99",
    show=False,
)
plt.savefig("suppfigure_11C_B4sig_amplification.svg", dpi=300)
plt.show()
plt.close()

We see that two distinct regions are ecDNA-positive. Because we cannot be sure whether ecDNA-positive and ecDNA-negative regions have comparable tumor content, we will compare these two distinct ecDNA-positive regions with each other.

In [None]:
# Segmentation labels from SpatialDE2
b4.obs["segmentation_labels"].value_counts()

### Fisher's exact test to find spatialDE clusters enriched with ecDNA

In [None]:
# Removing clusters with less than 5 observations
# (labels 8, 10 and 11 are excluded), then count spots per
# (ecDNA_status, segmentation_labels) combination.
data = (
    b4[~b4.obs["segmentation_labels"].isin([8, 10, 11]), :]
    .obs.groupby("ecDNA_status")["segmentation_labels"]
    .value_counts()
    .reset_index()
)

results = []
# Per-cluster number of ecDNA-positive / ecDNA-negative spots.
all_true = (
    data[data["ecDNA_status"] == "ecDNA-positive"]
    .groupby("segmentation_labels")["count"]
    .sum()
)
all_false = (
    data[data["ecDNA_status"] == "ecDNA-negative"]
    .groupby("segmentation_labels")["count"]
    .sum()
)
# Totals over all retained clusters.
total_true = data[data["ecDNA_status"] == "ecDNA-positive"]["count"].sum()
total_false = data[data["ecDNA_status"] == "ecDNA-negative"]["count"].sum()

# One 2x2 table per cluster:
#                 positive   negative
#   in cluster       a          b
#   elsewhere        c          d
# alternative="greater" tests for over-representation of positive spots
# inside the cluster.
for cl in data["segmentation_labels"].unique():
    a = all_true.get(cl, 0)
    b = all_false.get(cl, 0)
    c = total_true - a
    d = total_false - b
    table = [[a, b], [c, d]]
    oddsratio, pvalue = fisher_exact(table, alternative="greater")
    results.append({"segmentation_labels": cl, "pvalue": pvalue, "a": a, "b": b})

# Benjamini-Hochberg correction across the tested clusters.
results_df = pd.DataFrame(results)
results_df["padj"] = multipletests(results_df["pvalue"], method="fdr_bh")[1]

results_df.sort_values("padj")

In [None]:
sc.pl.spatial(
    b4[~b4.obs["segmentation_labels"].isin([8, 10, 11]), :],
    color=["segmentation_labels"],
    title=["SpatialDE2 cluster"],
    size=1.5,
    show=False,
    palette=sns.color_palette("colorblind"),
)
plt.savefig("fig4C_spatial_spatialde2.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
sc.pl.spatial(
    b4[~b4.obs["segmentation_labels"].isin([8, 10, 11]), :],
    color=["ecDNA_status"],
    title=["ecDNA"],
    size=1.5,
    show=False,
    palette=["#949494", "#78D3D3"],
)
plt.savefig("fig4D_spatial_spatialde2_ecDNA.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
pivoted_df = data.pivot(
    columns="ecDNA_status", index="segmentation_labels", values="count"
)
ax = pivoted_df.div(pivoted_df.sum(axis=1), axis=0).plot.bar(
    stacked=True, width=0.9, figsize=(6, 4), color=["#949494", "#78D3D3"]
)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.xticks(rotation=0)
plt.xlabel("SpatialDE2 cluster", size=12)
plt.ylabel("Fraction of spots\nper cluster", size=12)
plt.legend(title="ecDNA status", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.savefig(f"fig4E_barplot_ecdna_spatialde_v2.svg", dpi=300)
plt.show()
plt.close()

We see that clusters 0 and 1 are enriched for ecDNA.

### Fisher's exact test to find subclones enriched with ecDNA

In [None]:
# let's try the same for subclones
# Count spots per (ecDNA_status, subclones) combination.
data = b4.obs.groupby("ecDNA_status")["subclones"].value_counts().reset_index()

results = []
# Per-subclone number of ecDNA-positive / ecDNA-negative spots.
all_true = (
    data[data["ecDNA_status"] == "ecDNA-positive"].groupby("subclones")["count"].sum()
)
all_false = (
    data[data["ecDNA_status"] == "ecDNA-negative"].groupby("subclones")["count"].sum()
)
total_true = data[data["ecDNA_status"] == "ecDNA-positive"]["count"].sum()
total_false = data[data["ecDNA_status"] == "ecDNA-negative"]["count"].sum()

# 2x2 table per subclone: rows = inside / outside the subclone,
# columns = ecDNA-positive / ecDNA-negative spots.
for subclone in data["subclones"].unique():
    a = all_true.get(subclone, 0)
    b = all_false.get(subclone, 0)
    c = total_true - a
    d = total_false - b
    table = [[a, b], [c, d]]
    oddsratio, pvalue = fisher_exact(
        table, alternative="greater"
    )  # Enrichment = greater
    results.append({"subclones": subclone, "pvalue": pvalue, "a": a, "b": b})

# Adjust p-values
results_df = pd.DataFrame(results)
results_df["padj"] = multipletests(results_df["pvalue"], method="fdr_bh")[1]

# NOTE(review): .reindex() without arguments leaves the frame unchanged;
# reset_index() may have been intended — confirm.
results_df.sort_values("padj").reindex()

In [None]:
pivoted_df = data.pivot(columns="ecDNA_status", index="subclones", values="count")
ax = pivoted_df.div(pivoted_df.sum(axis=1), axis=0).plot.bar(
    stacked=True, width=0.9, figsize=(6, 4), color=["#949494", "#78D3D3"]
)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
plt.xticks(rotation=0)
plt.xlabel("Subclones")
plt.ylabel("Fraction of spots\nper cluster")
plt.legend(title="ecDNA status", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.savefig(f"B4_filtered/figures/barplot_ecdna_subclones.svg", dpi=300)
plt.show()
plt.close()

In [None]:
sc.pl.spatial(
    b4,
    color=["subclones"],
    title=["Subclones"],
    size=1.5,
    show=False,
    palette=sns.color_palette("colorblind"),
)
plt.savefig("suppfig_11D_spatial_subclones.svg", dpi=300)
plt.show()
plt.close()

Although two subclones are still enriched for ecDNA, visually the ecDNA status does not correspond to the subclones as well as it does to the SpatialDE2 segments. We will therefore focus on comparing the segments.

In [None]:
# Assign spatial clusters 0 and 1 from spatialDE2 as ecDNA enriched
b4.obs["ecDNA_enrichment"] = b4.obs["segmentation_labels"].isin([0, 1])

#### Compare cell type composition

In [None]:
cellt = [
    "CDH12_Epithelial",
    "Cycling_Epithelial",
    "KRT6A_Epithelial",
    "KRT13_Epithelial",
    "UPK_Epithelial",
    "ACTA2_Fibroblast",
    "FAP_Fibroblast",
    "PDGFRB_Fibroblast",
    "PDPN_Fibroblast",
    "Endothelial",
    "Inflam_Macrophage",
    "MHCII_Macrophage",
    "Dendritic_cell",
    "CD20_Bcell",
    "Plasma_Bcell",
    "CD8T",
    "Naive_Tcell",
    "Treg",
    "Normal_Smooth_muscle",
]

In [None]:
cellt_newnames = {
    "CDH12_Epithelial": "Immune-inflitrated epithelial",
    "Cycling_Epithelial": "Cycling epithelial",
    "KRT6A_Epithelial": "Basal epithelial",
    "KRT13_Epithelial": "Luminal epithelial (KRT13+)",
    "UPK_Epithelial": "Luminal epithelial (UPK)",
    "ACTA2_Fibroblast": "mCAF",
    "FAP_Fibroblast": "iCAF (FAP+)",
    "PDGFRB_Fibroblast": "General CAF",
    "PDPN_Fibroblast": "iCAF (PDPN+)",
    "Endothelial": "Endothelial",
    "Inflam_Macrophage": "Inflammatory macrophage",
    "MHCII_Macrophage": "Antigen-presenting macrophage",
    "Dendritic_cell": "Dendritic cell",
    "CD20_Bcell": "B cell",
    "Plasma_Bcell": "Plasma cell",
    "CD8T": "CD8+ T cell",
    "Naive_Tcell": "Naive T cell",
    "Treg": "Regulatory T cell",
    "Normal_Smooth_muscle": "Normal smooth muscle",
}

In [None]:
# add celltype abundance
celltypes = sc.read_h5ad(
    "../../../spatial_transcriptomics/cell2location/visium_model_alpha20_N20_Gouin_muscle_merged/posteriors_adata.h5ad"
)
s = select_slide(celltypes, "B4")
b4.obs = b4.obs.join(s.obs[cellt])


del celltypes

In [None]:
df = b4.obs[cellt]
df = df.div(df.sum(axis=1), axis=0)

In [None]:
df.rename(columns=cellt_newnames, inplace=True)

In [None]:
df = df.join(b4.obs[["ecDNA_enrichment", "segmentation_labels"]]).melt(
    id_vars=["ecDNA_enrichment", "segmentation_labels"]
)
df

In [None]:
# let's only compare clusters 0 and 1
# .copy() gives df_sub its own data, so the column assignments below modify
# df_sub itself instead of writing into a temporary slice of df.
df_sub = df[df["segmentation_labels"].isin([0, 1])].copy()
# Restrict the categories to the two compared clusters.
df_sub["segmentation_labels"] = pd.Categorical(
    df_sub["segmentation_labels"], categories=[0, 1]
)
df_sub["variable"] = df_sub["variable"].str.replace("_", " ")
df_sub

In [None]:
sns.boxplot(
    df_sub,
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=sns.color_palette("colorblind"),
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of cell type")
plt.legend(title="SpatialDE cluster")
plt.savefig(
    "suppfig_12B_celltypes_spatialde_segments.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()

In [None]:
df_sub.groupby("variable").apply(
    lambda x: mannwhitneyu(
        x.value[x.segmentation_labels == 0],
        x.value[x.segmentation_labels == 1],
        alternative="two-sided",
    ).pvalue
).rename("pval").reset_index().assign(
    padj=lambda x: multipletests(x.pval, method="fdr_bh")[1]
).sort_values(
    by="padj"
)

Let's focus on epithelial cells only, since stromal and immune cells make up only a minor fraction.

In [None]:
plt.figure(figsize=(6, 6))
sns.boxplot(
    df_sub[
        (df_sub["variable"].str.contains("epithelial"))
        & (df_sub["variable"] != "Immune-inflitrated epithelial")
    ],
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=sns.color_palette("colorblind"),
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of epithelial subtype")
plt.legend(title="SpatialDE cluster")
plt.savefig("fig4I_epithelial_spatialde_segments.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
df_sub[df_sub["variable"].str.contains("epithelial")].groupby("variable").apply(
    lambda x: mannwhitneyu(
        x.value[x.segmentation_labels == 0],
        x.value[x.segmentation_labels == 1],
        alternative="two-sided",
    ).pvalue
).rename("pval").reset_index().assign(
    padj=lambda x: multipletests(x.pval, method="fdr_bh")[1]
).sort_values(
    by="padj"
)

In [None]:
b4_sub = b4[b4.obs["segmentation_labels"].isin([0, 1]), :]
epi = b4_sub.obs[cellt]
# get fractional abundance per spot
epi = epi.div(epi.sum(axis=1), axis=0)
epi.columns = [f"{ct}_fraction" for ct in epi.columns]

In [None]:
# Let's check abundances of these subtypes in space
b4_sub = b4[b4.obs["segmentation_labels"].isin([0, 1]), :]
b4_sub.obs = b4_sub.obs.join(epi)
sc.pl.spatial(
    b4_sub,
    color=[
        "Cycling_Epithelial_fraction",
        "KRT6A_Epithelial_fraction",
        "KRT13_Epithelial_fraction",
        "UPK_Epithelial_fraction",
        "segmentation_labels",
    ],
    title=[
        "Cycling Epithelial",
        "Basal epithelial",
        "Luminal epithelial (KRT13+)",
        "Luminal epithelial (UPK)",
        "SpatialDE2 cluster",
    ],
    size=1.5,
    vmax="p99",
    ncols=2,
    show=False,
    cmap="viridis",
    palette=sns.color_palette("colorblind"),
)
plt.savefig("suppfig_12C_epithelial_spatialde_segments.svg", dpi=300)
plt.show()
plt.close()

Now let's check which genes are differentially expressed between these two regions.

In [None]:
b4_sub.obs["segmentation_labels"] = b4_sub.obs["segmentation_labels"].astype(str)
sc.tl.rank_genes_groups(b4_sub, "segmentation_labels", method="wilcoxon")

In [None]:
# Assemble one table for group "0" from the rank_genes_groups result:
# log fold change, adjusted p-value and test score, all indexed by the gene
# names reported for group "0".
df = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["logfoldchanges"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
df.columns = ["logFC"]
# g: adjusted p-values of group "0"
g = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["pvals_adj"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
# f: test scores of group "0"
f = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["scores"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
# Each joined frame contributes a column named "0", so the columns are
# renamed after every join.
df = df.join(g)
df.columns = ["logFC", "padj"]
df = df.join(f)
df.columns = ["logFC", "padj", "scores"]
df

In [None]:
df[(df["padj"] < 0.05) & (abs(df["logFC"]) >= 1.5)]

In [None]:
# Clamp extreme fold changes to [-6, 6]. Only the logFC column is modified:
# assigning through a row mask (df[mask] = 6) would also overwrite padj and
# scores of those genes and corrupt the y-axis values and the up/down selection.
df["logFC"] = df["logFC"].clip(lower=-6, upper=6)
plt.figure(figsize=(10, 8))
# Background layer: every gene.
plt.scatter(x=df["logFC"], y=df["padj"].apply(lambda x: -np.log10(x)), s=1)

# highlight down- or up- regulated genes
down = df[(df["logFC"] <= -1.5) & (df["padj"] <= 0.001)]
up = df[(df["logFC"] >= 1.5) & (df["padj"] <= 0.001)]

plt.scatter(
    x=down["logFC"],
    y=down["padj"].apply(lambda x: -np.log10(x)),
    s=3,
    label="Down-regulated",
    color="blue",
)
plt.scatter(
    x=up["logFC"],
    y=up["padj"].apply(lambda x: -np.log10(x)),
    s=3,
    label="Up-regulated",
    color="red",
)
# Label genes from the first and last ten rows of the table, provided they
# pass the fold-change threshold in the respective direction.
texts = []
for i, r in df[:10].iterrows():
    if r["logFC"] >= 1.5:
        texts.append(plt.text(x=r["logFC"], y=-np.log10(r["padj"]), s=i, fontsize=16))
for i, r in df[-10:].iterrows():
    if r["logFC"] <= -1.5:
        texts.append(plt.text(x=r["logFC"], y=-np.log10(r["padj"]), s=i, fontsize=16))

# Move the labels vertically to reduce overlaps.
adjust_text(
    texts,
    arrowprops=dict(arrowstyle="-", color="black", lw=0.5),
    expand_points=(1.4, 1.8),
    expand_text=(1.3, 1.6),
    force_points=0.2,
    force_text=0.3,
    lim=300,
    only_move={"points": "y", "texts": "y"},
)
plt.xlabel("logFC")
plt.ylabel("-log10(FDR)")
plt.xlim(-5, 5)
# Guide lines at the thresholds used above: |logFC| = 1.5 and FDR = 0.001.
plt.axvline(-1.5, color="grey", linestyle="--")
plt.axvline(1.5, color="grey", linestyle="--")
plt.axhline(3, color="grey", linestyle="--")
plt.savefig("fig4F_volcano_cl0_vs_cl1.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
rnk = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["scores"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
rnk.columns = [1]

In [None]:
# Pre-ranked GSEA on the gene scores in `rnk`, using the gene sets returned by
# Msigdb.get_gmt() with its default arguments.
msig = Msigdb()
pre_res = gp.prerank(
    rnk=rnk,  # single-column frame of per-gene scores, indexed by gene name
    gene_sets=msig.get_gmt(),
    threads=4,
    min_size=5,
    max_size=1000,
    permutation_num=1000,  # reduce number to speed up testing
    outdir=None,  # don't write to disk
    seed=6,
    verbose=True,  # see what's going on behind the scenes
)

In [None]:
pre_res.res2d.to_csv("../../data/GSEA_GO_cl0_cl1.csv")

In [None]:
pre_res.res2d = pre_res.res2d[pre_res.res2d["FDR q-val"] < 0.05]

In [None]:
pre_res.res2d["Term"] = (
    pre_res.res2d["Term"].str.replace("HALLMARK_", "").str.replace("_", " ")
)

In [None]:
color = "#336E23"

In [None]:
sns.reset_defaults()
ax = dotplot(
    pre_res.res2d,
    column="FDR q-val",
    cmap=plt.cm.viridis,
    size=6,
    figsize=(6, 6),
    cutoff=0.25,
    show_ring=False,
)
plt.savefig("fig4H_volcano_cl0_vs_cl1.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
toplot = pre_res.res2d[:10].sort_values("NES").reset_index()

In [None]:
# Bar colour by significance: "#336E23" when FDR q-val < 0.05, "#D59F3F" otherwise.
# NOTE(review): pre_res.res2d was already restricted to FDR q-val < 0.05 in an
# earlier cell, so every row is expected to be flagged True here.
toplot["significant"] = toplot["FDR q-val"] < 0.05
colors = toplot["significant"].map({True: "#336E23", False: "#D59F3F"})

In [None]:
sns.reset_defaults()
plt.figure(figsize=(6, 6))
plt.barh(toplot["Term"], toplot["NES"], color=colors)
plt.axvline(0, color="black", linewidth=0.8, linestyle="dashed")
plt.xlabel(
    "Cluster 1 <-- NES --> Cluster 0",
    size=16,
)
plt.yticks(size=16)
plt.xticks(size=16)
plt.title("GSEA: Cluster 0 versus cluster 1", size=16)

legend_elements = [
    Patch(facecolor="#D59F3F", label="False"),
    Patch(facecolor="#336E23", label="True"),
]
plt.legend(
    handles=legend_elements,
    title="FDR q-value\n< 0.05",
    loc="best",
    fontsize=12,
    title_fontsize=14,
)

# plt.tight_layout()
plt.savefig("fig4H_barplot_cl0_vs_cl1.svg", dpi=300, bbox_inches="tight")
plt.savefig("fig4H_barplot_cl0_vs_cl1.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
expr = sc.get.obs_df(b4_sub, keys=["segmentation_labels", "APOBEC3B", "MDM2", "RAP1B"])

In [None]:
sns.set_theme(style="white", rc=PLOTTING_PARAMS)
sns.boxplot(
    expr,
    x="segmentation_labels",
    y="APOBEC3B",
    hue="segmentation_labels",
    palette=sns.color_palette("colorblind"),
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xlabel("SpatialDE2 cluster")
plt.ylabel("Normalized APOBEC3B\nexpression")
plt.ylim(0, 3.5)

plt.savefig(
    "fig4G_boxplot_apobec3b_spatialde_clusters.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()

In [None]:
mannwhitneyu(
    expr[expr["segmentation_labels"] == "0"]["APOBEC3B"],
    expr[expr["segmentation_labels"] == "1"]["APOBEC3B"],
)

### General overview of expression

In [None]:
adata = sc.read_h5ad("../../data/merged_samples.h5ad")
adata.obs["sample"] = pd.Categorical(
    adata.obs["sample"],
    categories=[
        "B22",
        "B24",
        "B60",
        "B154",
        "B156",
        "B175",
        "B178",
        "B4",
        "B42",
        "B123",
    ],
    ordered=True,
)
expressions = sc.get.obs_df(adata, keys=["sample", "APOBEC3B", "MDM2", "RAP1B"])

In [None]:
for gene in ["APOBEC3B"]:
    plt.figure(figsize=(6, 4))
    sns.boxplot(expressions, x="sample", y=gene)
    sns.despine(top=True, right=True, left=False, bottom=False)
    plt.xlabel("Sample")
    plt.ylabel(f"Normalized {gene}\nexpression")
    plt.xticks(rotation=45)
    plt.savefig(
        f"suppfig_12E_boxplot_{gene}_expression_cohort.svg",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close()

## ecDNA vs non-ecDNA

In [None]:
# quick check to see which clusters have the highest CNV score
sns.boxplot(
    b4[~b4.obs["segmentation_labels"].isin([8, 10, 11]), :].obs,
    x="segmentation_labels",
    y="cnv_score",
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xlabel("SpatialDE2 cluster")
plt.ylabel("CNV score")

plt.tight_layout()

plt.savefig(
    "suppfig_12A_boxplot_B4_cnv_score_spatialDE.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()

In [None]:
# Flag spots that belong to SpatialDE2 clusters 0 and 2, the two clusters
# compared in this section.
# NOTE(review): the column keeps the name "ecDNA_enrichment", under which the
# earlier analysis flagged clusters 0 and 1 — confirm that overwriting it with
# clusters 0 and 2 is intended.
b4.obs["ecDNA_enrichment"] = b4.obs["segmentation_labels"].isin([0, 2])

In [None]:
cellt = [
    "CDH12_Epithelial",
    "Cycling_Epithelial",
    "KRT6A_Epithelial",
    "KRT13_Epithelial",
    "UPK_Epithelial",
    "ACTA2_Fibroblast",
    "FAP_Fibroblast",
    "PDGFRB_Fibroblast",
    "PDPN_Fibroblast",
    "Endothelial",
    "Inflam_Macrophage",
    "MHCII_Macrophage",
    "Dendritic_cell",
    "CD20_Bcell",
    "Plasma_Bcell",
    "CD8T",
    "Naive_Tcell",
    "Treg",
    "Normal_Smooth_muscle",
]

In [None]:
df = b4.obs[cellt]
df = df.div(df.sum(axis=1), axis=0)
df.rename(columns=cellt_newnames, inplace=True)

In [None]:
df = df.join(b4.obs[["ecDNA_enrichment", "segmentation_labels"]]).melt(
    id_vars=["ecDNA_enrichment", "segmentation_labels"]
)
df

In [None]:
# let's only compare clusters 0 and 2
# .copy() gives df_sub its own data, so the column assignments below modify
# df_sub itself instead of writing into a temporary slice of df.
df_sub = df[df["segmentation_labels"].isin([0, 2])].copy()
# Restrict the categories to the two compared clusters.
df_sub["segmentation_labels"] = pd.Categorical(
    df_sub["segmentation_labels"], categories=[0, 2]
)
df_sub["variable"] = df_sub["variable"].str.replace("_", " ")
df_sub

In [None]:
sns.boxplot(
    df_sub,
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=[sns.color_palette("colorblind")[0], sns.color_palette("colorblind")[2]],
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of cell type")
plt.legend(title="SpatialDE cluster")

plt.savefig(
    "boxplot_celltypes_spatialde_segments_cl0_cl2.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()

In [None]:
df_sub.groupby("variable").apply(
    lambda x: mannwhitneyu(
        x.value[x.segmentation_labels == 0],
        x.value[x.segmentation_labels == 2],
        alternative="two-sided",
    ).pvalue
).rename("pval").reset_index().assign(
    padj=lambda x: multipletests(x.pval, method="fdr_bh")[1]
).sort_values(
    by="padj"
)

Let's focus on epithelial cells only, since stromal and immune cells are almost absent.

In [None]:
plt.figure(figsize=(6, 6))
sns.boxplot(
    df_sub[
        (df_sub["variable"].str.contains("epithelial"))
        & (df_sub["variable"] != "Immune-inflitrated epithelial")
    ],
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=[sns.color_palette("colorblind")[0], sns.color_palette("colorblind")[2]],
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of epithelial subtype")
plt.legend(title="SpatialDE cluster")

plt.savefig(
    "fig4L_boxplot_epithelial_spatialde_segments_cl0_cl2.svg",
    dpi=300,
    bbox_inches="tight",
)
plt.show()
plt.close()

In [None]:
df_sub[df_sub["variable"].str.contains("epithelial")].groupby("variable").apply(
    lambda x: mannwhitneyu(
        x.value[x.segmentation_labels == 0],
        x.value[x.segmentation_labels == 2],
        alternative="two-sided",
    ).pvalue
).rename("pval").reset_index().assign(
    padj=lambda x: multipletests(x.pval, method="fdr_bh")[1]
).sort_values(
    by="padj"
)

In [None]:
b4_sub = b4[b4.obs["segmentation_labels"].isin([0, 2]), :]

Now let's check which genes are differentially expressed between these two regions.

In [None]:
b4_sub.obs["segmentation_labels"] = b4_sub.obs["segmentation_labels"].astype(str)
sc.tl.rank_genes_groups(b4_sub, "segmentation_labels", method="wilcoxon")

In [None]:
df = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["logfoldchanges"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
df.columns = ["logFC"]
g = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["pvals_adj"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
f = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["scores"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
df = df.join(g)
df.columns = ["logFC", "padj"]
df = df.join(f)
df.columns = ["logFC", "padj", "scores"]
df

In [None]:
df[(df["padj"] < 0.05) & (abs(df["logFC"]) >= 1.5)]

In [None]:
sns.set_theme(style="white", rc=PLOTTING_PARAMS)
# Clamp extreme fold changes to [-6, 6]. Only the logFC column is modified:
# assigning through a row mask (df[mask] = 6) would also overwrite padj and
# scores of those genes and corrupt the y-axis values and the up/down selection.
df["logFC"] = df["logFC"].clip(lower=-6, upper=6)
plt.figure(figsize=(10, 8))
# Background layer: every gene.
plt.scatter(x=df["logFC"], y=df["padj"].apply(lambda x: -np.log10(x)), s=1)

# highlight down- or up- regulated genes
down = df[(df["logFC"] <= -1.5) & (df["padj"] <= 0.001)]
up = df[(df["logFC"] >= 1.5) & (df["padj"] <= 0.001)]

plt.scatter(
    x=down["logFC"],
    y=down["padj"].apply(lambda x: -np.log10(x)),
    s=3,
    label="Down-regulated",
    color="blue",
)
plt.scatter(
    x=up["logFC"],
    y=up["padj"].apply(lambda x: -np.log10(x)),
    s=3,
    label="Up-regulated",
    color="red",
)
# Label genes from the first 20 and last 52 rows of the table, provided they
# pass the fold-change threshold in the respective direction.
texts = []
for i, r in df[:20].iterrows():
    if r["logFC"] >= 1.5:
        texts.append(plt.text(x=r["logFC"], y=-np.log10(r["padj"]), s=i, size=16))
for i, r in df[-52:].iterrows():
    if r["logFC"] <= -1.5:
        texts.append(plt.text(x=r["logFC"], y=-np.log10(r["padj"]), s=i, size=16))

# Move the labels vertically to reduce overlaps.
adjust_text(
    texts,
    arrowprops=dict(arrowstyle="-", color="black", lw=0.5),
    expand_points=(1.4, 1.8),
    expand_text=(1.3, 1.6),
    force_points=0.2,
    force_text=0.3,
    lim=300,
    only_move={"points": "y", "texts": "y"},
)
plt.xlabel("logFC")
plt.ylabel("-log10(FDR)")
plt.xlim(-5, 5)
# Guide lines at the thresholds used above: |logFC| = 1.5 and FDR = 0.001.
plt.axvline(-1.5, color="grey", linestyle="--")
plt.axvline(1.5, color="grey", linestyle="--")
plt.axhline(3, color="grey", linestyle="--")
plt.savefig("fig4J_volcano_cl0_vs_cl2.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
rnk = pd.DataFrame(
    b4_sub.uns["rank_genes_groups"]["scores"],
    index=pd.DataFrame(b4_sub.uns["rank_genes_groups"]["names"])["0"],
)[["0"]]
rnk.columns = [1]

In [None]:
# Pre-ranked GSEA on the gene scores in `rnk`. get_gmt is an instance method,
# so an Msigdb object has to be created first; calling it on the class itself
# fails because no instance is bound.
pre_res = gp.prerank(
    rnk=rnk,
    gene_sets=Msigdb().get_gmt(),
    threads=4,
    min_size=5,
    max_size=1000,
    permutation_num=1000,
    outdir=None,  # keep results in memory only
    seed=6,
    verbose=True,
)

In [None]:
pre_res.res2d.to_csv("../../data/GSEA_GO_cl0_cl2.csv")

In [None]:
pre_res.res2d = pre_res.res2d[pre_res.res2d["FDR q-val"] < 0.05]

In [None]:
pre_res.res2d["Term"] = (
    pre_res.res2d["Term"].str.replace("HALLMARK_", "").str.replace("_", " ")
)

In [None]:
sns.reset_defaults()
ax = dotplot(
    pre_res.res2d,
    column="FDR q-val",
    cmap=plt.cm.viridis,
    size=6,
    figsize=(6, 6),
    cutoff=0.25,
    show_ring=False,
)
plt.show()
plt.close()

In [None]:
toplot = pre_res.res2d[:10].sort_values("NES").reset_index()

In [None]:
toplot["significant"] = toplot["FDR q-val"] < 0.05
colors = toplot["significant"].map({True: "#336E23", False: "#D59F3F"})

In [None]:
sns.reset_defaults()
plt.figure(figsize=(6, 6))
plt.barh(toplot["Term"], toplot["NES"], color=colors)
plt.axvline(0, color="black", linewidth=0.8, linestyle="dashed")
plt.xlabel(
    "Cluster 2 <-- NES --> Cluster 0",
    size=16,
)
plt.yticks(size=16)
plt.xticks(size=16)
plt.title("GSEA: Cluster 0 versus cluster 2", size=16)

# Custom legend
legend_elements = [
    Patch(facecolor="#D59F3F", label="False"),
    Patch(facecolor="#336E23", label="True"),
]
plt.legend(
    handles=legend_elements,
    title="FDR q-value\n< 0.05",
    loc="best",
    fontsize=12,
    title_fontsize=14,
)

# plt.tight_layout()
plt.savefig("fig4K_barplot_cl0_vs_cl2.svg", dpi=300, bbox_inches="tight")
plt.savefig("fig4K_barplot_cl0_vs_cl2.png", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

## General sample-specific signatures

In [None]:
adata.obs["sample"] = pd.Categorical(
    adata.obs["sample"],
    categories=[
        "B22",
        "B24",
        "B60",
        "B154",
        "B156",
        "B175",
        "B178",
        "B4",
        "B42",
        "B123",
    ],
    ordered=True,
)
expressions = sc.get.obs_df(adata, keys=["sample", "APOBEC3B", "MDM2", "RAP1B"])

In [None]:
data = [
    expressions.loc[ids, "APOBEC3B"].values
    for ids in expressions.groupby("sample").groups.values()
]

In [None]:
H, p = kruskal(*data)
p

In [None]:
sp.posthoc_dunn(expressions, val_col="APOBEC3B", group_col="sample", p_adjust="fdr_bh")

In [None]:
x = "sample"
y = "APOBEC3B"
# Category order of the sample column. The same x / y / order are handed to both
# the boxplot and the Annotator so the significance brackets are guaranteed to
# sit over the boxes they refer to.
order = expressions["sample"].unique().categories
# Compare B4 against every other sample present in the data.
pairs = [("B4", s) for s in expressions["sample"].unique() if s != "B4"]

ax = sns.boxplot(expressions, x=x, y=y, order=list(order))
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xlabel("Sample")
plt.ylabel("Normalized APOBEC3B\nexpression")
plt.xticks(rotation=45)


annotator = Annotator(ax, pairs, data=expressions, x=x, y=y, order=order)
# Mann-Whitney tests with Benjamini-Hochberg correction, shown as stars.
annotator.configure(
    test="Mann-Whitney", text_format="star", loc="inside", comparisons_correction="BH"
)
annotator.apply_and_annotate()

plt.ylim(0, 5)


plt.savefig(
    "suppfigure_12E_boxplot_APOBEC3B_signature_cohort.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()