# 5. Analysis of ecDNA regions for the sample B123

In [None]:
import sys
import warnings

# The helper directory has to be on sys.path BEFORE the project-local imports
# below are executed; inserting it afterwards (as the cell originally did) has
# no effect on those imports.
# NOTE(review): presumably `helper_functions` and `plotting_settings` live in
# ../../helper_functions — confirm the directory layout.
sys.path.insert(1, "../../helper_functions")

import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import scanpy as sc  # noqa: E402
import scikit_posthocs as sp  # noqa: E402
import seaborn as sns  # noqa: E402
import statsmodels.api as sm  # noqa: E402
from helper_functions import select_slide  # noqa: E402
from plotting_settings import PLOTTING_PARAMS  # noqa: E402
from scipy.stats import fisher_exact, kruskal, mannwhitneyu, pearsonr  # noqa: E402
from sklearn.mixture import GaussianMixture as GMM  # noqa: E402
from statannotations.Annotator import Annotator  # noqa: E402
from statsmodels.stats.multitest import multipletests  # noqa: E402

# Silence all warnings for the rest of the notebook (this also hides pandas /
# anndata copy-vs-view warnings).
warnings.filterwarnings("ignore")

In [None]:
# Fixed display order of the cohort samples (B123 last).
sample_order = [
    "B22",
    "B24",
    "B60",
    "B154",
    "B156",
    "B175",
    "B178",
    "B4",
    "B42",
    "B123",
]

# Load the cohort AnnData and turn `sample` into an ordered categorical.
adata = sc.read_h5ad("../../aucell_adata.h5ad")
adata.obs["sample"] = pd.Categorical(
    adata.obs["sample"], categories=sample_order, ordered=True
)

# Project helper; presumably returns the part of `adata` belonging to B123.
b123 = select_slide(adata, "B123")

# Attach the `segmentation_labels` column of the B123 SpatialDE object
# (index-aligned left join).
spatial = sc.read_h5ad("../../../spatial_transcriptomics/SpatialDE/h5ad/B123.h5ad")
segmentation = spatial.obs[["segmentation_labels"]]
b123.obs = b123.obs.join(segmentation)

# Attach the copykat CNV score table (index-aligned left join).
cnv_scores = pd.read_csv(
    "../../../spatial_transcriptomics/CNVs/copykat/CNV_scores.csv",
    index_col=0,
)
b123.obs = b123.obs.join(cnv_scores)

In [None]:
# One array of `B123` column values per sample group, in groupby order.
sample_groups = adata.obs.groupby("sample").groups
data = [adata.obs.loc[ids, "B123"].values for ids in sample_groups.values()]

In [None]:
# Kruskal-Wallis H-test across the per-sample arrays collected in `data`.
H, p = kruskal(*data)
# Show the p-value as the cell output.
p

In [None]:
# Dunn's post-hoc pairwise comparison of the `B123` column between samples,
# with Benjamini-Hochberg ("fdr_bh") adjustment of the p-values.
sp.posthoc_dunn(adata.obs, val_col="B123", group_col="sample", p_adjust="fdr_bh")

In [None]:
sns.set_theme(style="white", rc=PLOTTING_PARAMS)

# Plot and annotator must be configured with the same columns and the same
# category order, otherwise the significance brackets can end up over the
# wrong boxes. The shared settings are therefore defined once and passed to
# both.
x = "sample"
y = "B123"
order = list(adata.obs["sample"].unique().categories)
# Compare B123 against every other sample present in the data.
pairs = [("B123", s) for s in adata.obs["sample"].unique() if s != "B123"]

ax = sns.boxplot(adata.obs, x=x, y=y, order=order)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xlabel("Sample")
plt.ylabel("Enrichment of B123-specific\nsignature (AUC)")

annotator = Annotator(ax, pairs, data=adata.obs, x=x, y=y, order=order)
# only used to plot significance
annotator.configure(
    test="Mann-Whitney", text_format="star", loc="inside", comparisons_correction="BH"
)
annotator.apply_and_annotate()


plt.savefig(
    "suppfig_14A_boxplot_B123_signature_cohort.svg", dpi=300, bbox_inches="tight"
)
plt.show()
plt.close()

In [None]:
# Leiden subclone assignment derived from copykat for B123.
clones_csv = (
    "../../../spatial_transcriptomics/CNVs/copykat/B123_filtered/B123_leiden_subclones.csv"
)
clones = pd.read_csv(clones_csv, index_col=0)
# Suffix every index entry with "_B123"; presumably this matches the barcode
# naming of `b123.obs`, which this table is joined to later.
clones.index = clones.index + "_B123"
clones

In [None]:
# copykat per-bin table for B123 (rows are indexed by the first CSV column).
bins_csv = (
    "../../../spatial_transcriptomics/CNVs/copykat/B123_filtered_n15/B123_copykat_bins.csv"
)
df = pd.read_csv(bins_csv, index_col=0)
df

In [None]:
# Three-component Gaussian mixture, fitted on every value of the copykat
# table pooled into a single column vector.
gmm = GMM(n_components=3, covariance_type="full", max_iter=1000, random_state=10)
pooled_values = df.to_numpy().reshape(-1, 1)
gmm.fit(pooled_values)

In [None]:
# Keep the fitted mixture parameters under short names.
mean = gmm.means_  # component means
covs = gmm.covariances_  # component covariances
weights = gmm.weights_  # mixing weights

In [None]:
# Inspect the component means; one of them is used below as the
# amplification threshold.
mean

In [None]:
# Extract positions with ecDNA. Intervals (all on chr6):
# chr6: 20311786-20720740
# chr6: 20721496-20721519
# chr6: 20725749-20779105
# chr6: 20779106-21051598
# chr6: 22738563-22994841
# chr6: 23552546-23576485
# chr6: 21410549-21793884
# chr6: 24775266-24791722
# chr6: 22941824-23000763


# Keep only the columns whose name starts with "chr6:".
ecDNA = df.loc[:, df.columns.str.startswith("chr6:")]
# Capture a bit less than the entire chr6 amplicon, just to be sure.
# NOTE(review): the window is selected by column position (260-289 of the chr6
# columns), not by coordinate; presumably it maps onto the intervals listed
# above — confirm against the column names.
sub = ecDNA.loc[:, ecDNA.columns[260:290]]
sub

In [None]:
# To be conservative, a row counts as amplified only if its mean log ratio
# over the selected chr6 bins exceeds the mean of the highest mixture
# component (the one for gain).
# The order of the fitted components is arbitrary, so the highest component is
# looked up with max() instead of relying on a fixed index.
gain_threshold = mean.max()
amplifications = sub[sub.mean(axis=1) > gain_threshold].index
amplifications

In [None]:
# Append the "_B123" suffix to each amplified row label, then flag every spot
# of b123 whose index appears in that list.
amplifications = list(map("{}_B123".format, amplifications))
is_amplified = b123.obs.index.isin(amplifications)
b123.obs["amplification"] = is_amplified

In [None]:
# Add the subclone assignment (index-aligned left join).
b123.obs = b123.obs.join(clones)
# Remove spots that copykat classified as diploid. Spots without a subclone
# entry (NaN after the join) pass this comparison and are kept.
# .copy() turns the subset into an independent AnnData: later cells add
# columns to b123.obs, which should not happen on a view of the parent object.
b123 = b123[b123.obs["subclones"] != "diploid", :].copy()

In [None]:
# Spatial overview of the signature score, the amplification flag and the
# segmentation labels.
overview_keys = ["B123", "amplification", "segmentation_labels"]
sc.pl.spatial(b123, color=overview_keys, cmap="viridis", vmax="p99", size=1.5)

In [None]:
# Regress the CNV score out of the B123 signature (the CNV score is, per the
# authors, inversely correlated to the ESTIMATE score): fit
# B123 ~ const + cnv_score by ordinary least squares and keep the residuals.
X = sm.add_constant(b123.obs["cnv_score"])
model = sm.OLS(endog=b123.obs["B123"], exog=X).fit()

residuals = model.resid
b123.obs["B123_regressed"] = residuals

In [None]:
# CNV score against the raw (top) and regressed (bottom) B123 signature.
fig, axes = plt.subplots(2, 1, figsize=(6, 8), sharex=True)
panels = [
    ("B123", "Enrichment of B123\necDNA signature"),
    ("B123_regressed", "Regressed enrichment of\nB123 ecDNA signature"),
]
for panel_ax, (score_col, y_label) in zip(axes, panels):
    sns.scatterplot(b123.obs, x="cnv_score", y=score_col, ax=panel_ax)
    sns.despine(top=True, right=True, left=False, bottom=False)
    panel_ax.set_xlabel("CNV score")
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.savefig("scatterplot_B123_ecDNA_signature.svg", dpi=300)
plt.show()
plt.close()

In [None]:
# Pearson correlation of the CNV score with the raw signature ...
print(pearsonr(b123.obs["cnv_score"], b123.obs["B123"]))
# ... and with the residuals after regressing the CNV score out.
print(pearsonr(b123.obs["cnv_score"], b123.obs["B123_regressed"]))

In [None]:
# Raw (top) and regressed (bottom) B123 signature, split by amplification.
fig, axes = plt.subplots(2, 1, figsize=(6, 8), sharex=True)
box_style = dict(
    x="amplification",
    hue="amplification",
    palette=["#949494", "#78D3D3"],
    legend=False,
)
panels = [
    ("B123", "Enrichment of B123\necDNA signature"),
    ("B123_regressed", "Regressed enrichment of\nB123 ecDNA signature"),
]
for panel_ax, (score_col, y_label) in zip(axes, panels):
    sns.boxplot(b123.obs, y=score_col, ax=panel_ax, **box_style)
    sns.despine(top=True, right=True, left=False, bottom=False)
    panel_ax.set_xlabel("Amplification")
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.savefig("boxplot_B123_ecDNA_signature_amplification.svg", dpi=300)
plt.show()
plt.close()

In [None]:
# One-sided Mann-Whitney U tests (amplified > non-amplified), first for the
# raw and then for the regressed signature. `amplification` is a boolean
# column, so it can be used as a mask directly.
amplified = b123.obs["amplification"]
for score_col in ("B123", "B123_regressed"):
    with_amp = b123.obs[amplified][score_col]
    without_amp = b123.obs[~amplified][score_col]
    print(mannwhitneyu(with_amp, without_amp, alternative="greater"))

In [None]:
# Median split of the regressed B123 signature into "low" / "high".
b123.obs["B123_bin"] = pd.qcut(
    b123.obs["B123_regressed"], q=[0, 0.5, 1.0], labels=["low", "high"]
)
# A spot is ecDNA-positive only if its regressed signature is "high" AND an
# amplification was detected in the genomic bins selected above.
signature_high = b123.obs["B123_bin"] == "high"
amplified = b123.obs["amplification"]
b123.obs["ecDNA_status"] = signature_high & amplified

In [None]:
# Replace the boolean flag by readable labels.
is_positive = b123.obs["ecDNA_status"] == True
status_labels = np.where(is_positive, "ecDNA-positive", "ecDNA-negative")
b123.obs["ecDNA_status"] = status_labels

In [None]:
# 2x2 grid of spatial maps: each obs column is paired with its panel title.
panel_keys = ["B123_regressed", "amplification", "B123_bin", "ecDNA_status"]
panel_titles = [
    "B123 regressed",
    "Amplification",
    "Binarized B123 signature",
    "Final ecDNA status",
]
sc.pl.spatial(
    b123,
    color=panel_keys,
    title=panel_titles,
    ncols=2,
    size=1.5,
    cmap="viridis",
    palette=["#949494", "#78D3D3"],
    vmax="p99",
    show=False,
)
plt.savefig("spatial_B123sig_amplification.svg", dpi=300)
plt.show()
plt.close()

We see that two distinct regions are ecDNA-positive. Because we cannot be sure whether ecDNA-positive and ecDNA-negative regions have comparable tumor content, we will try to compare these two distinct regions with each other.

In [None]:
# Number of spots per segmentation label (labels come from SpatialDE2).
b123.obs["segmentation_labels"].value_counts()

In [None]:
# Spot counts per (ecDNA status, segmentation label). No clusters are filtered
# out here; every label present in the counts table is tested below.
# NOTE(review): the code relies on the counts column being called "count"
# after reset_index() — confirm this holds for the pinned pandas version.
data = (
    b123.obs.groupby("ecDNA_status")["segmentation_labels"].value_counts().reset_index()
)

positive_rows = data[data["ecDNA_status"] == "ecDNA-positive"]
negative_rows = data[data["ecDNA_status"] == "ecDNA-negative"]

# Per-label and overall spot counts for each ecDNA status.
all_true = positive_rows.groupby("segmentation_labels")["count"].sum()
all_false = negative_rows.groupby("segmentation_labels")["count"].sum()
total_true = positive_rows["count"].sum()
total_false = negative_rows["count"].sum()

results = []
for cl in data["segmentation_labels"].unique():
    a = all_true.get(cl, 0)
    b = all_false.get(cl, 0)
    # 2x2 table: rows = inside / outside this label,
    # columns = ecDNA-positive / ecDNA-negative spots.
    table = [[a, b], [total_true - a, total_false - b]]
    # One-sided test: are ecDNA-positive spots over-represented in this label?
    oddsratio, pvalue = fisher_exact(table, alternative="greater")
    results.append({"segmentation_labels": cl, "pvalue": pvalue, "a": a, "b": b})

results_df = pd.DataFrame(results)
# Benjamini-Hochberg correction across all tested labels.
_, padj, *_ = multipletests(results_df["pvalue"], method="fdr_bh")
results_df["padj"] = padj

results_df.sort_values("padj")

In [None]:
sns.set_theme(style="white", rc=PLOTTING_PARAMS)
# Labels as rows, ecDNA status as columns, then normalise each row to 1 so
# the bars show the per-cluster fraction of each status.
pivoted_df = data.pivot(
    index="segmentation_labels", columns="ecDNA_status", values="count"
)
cluster_totals = pivoted_df.sum(axis=1)
fractions = pivoted_df.div(cluster_totals, axis=0)
ax = fractions.plot.bar(
    stacked=True, width=0.9, figsize=(6, 4), color=["#949494", "#78D3D3"]
)
for side in ("top", "right"):
    ax.spines[side].set_visible(False)
plt.xticks(rotation=0)
plt.xlabel("SpatialDE2 cluster")
plt.ylabel("Fraction of spots\nper cluster")
plt.legend(title="ecDNA status", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.savefig("suppfig_14B_barplot_ecdna_spatialde.svg", dpi=300)
plt.show()
plt.close()

In [None]:
# Spatial map of the SpatialDE2 clusters with the seaborn "colorblind" palette.
# NOTE(review): the next cell reads b123.uns["segmentation_labels_colors"],
# which is presumably filled in by this call — confirm.
colorblind_palette = sns.color_palette("colorblind")
sc.pl.spatial(
    b123,
    color=["segmentation_labels"],
    title=["SpatialDE2 cluster"],
    palette=colorblind_palette,
    size=1.5,
    show=False,
)
plt.savefig("suppfig_14C_spatial_spatialde2.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Reuse the stored cluster colours but paint the last cluster light grey.
# Unpacking into a new list keeps this working whether the stored colours are
# a list or an array (list concatenation with `+` would fail on an array).
cs = [*b123.uns["segmentation_labels_colors"][:-1], "#E0E4E4"]

In [None]:
# Same spatial cluster map, drawn with the adjusted colour list `cs`.
# It is written to the same file name as the earlier cluster map in this
# notebook, so this version is the one left on disk.
output_svg = "suppfig_14C_spatial_spatialde2.svg"
sc.pl.spatial(
    b123,
    color=["segmentation_labels"],
    title=["SpatialDE2 cluster"],
    palette=cs,
    size=1.5,
    show=False,
)
plt.savefig(output_svg, dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Spatial map of the final ecDNA status, using the same two colours as the
# other status plots in this notebook.
status_palette = ["#949494", "#78D3D3"]
sc.pl.spatial(
    b123,
    color=["ecDNA_status"],
    title=["ecDNA"],
    palette=status_palette,
    size=1.5,
    show=False,
)
plt.savefig("suppfig_14D_spatial_spatialde2_ecDNA.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

Although we still see two subclones enriched for ecDNA, the ecDNA status visually does not correspond as well to the subclones as it does to the SpatialDE segments. We will therefore focus on comparing the segments.

In [None]:
# Flag the spots that belong to segmentation labels 3, 9 or 10.
# NOTE(review): presumably these are the labels found enriched for
# ecDNA-positive spots in the Fisher test — confirm against its output.
b123.obs["ecDNA_enrichment"] = b123.obs["segmentation_labels"].isin([3, 9, 10])

In [None]:
# Cell-type columns used below, assembled lineage by lineage; the resulting
# order is the order in which they are selected and plotted.
epithelial_types = [
    "CDH12_Epithelial",
    "Cycling_Epithelial",
    "KRT6A_Epithelial",
    "KRT13_Epithelial",
    "UPK_Epithelial",
]
fibroblast_types = [
    "ACTA2_Fibroblast",
    "FAP_Fibroblast",
    "PDGFRB_Fibroblast",
    "PDPN_Fibroblast",
]
myeloid_types = ["Inflam_Macrophage", "MHCII_Macrophage", "Dendritic_cell"]
lymphoid_types = ["CD20_Bcell", "Plasma_Bcell", "CD8T", "Naive_Tcell", "Treg"]

cellt = [
    *epithelial_types,
    *fibroblast_types,
    "Endothelial",
    *myeloid_types,
    *lymphoid_types,
    "Normal_Smooth_muscle",
]

In [None]:
# Display names for the cell-type columns; the values end up as tick labels
# in the figures. The epithelial names all contain the word "epithelial",
# which later cells use to split the plots into epithelial / non-epithelial.
cellt_newnames = {
    "CDH12_Epithelial": "Immune-infiltrated epithelial",  # typo "inflitrated" fixed
    "Cycling_Epithelial": "Cycling epithelial",
    "KRT6A_Epithelial": "Basal epithelial",
    "KRT13_Epithelial": "Luminal epithelial (KRT13+)",
    "UPK_Epithelial": "Luminal epithelial (UPK)",
    "ACTA2_Fibroblast": "mCAF",
    "FAP_Fibroblast": "iCAF (FAP+)",
    "PDGFRB_Fibroblast": "General CAF",
    "PDPN_Fibroblast": "iCAF (PDPN+)",
    "Endothelial": "Endothelial",
    "Inflam_Macrophage": "Inflammatory macrophage",
    "MHCII_Macrophage": "Antigen-presenting macrophage",
    "Dendritic_cell": "Dendritic cell",
    "CD20_Bcell": "B cell",
    "Plasma_Bcell": "Plasma cell",
    "CD8T": "CD8+ T cell",
    "Naive_Tcell": "Naive T cell",
    "Treg": "Regulatory T cell",
    "Normal_Smooth_muscle": "Normal smooth muscle",
}

In [None]:
# Add the per-spot cell-type abundances from the cell2location posteriors.
posteriors_h5ad = "../../../spatial_transcriptomics/cell2location/visium_model_alpha20_N20_Gouin_muscle_merged/posteriors_adata.h5ad"
celltypes = sc.read_h5ad(posteriors_h5ad)
s = select_slide(celltypes, "B123")
abundance_columns = s.obs[cellt]
b123.obs = b123.obs.join(abundance_columns)

# Drop the reference to the full posterior object once it is no longer needed.
del celltypes

In [None]:
# Convert the abundances into per-spot fractions (each row sums to 1).
abundances = b123.obs[cellt]
spot_totals = abundances.sum(axis=1)
df = abundances.div(spot_totals, axis=0)

In [None]:
# Replace the raw column names by their display names.
df = df.rename(columns=cellt_newnames)

In [None]:
# Long format: one row per (spot, cell type), keeping the two annotation
# columns as identifiers.
id_columns = ["ecDNA_enrichment", "segmentation_labels"]
annotated = df.join(b123.obs[id_columns])
df = annotated.melt(id_vars=id_columns)
df

In [None]:
# Restrict the comparison to segmentation labels 3, 9 and 10.
segments_of_interest = [3, 9, 10]
# .copy() makes df_sub an independent frame, so the column assignments below
# do not write into a slice of `df`.
df_sub = df[df["segmentation_labels"].isin(segments_of_interest)].copy()
# Restrict the categories to the three labels and fix their order
# (used for the hue order / colours in the plots).
df_sub["segmentation_labels"] = pd.Categorical(
    df_sub["segmentation_labels"], categories=segments_of_interest
)
df_sub["variable"] = df_sub["variable"].str.replace("_", " ")
df_sub

In [None]:
# Cell-type fractions of all cell types, one box per segment (3, 9, 10),
# coloured with the corresponding entries of `cs`.
segment_palette = [cs[i] for i in (3, 9, 10)]
sns.boxplot(
    df_sub,
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=segment_palette,
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of cell type")
plt.legend(title="SpatialDE cluster", loc=(0.7, 0.7))
plt.savefig("boxplot_celltypes_spatialde_segments.svg", dpi=300, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Same plot, restricted to rows whose cell-type name contains "epithelial".
is_epithelial = df_sub["variable"].str.contains("epithelial")
epithelial_rows = df_sub[is_epithelial]
segment_palette = [cs[i] for i in (3, 9, 10)]
sns.boxplot(
    epithelial_rows,
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=segment_palette,
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of cell type")
plt.legend(title="SpatialDE cluster", loc=(0.7, 0.7))
plt.savefig(
    "suppfig_14F_boxplot_epithelial_spatialde_segments.svg",
    dpi=300,
    bbox_inches="tight",
)
plt.show()
plt.close()

In [None]:
# Same plot, restricted to rows whose cell-type name does NOT contain
# "epithelial".
is_epithelial = df_sub["variable"].str.contains("epithelial")
non_epithelial_rows = df_sub[~is_epithelial]
segment_palette = [cs[i] for i in (3, 9, 10)]
sns.boxplot(
    non_epithelial_rows,
    x="variable",
    y="value",
    hue="segmentation_labels",
    palette=segment_palette,
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=90)
plt.xlabel("Cell type")
plt.ylabel("Fraction of cell type")
plt.legend(title="SpatialDE cluster", loc=(0.7, 0.7))
plt.savefig(
    "suppfig_14G_boxplot_non_epithelial_spatialde_segments.svg",
    dpi=300,
    bbox_inches="tight",
)
plt.show()
plt.close()

In [None]:
# Spots of segmentation labels 3, 9 and 10 only. .copy() makes the subset an
# independent AnnData, because the following cells overwrite an obs column
# and store differential-expression results on it.
b123_sub = b123[b123.obs["segmentation_labels"].isin([3, 9, 10]), :].copy()
epi = b123_sub.obs[cellt]
# get fractional abundance per spot
epi = epi.div(epi.sum(axis=1), axis=0)
# NOTE(review): `epi` is not referenced again in the visible part of the
# notebook — confirm whether it is still needed.
epi.columns = [f"{ct}_fraction" for ct in epi.columns]

Now let's check which genes are differentially expressed between these two regions.

In [None]:
# Store the segment labels as strings, then rank genes per segment with the
# Wilcoxon method.
labels_as_str = b123_sub.obs["segmentation_labels"].astype(str)
b123_sub.obs["segmentation_labels"] = labels_as_str
sc.tl.rank_genes_groups(b123_sub, groupby="segmentation_labels", method="wilcoxon")

In [None]:
# Back to the default seaborn/matplotlib style before drawing the dotplot.
sns.reset_defaults()
# Top 7 ranked genes per segment, coloured by log fold change.
sc.pl.rank_genes_groups_dotplot(
    b123_sub,
    n_genes=7,
    values_to_plot="logfoldchanges",
    cmap="bwr",
    var_group_rotation=0,
    show=False,
)

dotplot_svg = "suppfig_14E_dotplot_deg_spatialde_segments.svg"
plt.savefig(dotplot_svg, dpi=300, bbox_inches="tight")
plt.show()
plt.close()