# Analysis of PDO-21 and PDO-27 datasets for Figure 6 and SF6

This notebook generates the panels for Figure 6 and SF6. The processed AnnData object is loaded and MELD analysis is performed on the PDO-derived epithelial cells from PDO-21 and PDO-27. proCSC and revCSC gene module scores, based on known gene signatures, are then calculated for PDO-21 and PDO-27. The AnnData inputs and outputs of this notebook can be found on Zenodo: https://zenodo.org/records/8177571

## Load packages and sc plotting settings

In [None]:
# Load packages
import scanpy as sc
import matplotlib as mpl
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.colors import to_rgba
import pandas as pd
from copy import copy
import numpy as np
import sklearn as sk
import scprep
import phate
import scrublet
import random
import anndata as ad
import meld
import scipy.stats as stats
from matplotlib.colors import to_rgba
from pygam import LinearGAM, s


In [None]:
# Global scanpy settings: logging verbosity and shared figure defaults
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.set_figure_params(
    dpi=100,
    dpi_save=800,
    color_map="viridis",
    frameon=True,
    transparent=True,
    facecolor="None",
    format="pdf",
    figsize=[4, 4],
)

# Directory that scanpy plotting functions save their figures into
sc.settings.figdir = 'figures/Fig_6/'

# Seed NumPy's global random number generator for reproducibility
np.random.seed(12)

In [None]:
# Colour per condition replicate (co-culture replicates in reds, PDO replicates in greens)
coculture_cmap = dict(
    Coculture_1='#F9CFCE',
    Coculture_2='#EF9487',
    Coculture_3='#EB716B',
    PDO_1='#B2D1AF',
    PDO_2='#7CC06F',
    PDO_3='#00A300',
)

# Colour per dataset / cell-type combination
cell_type_cmap = dict(
    pdo_27_PDO='#920000',
    pdo_21_PDO='#5dcc00',
    pdo_21_Fibroblast='#A954CE',
    pdo_27_Fibroblast='#412663',
)

# Colour per culture condition
condition_cmap = dict(
    Coculture='#EB716B',
    PDO='#00A300',
    Fibroblast='#0C01F5',
)


## Load processed anndata object

In [None]:
# Path to the h5ad file exported from the preprocessing pipeline
input_file = '../00_scRNA_seq_preprocessing/merged_filtered_trellis_adata_v4.h5ad'

# Read the processed AnnData object from disk
split_adata = sc.read_h5ad(input_file)

# Display the object summary (notebook cell output)
split_adata

## PDO only analysis

In [None]:
# Combined "<dataset>_<cell_type>" label for every cell
split_adata.obs['pdo_cell_type'] = split_adata.obs['dataset'].astype(str).str.cat(
    split_adata.obs['cell_type'].astype(str), sep='_'
)

In [None]:
# Extract only the PDO cells. An explicit copy is taken so that pdo_adata is a
# standalone AnnData object rather than a view of split_adata: the steps that
# follow write into it (PCA here, new .obs columns in the next cell).
pdo_adata = split_adata[split_adata.obs['cell_type'].isin(['PDO'])].copy()

# PCA
sc.tl.pca(pdo_adata, n_comps=100, svd_solver='arpack')

In [None]:
# The replicate is the last character of the sample id
pdo_adata.obs['replicate'] = pdo_adata.obs['sample_id'].str.get(-1)

# Combined "<condition>_<replicate>" label, e.g. the keys used by coculture_cmap
pdo_adata.obs['condition_replicate'] = pdo_adata.obs['condition'].astype(str).str.cat(
    pdo_adata.obs['replicate'].astype(str), sep='_'
)

## PDO21 co-culture perturbation analysis

In [None]:
# Extract only the PDO-21 cells. An explicit copy is taken so that pdo21_adata
# is a standalone AnnData object and not a view of pdo_adata; later cells add
# embeddings and .obs columns to it, and metadata_21 below must refer to the
# same .obs table those cells modify.
pdo21_adata = pdo_adata[pdo_adata.obs['dataset'].isin(['pdo_21'])].copy()

# Extract the per-cell metadata and the expression matrix as a DataFrame
metadata_21 = pdo21_adata.obs
data_21_df = pdo21_adata.to_df()

In [None]:
# Compute the PHATE embedding on the PDO-21 cells only (fixed seed)
sc.external.tl.phate(
    pdo21_adata,
    random_state=12,
)

In [None]:
# Figure 6A (left): PHATE embedding of PDO-21 coloured by condition replicate
sc.external.pl.phate(
    pdo21_adata,
    color=['condition_replicate'],
    palette=coculture_cmap,
    frameon=False,
    add_outline=True,
    save="Fig_6_A_left_pdo21.pdf",
)

In [None]:
# Run MELD on the PDO-21 expression data, using the condition replicate of
# each cell as the sample label
meld_op_21 = meld.MELD()
sample_densities_21 = meld_op_21.fit_transform(
    data_21_df,
    sample_labels=metadata_21['condition_replicate'],
)

# PHATE coordinates used for plotting the MELD results
data_phate_21 = pdo21_adata.obsm['X_phate']

In [None]:
# Helper function to apply L1 normalization across the densities for each replicate
def replicate_normalize_densities(sample_densities, replicate):
    """L1-normalise sample densities within each replicate.

    For every unique replicate label, the columns of ``sample_densities``
    whose names end with that label are selected, and each row of the
    selection is rescaled so that its absolute values sum to 1. Rows whose
    selected values are all zero are left as zeros.

    Parameters
    ----------
    sample_densities : pandas.DataFrame
        One row per cell and one column per sample. Column names must end
        with the replicate label (e.g. ``'Coculture_1'`` for replicate ``'1'``).
    replicate : array-like
        Replicate label of each cell; only its unique values are used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``sample_densities`` holding the normalised values. The
        input frame is not modified.
    """
    sample_likelihoods = sample_densities.copy()
    for rep in np.unique(replicate):
        # Compare as strings so that non-string replicate labels also work
        suffix = str(rep)
        curr_cols = [col for col in sample_densities.columns if str(col).endswith(suffix)]
        if not curr_cols:
            # No sample belongs to this replicate: nothing to normalise
            continue
        curr_densities = sample_densities[curr_cols].to_numpy(dtype=float)
        # Row-wise L1 norm, computed with NumPy so the function does not rely
        # on the sklearn.preprocessing submodule having been imported elsewhere
        row_norms = np.abs(curr_densities).sum(axis=1, keepdims=True)
        # Avoid dividing by zero: all-zero rows stay all-zero
        row_norms[row_norms == 0] = 1.0
        sample_likelihoods[curr_cols] = curr_densities / row_norms
    return sample_likelihoods

In [None]:
# Normalise the PDO-21 MELD densities within each replicate
sample_likelihoods = replicate_normalize_densities(
    sample_densities_21,
    metadata_21['replicate'],
)

In [None]:
# Plot the mean and standard deviation of the co-culture likelihoods on the
# PDO-21 PHATE embedding, side by side
fig, axes = plt.subplots(1, 2, figsize=(8.7, 4))

# Samples whose likelihoods are summarised
experimental_samples = ['Coculture_1', 'Coculture_2', 'Coculture_3']

# Left panel: mean likelihood across the co-culture replicates
scprep.plot.scatter2d(
    data_phate_21,
    c=sample_likelihoods[experimental_samples].mean(axis=1),
    cmap=meld.get_meld_cmap(),
    vmin=0,
    vmax=1,
    title='Mean',
    ticks=False,
    ax=axes[0],
)

# Right panel: standard deviation across the co-culture replicates
scprep.plot.scatter2d(
    data_phate_21,
    c=sample_likelihoods[experimental_samples].std(axis=1),
    vmin=0,
    cmap='inferno',
    title='St. Dev.',
    ticks=False,
    ax=axes[1],
)

fig.tight_layout()

In [None]:
# Store the mean co-culture likelihood of every cell, first in the metadata
# table and then in the AnnData object
metadata_21['coculture_likelihood'] = pdo21_adata.obs['coculture_likelihood'] = (
    sample_likelihoods[experimental_samples].mean(axis=1).values
)

In [None]:
# Store the mean co-culture likelihood in the AnnData object
pdo21_adata.obs['coculture_likelihood'] = (
    sample_likelihoods[experimental_samples].mean(axis=1).values
)

In [None]:
# Figure 6A (middle): PHATE embedding of PDO-21 coloured by the MELD
# co-culture likelihood, on a fixed 0-1 colour scale
sc.external.pl.phate(
    pdo21_adata,
    color=['coculture_likelihood'],
    color_map=meld.get_meld_cmap(),
    vmin=0,
    vmax=1,
    title='',
    frameon=False,
    add_outline=True,
    save="Fig6_A_middle_pdo21.pdf",
)

In [None]:
metadata_21 = pdo21_adata.obs

# Colours keyed by the sort code assigned to each condition replicate below
coculture_cmap_sort = dict(zip(
    ['5', '4', '3', '2', '1', '0'],
    ['#EB716B', '#EF9487', '#F9CFCE', '#00A300', '#7CC06F', '#B2D1AF'],
))

# Sort code of every condition replicate: PDO replicates first, then co-cultures
cluster2order = {
    label: str(position)
    for position, label in enumerate(
        ['PDO_1', 'PDO_2', 'PDO_3', 'Coculture_1', 'Coculture_2', 'Coculture_3']
    )
}

# Add a new `.obs` column holding the sort code of each cell as a categorical
metadata_21['condition_replicate_sort'] = (
    metadata_21['condition_replicate'].map(cluster2order).astype('category')
)

# Fix the category order to 0..5 so plots follow the intended ordering
metadata_21['condition_replicate_sort'] = (
    metadata_21['condition_replicate_sort'].cat.reorder_categories(['0', '1', '2', '3', '4', '5'])
)

In [None]:
# Figure 6A (right): jitter plot of the MELD co-culture likelihood per
# condition replicate, with the per-group means overlaid
fig, ax = plt.subplots(1, figsize=(10, 10))

g = scprep.plot.jitter(
    metadata_21['condition_replicate_sort'],
    pdo21_adata.obs['coculture_likelihood'],
    c=metadata_21['condition_replicate_sort'],
    cmap=coculture_cmap_sort,
    legend=False,
    plot_means=True,
    means_s=50,
    xlabel=False,
    ylabel='Mean Coculture likelihood',
    ax=ax,
)

# Replace the numeric sort codes on the x axis by the condition replicate names
ax.set_xticklabels(
    pd.unique(
        metadata_21.sort_values('condition_replicate_sort', ascending=True)['condition_replicate']
    ),
    rotation=90,
)
ax.set_ylim(0, 1)

fig.tight_layout()

plt.savefig("figures/Fig_6_A_meld_pdo21_right.pdf")

In [None]:
# Neighbourhood graph followed by Leiden clustering on the PDO-21 cells,
# for the downstream VR landscape analysis
sc.pp.neighbors(
    pdo21_adata,
    n_neighbors=100,
    n_pcs=30,
    random_state=12,
)
sc.tl.leiden(
    pdo21_adata,
    resolution=0.2,
    random_state=12,
    key_added="leidenr0.2",
)

In [None]:
metadata_21 = pdo21_adata.obs

# Bin the cells on their MELD score into three single-character labels:
# "0" (<= 0.4), "1" (0.4 < score <= 0.6) and "2" (> 0.6)
perturbed = np.zeros(len(metadata_21), dtype='<U1')  # starts as empty strings

high_mask = (metadata_21['coculture_likelihood'] > 0.6).to_numpy()
low_mask = (metadata_21['coculture_likelihood'] <= 0.4).to_numpy()
mid_mask = (
    (metadata_21['coculture_likelihood'] <= 0.6) & (metadata_21['coculture_likelihood'] > 0.4)
).to_numpy()

perturbed[low_mask] = "0"
perturbed[mid_mask] = "1"
perturbed[high_mask] = "2"

# Store the bin labels as a new metadata column
pdo21_adata.obs['perturbed'] = perturbed

In [None]:
# Compute PDO-21 specific per-cell and per-gene QC metrics and store them in
# the AnnData object (the n_cells_by_counts column is used below to filter
# the gene signatures)
sc.pp.calculate_qc_metrics(
    pdo21_adata,
    use_raw=True,
    percent_top=None,
    log1p=False,
    inplace=True,
)

In [None]:
# Read the curated proCSC / revCSC gene signatures
gene_signatures = pd.read_csv("data/CuratedEpithelia_pro_rev_geneSet_sigs.csv")

# Per-annotation row counts (the result is not stored)
gene_signatures.groupby("ANNOTATION").count()

# The later cells refer to this table as stem_signatures
stem_signatures = gene_signatures

In [None]:
# Published WNT gene signatures (mouse gene symbols)
g_WntMORRAL20 = pd.Series([
    "Ascl2", "Axin2", "Lgr5", "Sp5", "Cachd1", "Smoc2", "Fam216a", "Lef1",
    "Cyp4x1", "Slc12a2", "Dpep1", "Nkd1", "Lrp4", "Znrf3", "Ptpro", "Apcdd1",
    "Tnfrsf19", "Sox2", "Ptch1", "Tspan5", "Myrip", "Cdk6", "Itga9", "Ppp2r2c",
    "Chil1", "Fry", "Cxcr4", "Slc28a3", "Sacs", "Has2", "Sp8", "Cdca7",
    "Ets2", "Sox4", "Vsnl1", "Cadm1", "Slc7a8", "Zfp704", "Fam43a", "Prox1",
    "Nrxn3", "Fgf18", "9530003J23Rik", "Bambi", "Amacr", "Lrig1", "Nav2", "Kremen1",
    "Kcnj2", "Slc16a10", "Prss23", "Rgmb", "Rtkn2", "Frem2", "Oxgr1", "Abcc4",
    "Mme", "Kcnj8", "Asb4",
])
g_WntHAN20 = pd.Series([
    "Axin2", "Apcdd1", "Nkd1", "Tnfrsf19", "Smoc2", "Cd44", "Wnt6",
])
g_WntReceptQINCARDOSO23 = pd.Series([
    "Fzd1", "Fzd2", "Fzd3", "Fzd4", "Fzd5", "Fzd6", "Fzd7", "Fzd8", "Fzd9",
    "Fzd10", "Lrp5", "Lrp6", "Ror1", "Ror2", "Musk", "Ryk", "Ptk7",
])

# Published YAP gene signatures (mouse gene symbols)
g_YapALVAREZ22 = pd.Series([
    "Anxa1", "Ccn1", "Ccn2", "Fjx1", "Axl", "Ereg", "Wwc2",
])
g_YapHAN20 = pd.Series([
    "Ankrd1", "Ccn1", "Ccn2", "Axl", "Msln", "Plaur",
])
g_YapWANG18 = pd.Series([
    "Amotl2", "Ankrd1", "Igfbp3", "F3", "Fjx1", "Nuak2", "Gm49361", "Crim1",
    "Gadd45a", "Tgfb2", "Ptpn14", "Nt5e", "Foxf2", "Axl", "Dock5", "Asap1",
    "Rbms3", "Myof", "Arhgef17", "Ccdc80",
])


In [None]:
# Stack the WNT / YAP signatures into one long table with the columns
# ORIGIN (name of the signature) and GENE (gene symbol)
signature_names = [
    "g_WntMORRAL20", "g_WntHAN20", "g_WntReceptQINCARDOSO23",
    "g_YapALVAREZ22", "g_YapHAN20", "g_YapWANG18",
]
signature_series = [
    g_WntMORRAL20, g_WntHAN20, g_WntReceptQINCARDOSO23,
    g_YapALVAREZ22, g_YapHAN20, g_YapWANG18,
]
sign_signatures = pd.concat(signature_series, keys=signature_names)
# Move the signature name out of the index into a column
sign_signatures = sign_signatures.reset_index(level=[0], name="GENE")
sign_signatures = sign_signatures.rename(columns={"level_0": "ORIGIN"})
sign_signatures

In [None]:
from biomart import BiomartServer

# Connect to the Ensembl BioMart mouse gene dataset
mart = BiomartServer("http://www.ensembl.org/biomart").datasets["mmusculus_gene_ensembl"]
# mart.show_attributes()
# for i in mart.show_attributes_by_page():
#     print(i)

# Query every signature gene (by MGI symbol) for its gene name and the name
# of its associated human homolog
query_genes = stem_signatures["GENE"].to_list() + sign_signatures["GENE"].to_list()
response = mart.search({
    "filters": {
        "mgi_symbol": query_genes
    },
    "attributes": [
        "external_gene_name",
        "hsapiens_homolog_associated_gene_name"
    ]
})

# Build the gene name -> human homolog lookup from the tab-separated response;
# the human gene names are stored exactly as returned (upper casing preserved)
homology_dict = {}
for raw_line in response.iter_lines():
    fields = raw_line.decode('utf-8').split("\t")
    homology_dict[fields[0]] = fields[1]

homology_dict

In [None]:
# Replace the gene symbols in the GENE column by their human homologs, turn
# empty strings (no homolog returned) into NaN and drop rows with missing values
stem_signatures = (
    stem_signatures
    .replace({"GENE": homology_dict})
    .replace("", np.nan)
    .dropna()
)

In [None]:
# Replace the gene symbols in the GENE column by their human homologs, turn
# empty strings (no homolog returned) into NaN and drop rows with missing values
sign_signatures = (
    sign_signatures
    .replace({"GENE": homology_dict})
    .replace("", np.nan)
    .dropna()
)

In [None]:
# Keep only WNT / YAP signature genes detected in more than 320 PDO-21 cells
sign_signatures_filter = sign_signatures[
    sign_signatures["GENE"].isin(
        pdo21_adata.var_names[pdo21_adata.var.n_cells_by_counts > 320]
    )
]

In [None]:
# Keep only proCSC / revCSC signature genes detected in more than 320 PDO-21 cells
stem_signatures_filter = stem_signatures[
    stem_signatures['GENE'].isin(
        pdo21_adata.var_names[pdo21_adata.var.n_cells_by_counts > 320]
    )
]

In [None]:
# Load the Leedham revCSC signature table
leedham_df = pd.read_csv("data/leedham_sig.csv")

# Take the human homolog gene names and discard the missing entries
leedham_sig = leedham_df['hsapiens_homolog_associated_gene_name'].dropna()

In [None]:
# Keep only Leedham signature genes detected in more than 320 PDO-21 cells
leedham_sig_filter = leedham_sig[
    leedham_sig.isin(
        pdo21_adata.var_names[pdo21_adata.var.n_cells_by_counts > 320]
    )
]

In [None]:
# Load the Pelka proCSC and enteroendocrine signatures
pelka_sig_df = pd.read_csv("data/pelka_sig.csv")

# proCSC signature: keep genes detected in more than 320 PDO-21 cells
proCSC_Pelka21 = pelka_sig_df.loc[pelka_sig_df["ANNOTATION"] == "pelka21_proSC", "GENE"]
proCSC_Pelka21_filter = proCSC_Pelka21[
    proCSC_Pelka21.isin(pdo21_adata.var_names[pdo21_adata.var.n_cells_by_counts > 320])
]

# Enteroendocrine signature: keep genes detected in more than 50 PDO-21 cells
# NOTE(review): this threshold (50) differs from the 320 used for every other
# signature - confirm that the lower cut-off is intentional
enteroendocrine_Pelka21 = pelka_sig_df.loc[
    pelka_sig_df["ANNOTATION"] == "pelka21_enteroendocrine", "GENE"
]
enteroendocrine_Pelka21_filter = enteroendocrine_Pelka21[
    enteroendocrine_Pelka21.isin(pdo21_adata.var_names[pdo21_adata.var.n_cells_by_counts > 50])
]


In [None]:
# Convert every filtered signature to a plain list of gene names for scoring.
# The two Pelka signatures are converted with .to_list() as well, so that all
# gene lists passed to the scoring step have the same type.
proCSCsign_sig = stem_signatures_filter.loc[stem_signatures_filter["ANNOTATION"]=="proCSC_sig","GENE"].to_list()
proCSC_Pelka21 = proCSC_Pelka21_filter.to_list()
revCSCsign_sig = stem_signatures_filter.loc[stem_signatures_filter["ANNOTATION"]=="revCSC_sig","GENE"].to_list()
revCSC_Leedham22 = leedham_sig_filter.to_list()

enteroendocrine_Pelka21 = enteroendocrine_Pelka21_filter.to_list()

# WNT and YAP signatures, selected by their ORIGIN label
WntHAN20sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_WntHAN20","GENE"].to_list()
WntMORRAL20sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_WntMORRAL20","GENE"].to_list()
WntReceptQINCARDOSO23sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_WntReceptQINCARDOSO23","GENE"].to_list()
YapALVAREZ22sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_YapALVAREZ22","GENE"].to_list()
YapHAN20sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_YapHAN20","GENE"].to_list()
YapWANG18sign = sign_signatures_filter.loc[sign_signatures_filter["ORIGIN"]=="g_YapWANG18","GENE"].to_list()

In [None]:
# Score every gene signature on the PDO-21 cells. Each entry pairs a gene list
# with the name of the .obs column its score is read from in later cells.
pdo21_signature_scores = [
    (proCSCsign_sig, "proCSCsign_sig"),
    (proCSC_Pelka21, "proCSC_Pelka21_sig"),
    (revCSCsign_sig, "revCSCsign_sig"),
    (revCSC_Leedham22, "revLeedham22sign"),
    (enteroendocrine_Pelka21, "EntPelka21sign"),
    (WntHAN20sign, "WntHAN20sign"),
    (WntMORRAL20sign, "WntMORRAL20sign"),
    (WntReceptQINCARDOSO23sign, "WntReceptQINCARDOSO23sign"),
    (YapALVAREZ22sign, "YapALVAREZ22sign"),
    (YapHAN20sign, "YapHAN20sign"),
    (YapWANG18sign, "YapWANG18sign"),
]
for signature_genes, signature_name in pdo21_signature_scores:
    sc.tl.score_genes(
        pdo21_adata,
        gene_list=signature_genes,
        ctrl_size=25,
        score_name=signature_name,
        random_state=12,
        use_raw=True,
    )

In [None]:
pdo_21_metadata_df = pdo21_adata.obs

# Combined "<perturbed bin>_<replicate>" label for every cell
pdo_21_metadata_df["replicate_perturbed"] = pdo_21_metadata_df["perturbed"].astype(str).str.cat(
    pdo_21_metadata_df["replicate"].astype(str), sep="_"
)

In [None]:
# Diverging palette (not passed to the matrixplot call below, which uses 'coolwarm')
cmap_mr = sns.diverging_palette(240, 14, n=99)

# Figure 6E: signature scores per MELD perturbation bin, each score scaled
# across the bins (standard_scale='var')
heatmap_scores = [
    'proCSCsign_sig', 'proCSC_Pelka21_sig',
    'WntReceptQINCARDOSO23sign', 'WntHAN20sign', 'WntMORRAL20sign',
    'revCSCsign_sig', 'revLeedham22sign',
    'YapWANG18sign', 'YapALVAREZ22sign', 'YapHAN20sign',
]
sc.pl.matrixplot(
    pdo21_adata,
    heatmap_scores,
    groupby='perturbed',
    dendrogram=False,
    standard_scale='var',
    cmap='coolwarm',
    save="Fig6_E_heatmap",
)

In [None]:
# Figure 6D (right): revCSC signature score per condition replicate (PDO-21)

# Mean score of every condition replicate
mean_values = pdo_21_metadata_df.groupby(['condition_replicate'])['revCSCsign_sig'].mean()

# Create a DataFrame from the mean values
means_df = pd.DataFrame(mean_values).reset_index()

# Desired plotting order: PDO replicates first, then co-culture replicates
order = ['PDO_1', 'PDO_2', 'PDO_3', 'Coculture_1', 'Coculture_2', 'Coculture_3']

# Sort the means into the plotting order so they line up with the violins
means_df['condition_replicate'] = pd.Categorical(means_df['condition_replicate'], categories=order, ordered=True)
means_df.sort_values(by='condition_replicate', inplace=True)
means_df.reset_index(drop=True, inplace=True)

# Condition of each row, derived from the label ("PDO_1" -> "PDO") instead
# of a hard-coded six-element list
means_df['condition'] = means_df['condition_replicate'].astype(str).str.rsplit('_', n=1).str[0]

# Violin plot of the score distributions. Only one figure is created; the
# extra plt.figure(figsize=(8, 6)) call left an empty figure behind.
plt.figure(figsize=(3, 6))
sns.violinplot(data=pdo_21_metadata_df, x='condition_replicate', y='revCSCsign_sig', inner=None, color='white', alpha=0.3, linewidth=1.5, palette=coculture_cmap,
               order=order)

# Overlay the mean of every condition replicate as a black dot
plt.scatter(x=range(len(means_df)), y=means_df['revCSCsign_sig'], c="k", s=5)

# Label axes
plt.xticks(rotation=90)
plt.xlabel('Condition')
plt.ylabel('revCSC_sig')
plt.title('revCSC_sig score by Condition')

# Remove horizontal gridlines
plt.gca().yaxis.grid(False)

# save
plt.savefig("figures/Fig6_D_right.pdf", bbox_inches='tight')


In [None]:
# Figure 6C (right): proCSC signature score per condition replicate (PDO-21)

# Mean score of every condition replicate
mean_values = pdo_21_metadata_df.groupby(['condition_replicate'])['proCSCsign_sig'].mean()

# Create a DataFrame from the mean values
means_df = pd.DataFrame(mean_values).reset_index()

# Desired plotting order: PDO replicates first, then co-culture replicates
order = ['PDO_1', 'PDO_2', 'PDO_3', 'Coculture_1', 'Coculture_2', 'Coculture_3']

# Sort the means into the plotting order so they line up with the violins
means_df['condition_replicate'] = pd.Categorical(means_df['condition_replicate'], categories=order, ordered=True)
means_df.sort_values(by='condition_replicate', inplace=True)
means_df.reset_index(drop=True, inplace=True)

# Condition of each row, derived from the label ("PDO_1" -> "PDO") instead
# of a hard-coded six-element list
means_df['condition'] = means_df['condition_replicate'].astype(str).str.rsplit('_', n=1).str[0]

# Violin plot of the score distributions. Only one figure is created; the
# extra plt.figure(figsize=(8, 6)) call left an empty figure behind.
plt.figure(figsize=(3, 6))
sns.violinplot(data=pdo_21_metadata_df, x='condition_replicate', y='proCSCsign_sig', inner=None, color='white', alpha=0.3, linewidth=1.5, palette=coculture_cmap,
               order=order)

# Overlay the mean of every condition replicate as a black dot
plt.scatter(x=range(len(means_df)), y=means_df['proCSCsign_sig'], c="k", s=5)

# Label axes: the x axis shows condition replicates, not MELD bins
plt.xticks(rotation=90)
plt.xlabel('Condition')
plt.ylabel('proCSCsign_sig')
plt.title('proCSCsign_sig score by Condition')

# Remove horizontal gridlines
plt.gca().yaxis.grid(False)

# save (the file name previously carried a doubled ".pdf.pdf" extension)
plt.savefig("figures/Fig6_C_right.pdf", bbox_inches='tight')


In [None]:
# Figure 6D (left): revCSC signature score as a function of the MELD
# co-culture likelihood (PDO-21), with one GAM fitted per replicate

# Condition replicate of every cell, used to colour the scatter points
labels = pdo21_adata.obs['condition_replicate']

# One row per cell: MELD likelihood (X), signature score (Y) and replicate
data = pd.DataFrame({'X': pdo21_adata.obs['coculture_likelihood'],
                     'Y': pdo21_adata.obs['revCSCsign_sig'],
                     'replicate': pdo21_adata.obs['replicate']
})

# Plotting settings
mean_color = 'black'  # Color for the mean line

# Sort the data based on 'X' values so the fitted curves are drawn left to right
sorted_data = data.sort_values(by='X')

# Initialize plot
mpl.style.use('default')
fig, ax = plt.subplots()
ax.scatter(data['X'], data['Y'], s=0.25, c=labels.map(coculture_cmap))

# Fit one GAM (single spline term on X, 10 splines) per replicate
gam_models = []
for replicate in sorted_data['replicate'].unique():
    # Subset the data for the current replicate
    subset_data = sorted_data[sorted_data['replicate'] == replicate]

    # Extract the X and Y values for the current replicate
    X = subset_data['X'].values.reshape(-1, 1)
    Y = subset_data['Y'].values

    gam = LinearGAM(s(0, n_splines=10))
    gam.fit(X, Y)
    gam_models.append(gam)

# Evaluate every replicate model on the same X values.
# NOTE(review): X holds the values of the LAST replicate processed by the loop
# above, so all curves are evaluated on that replicate's cells - confirm this
# evaluation grid is the intended one.
predictions = pd.DataFrame(data=[gam.predict(X) for gam in gam_models]).T

# Summary statistics across the replicate models; the number of models is
# taken from the data instead of being hard-coded to 3
n_models = predictions.shape[1]
sample_df = predictions.copy()
sample_df['mean'] = predictions.mean(axis=1)
sample_df['std'] = predictions.std(axis=1)
# Normal-approximation 95% confidence interval of the mean
sample_df['ci'] = 1.96 * sample_df['std'] / np.sqrt(n_models)
sample_df['ci_lower'] = sample_df['mean'] - sample_df['ci']
sample_df['ci_upper'] = sample_df['mean'] + sample_df['ci']

# Mean curve and confidence band
plt.plot(X, sample_df['mean'], color=mean_color, linewidth=1.75, label='Mean')
plt.plot(X, sample_df['ci_upper'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci upper')
plt.plot(X, sample_df['ci_lower'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci lower')

# Customize the plot
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('X')
plt.ylabel('Y')
plt.title('GAM Lines for RevCSC_SIG score')

plt.savefig("figures/Fig6_D_left_GAM_revCSC.pdf", bbox_inches="tight")

plt.show()

In [None]:
# Figure 6C (left): proCSC signature score as a function of the MELD
# co-culture likelihood (PDO-21), with one GAM fitted per replicate

# Condition replicate of every cell, used to colour the scatter points
labels = pdo21_adata.obs['condition_replicate']

# One row per cell: MELD likelihood (X), signature score (Y) and replicate
data = pd.DataFrame({'X': pdo21_adata.obs['coculture_likelihood'],
                     'Y': pdo21_adata.obs['proCSCsign_sig'],
                     'replicate': pdo21_adata.obs['replicate']
})

# Plotting settings
mean_color = 'black'  # Color for the mean line

# Sort the data based on 'X' values so the fitted curves are drawn left to right
sorted_data = data.sort_values(by='X')

# Begin plot
mpl.style.use('default')
fig, ax = plt.subplots()
ax.scatter(data['X'], data['Y'], s=0.25, c=labels.map(coculture_cmap))

# Fit one GAM (single spline term on X, 10 splines) per replicate
gam_models = []
for replicate in sorted_data['replicate'].unique():
    # Subset the data for the current replicate
    subset_data = sorted_data[sorted_data['replicate'] == replicate]

    # Extract the X and Y values for the current replicate
    X = subset_data['X'].values.reshape(-1, 1)
    Y = subset_data['Y'].values

    gam = LinearGAM(s(0, n_splines=10))
    gam.fit(X, Y)
    gam_models.append(gam)

# Evaluate every replicate model on the same X values.
# NOTE(review): X holds the values of the LAST replicate processed by the loop
# above, so all curves are evaluated on that replicate's cells - confirm this
# evaluation grid is the intended one.
predictions = pd.DataFrame(data=[gam.predict(X) for gam in gam_models]).T

# Summary statistics across the replicate models; the number of models is
# taken from the data instead of being hard-coded to 3
n_models = predictions.shape[1]
sample_df = predictions.copy()
sample_df['mean'] = predictions.mean(axis=1)
sample_df['std'] = predictions.std(axis=1)
# Normal-approximation 95% confidence interval of the mean
sample_df['ci'] = 1.96 * sample_df['std'] / np.sqrt(n_models)
sample_df['ci_lower'] = sample_df['mean'] - sample_df['ci']
sample_df['ci_upper'] = sample_df['mean'] + sample_df['ci']

# Mean curve and confidence band
plt.plot(X, sample_df['mean'], color=mean_color, linewidth=1.75, label='Mean')
plt.plot(X, sample_df['ci_upper'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci upper')
plt.plot(X, sample_df['ci_lower'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci lower')
# Customize the plot
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('X')
plt.ylabel('Y')
plt.title('GAM Lines for proCSC_SIG score')

# Saved as panel C (left) to pair with the proCSC violin plot, which is saved
# as Fig6_C_right; the previous "Fig6_E_left" name clashed with the panel E
# heatmap. NOTE(review): confirm the panel letter against the final figure.
plt.savefig("figures/Fig6_C_left_GAM_proCSC.pdf", bbox_inches="tight")

plt.show()

In [None]:
# Figure S6E: Pelka enteroendocrine signature score as a function of the MELD
# co-culture likelihood (PDO-21), with one GAM fitted per replicate

# Condition replicate of every cell, used to colour the scatter points
labels = pdo21_adata.obs['condition_replicate']

# One row per cell: MELD likelihood (X), signature score (Y) and replicate
data = pd.DataFrame({'X': pdo21_adata.obs['coculture_likelihood'],
                     'Y': pdo21_adata.obs['EntPelka21sign'],
                     'replicate': pdo21_adata.obs['replicate']
})

# Plotting settings
mean_color = 'black'  # Color for the mean line

# Sort the data based on 'X' values so the fitted curves are drawn left to right
sorted_data = data.sort_values(by='X')

# Begin plot
mpl.style.use('default')
fig, ax = plt.subplots()
ax.scatter(data['X'], data['Y'], s=0.25, c=labels.map(coculture_cmap))

# Fit one GAM (single spline term on X, 10 splines) per replicate
gam_models = []
for replicate in sorted_data['replicate'].unique():
    # Subset the data for the current replicate
    subset_data = sorted_data[sorted_data['replicate'] == replicate]

    # Extract the X and Y values for the current replicate
    X = subset_data['X'].values.reshape(-1, 1)
    Y = subset_data['Y'].values

    gam = LinearGAM(s(0, n_splines=10))
    gam.fit(X, Y)
    gam_models.append(gam)

# Evaluate every replicate model on the same X values.
# NOTE(review): X holds the values of the LAST replicate processed by the loop
# above, so all curves are evaluated on that replicate's cells - confirm this
# evaluation grid is the intended one.
predictions = pd.DataFrame(data=[gam.predict(X) for gam in gam_models]).T

# Summary statistics across the replicate models; the number of models is
# taken from the data instead of being hard-coded to 3
n_models = predictions.shape[1]
sample_df = predictions.copy()
sample_df['mean'] = predictions.mean(axis=1)
sample_df['std'] = predictions.std(axis=1)
# Normal-approximation 95% confidence interval of the mean
sample_df['ci'] = 1.96 * sample_df['std'] / np.sqrt(n_models)
sample_df['ci_lower'] = sample_df['mean'] - sample_df['ci']
sample_df['ci_upper'] = sample_df['mean'] + sample_df['ci']

# Mean curve and confidence band
plt.plot(X, sample_df['mean'], color=mean_color, linewidth=1.75, label='Mean')
plt.plot(X, sample_df['ci_upper'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci upper')
plt.plot(X, sample_df['ci_lower'], color=mean_color, linestyle="dotted", linewidth=1.5, label='0.95 ci lower')
# Customize the plot (axis label spelling corrected: "Likelihood")
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('MELD Likelihood')
plt.ylabel('Pelka Entero score')
plt.title('GAM Lines for Pelka21 DEG entero score')

plt.savefig("figures/FigS6_E_enteroendocrine.pdf", bbox_inches="tight")

plt.show()

In [None]:
# Save the annotated PDO-21 AnnData object to disk
pdo21_adata.write_h5ad('pdo21_epi_adata_v4.h5ad')

## PDO27 co-culture perturbation analysis

In [None]:
# Extract only the PDO-27 cells from the PDO data. An explicit copy is taken
# so that pdo27_adata is a standalone AnnData object and not a view of
# pdo_adata; later cells add embeddings and .obs columns to it, and
# metadata_27 below must refer to the same .obs table those cells modify.
pdo27_adata = pdo_adata[pdo_adata.obs['dataset'].isin(['pdo_27'])].copy()

# Extract the per-cell metadata and the expression matrix as a DataFrame
metadata_27 = pdo27_adata.obs
data_27_df = pdo27_adata.to_df()

In [None]:
# Re-apply the scanpy figure defaults (the PDO-21 GAM cells switch matplotlib
# to its 'default' style)
sc.set_figure_params(
    dpi=100,
    dpi_save=800,
    color_map="viridis",
    frameon=True,
    transparent=True,
    facecolor="None",
    format="pdf",
    figsize=[4, 4],
)

In [None]:
# Compute the PHATE embedding on the PDO-27 cells (t=7, fixed seed)
sc.external.tl.phate(
    pdo27_adata,
    t=7,
    random_state=12,
)

In [None]:
# Figure 6B (left): PHATE embedding of PDO-27 coloured by condition replicate
sc.external.pl.phate(
    pdo27_adata,
    color=['condition_replicate'],
    palette=coculture_cmap,
    title='',
    frameon=False,
    add_outline=True,
    save="Fig6_B_left_pdo27.pdf",
)

In [None]:
# Run MELD on the PDO-27 expression data, using the condition replicate of
# each cell as the sample label
meld_op_27 = meld.MELD()
sample_densities_27 = meld_op_27.fit_transform(
    data_27_df,
    sample_labels=metadata_27['condition_replicate'],
)

In [None]:
# PHATE coordinates used for plotting the MELD results
data_phate_27 = pdo27_adata.obsm['X_phate']

# Normalise the PDO-27 MELD densities within each replicate
sample_likelihoods_27 = replicate_normalize_densities(
    sample_densities_27,
    metadata_27['replicate'],
)

In [None]:
# Plot the mean and standard deviation of the co-culture likelihoods on the
# PDO-27 PHATE embedding, side by side
fig, axes = plt.subplots(1, 2, figsize=(8.7, 4))

# Samples whose likelihoods are summarised
experimental_samples = ['Coculture_1', 'Coculture_2', 'Coculture_3']

# Left panel: mean likelihood across the co-culture replicates
scprep.plot.scatter2d(
    data_phate_27,
    c=sample_likelihoods_27[experimental_samples].mean(axis=1),
    cmap=meld.get_meld_cmap(),
    vmin=0,
    vmax=1,
    title='Mean',
    ticks=False,
    ax=axes[0],
)

# Right panel: standard deviation across the co-culture replicates
scprep.plot.scatter2d(
    data_phate_27,
    c=sample_likelihoods_27[experimental_samples].std(axis=1),
    vmin=0,
    cmap='inferno',
    title='St. Dev.',
    ticks=False,
    ax=axes[1],
)

fig.tight_layout()

In [None]:
# Store the mean co-culture likelihood of every cell in the AnnData object
pdo27_adata.obs['coculture_likelihood'] = (
    sample_likelihoods_27[experimental_samples].mean(axis=1).values
)
# NOTE(review): this column is named 'Coculture_likelihood' (capital C), unlike
# the 'coculture_likelihood' column used everywhere else - confirm whether the
# different spelling is intentional
metadata_27['Coculture_likelihood'] = (
    sample_likelihoods_27[experimental_samples].mean(axis=1).values
)

In [None]:
# Figure 6B (middle): PHATE embedding of PDO-27 coloured by the MELD
# co-culture likelihood, on a fixed 0-1 colour scale
sc.external.pl.phate(
    pdo27_adata,
    color=['coculture_likelihood'],
    color_map=meld.get_meld_cmap(),
    vmin=0,
    vmax=1,
    title='',
    frameon=False,
    add_outline=True,
    save="Fig6_B_middle_pdo27",
)

In [None]:
metadata_27 = pdo27_adata.obs

# Sort code of each cell's condition replicate, stored as a categorical
metadata_27['condition_replicate_sort'] = (
    metadata_27['condition_replicate'].map(cluster2order).astype('category')
)

# Fix the category order to 0..5 so plots follow the intended ordering
metadata_27['condition_replicate_sort'] = (
    metadata_27['condition_replicate_sort'].cat.reorder_categories(['0', '1', '2', '3', '4', '5'])
)

In [None]:
# Figure 6B (right): jitter plot of the MELD co-culture likelihood per
# condition replicate (PDO-27), with the per-group means overlaid
fig, ax = plt.subplots(1, figsize=(10, 10))

g = scprep.plot.jitter(
    metadata_27['condition_replicate_sort'],
    pdo27_adata.obs['coculture_likelihood'],
    c=metadata_27['condition_replicate_sort'],
    cmap=coculture_cmap_sort,
    legend=False,
    plot_means=True,
    means_s=50,
    xlabel=False,
    ylabel='Mean Coculture likelihood',
    ax=ax,
)

# Replace the numeric sort codes on the x axis by the condition replicate names
ax.set_xticklabels(
    pd.unique(
        metadata_27.sort_values('condition_replicate_sort', ascending=True)['condition_replicate']
    ),
    rotation=90,
)
ax.set_ylim(0, 1)

fig.tight_layout()

plt.savefig("figures/Fig6_B_right_pdo27.pdf")

In [None]:
# Score every gene signature on the PDO-27 cells. Each entry pairs a gene list
# with the name of the .obs column its score is read from in later cells.
pdo27_signature_scores = [
    (proCSCsign_sig, "proCSCsign_sig"),
    (proCSC_Pelka21, "proCSC_Pelka21_sig"),
    (revCSCsign_sig, "revCSCsign_sig"),
    (revCSC_Leedham22, "revLeedham22sign"),
    (enteroendocrine_Pelka21, "EntPelka21sign"),
    (WntHAN20sign, "WntHAN20sign"),
    (WntMORRAL20sign, "WntMORRAL20sign"),
    (WntReceptQINCARDOSO23sign, "WntReceptQINCARDOSO23sign"),
    (YapALVAREZ22sign, "YapALVAREZ22sign"),
    (YapHAN20sign, "YapHAN20sign"),
    (YapWANG18sign, "YapWANG18sign"),
]
for signature_genes, signature_name in pdo27_signature_scores:
    sc.tl.score_genes(
        pdo27_adata,
        gene_list=signature_genes,
        ctrl_size=25,
        score_name=signature_name,
        random_state=12,
        use_raw=True,
    )

In [None]:
# Figure S6H: revCSC signature score per condition replicate (PDO-27)

# Mean score of every condition replicate
mean_values = metadata_27.groupby(['condition_replicate'])['revCSCsign_sig'].mean()

# Create a DataFrame from the mean values
means_df = pd.DataFrame(mean_values).reset_index()

# Desired plotting order: PDO replicates first, then co-culture replicates
order = ['PDO_1', 'PDO_2', 'PDO_3', 'Coculture_1', 'Coculture_2', 'Coculture_3']

# Sort the means into the plotting order so they line up with the violins
means_df['condition_replicate'] = pd.Categorical(means_df['condition_replicate'], categories=order, ordered=True)
means_df.sort_values(by='condition_replicate', inplace=True)
means_df.reset_index(drop=True, inplace=True)

# Condition of each row, derived from the label ("PDO_1" -> "PDO") instead
# of a hard-coded six-element list
means_df['condition'] = means_df['condition_replicate'].astype(str).str.rsplit('_', n=1).str[0]

# Violin plot of the score distributions. Only one figure is created; the
# extra plt.figure(figsize=(8, 6)) call left an empty figure behind.
plt.figure(figsize=(3, 6))
sns.violinplot(data=metadata_27, x='condition_replicate', y='revCSCsign_sig', inner=None, color='white', alpha=0.3, linewidth=1.5, palette=coculture_cmap,
               order=order)

# Overlay the mean of every condition replicate as a black dot
plt.scatter(x=range(len(means_df)), y=means_df['revCSCsign_sig'], c="k", s=5)

# Label axes (title typo "revCSC_CSC" corrected to match the y-axis label)
plt.xticks(rotation=90)
plt.xlabel('Condition')
plt.ylabel('revCSC_sig')
plt.title('revCSC_sig score by Condition')

# Remove horizontal gridlines
plt.gca().yaxis.grid(False)

# save
plt.savefig("figures/Fig_S6_H.pdf", bbox_inches='tight')

In [None]:
# Figure S6: proCSC signature score per condition replicate (PDO-27)

# Mean score of every condition replicate
mean_values = metadata_27.groupby(['condition_replicate'])['proCSCsign_sig'].mean()

# Create a DataFrame from the mean values
means_df = pd.DataFrame(mean_values).reset_index()

# Desired plotting order: PDO replicates first, then co-culture replicates
order = ['PDO_1', 'PDO_2', 'PDO_3', 'Coculture_1', 'Coculture_2', 'Coculture_3']

# Sort the means into the plotting order so they line up with the violins
means_df['condition_replicate'] = pd.Categorical(means_df['condition_replicate'], categories=order, ordered=True)
means_df.sort_values(by='condition_replicate', inplace=True)
means_df.reset_index(drop=True, inplace=True)

# Condition of each row, derived from the label ("PDO_1" -> "PDO") instead
# of a hard-coded six-element list
means_df['condition'] = means_df['condition_replicate'].astype(str).str.rsplit('_', n=1).str[0]

# Violin plot of the score distributions. Only one figure is created; the
# extra plt.figure(figsize=(8, 6)) call left an empty figure behind.
plt.figure(figsize=(3, 6))
sns.violinplot(data=metadata_27, x='condition_replicate', y='proCSCsign_sig', inner=None, color='white', alpha=0.3, linewidth=1.5, palette=coculture_cmap,
               order=order)

# Overlay the mean of every condition replicate as a black dot
plt.scatter(x=range(len(means_df)), y=means_df['proCSCsign_sig'], c="k", s=5)

# Label axes
plt.xticks(rotation=90)
plt.xlabel('Condition')
plt.ylabel('proCSCsign_sig')
plt.title('proCSCsign_sig score by Condition')

# Remove horizontal gridlines
plt.gca().yaxis.grid(False)

# save under its own file name: "figures/Fig_S6_H.pdf" is already written by
# the PDO-27 revCSC violin plot and would be overwritten here.
# NOTE(review): confirm the panel letter of this plot against the final figure.
plt.savefig("figures/Fig_S6_H_proCSC.pdf", bbox_inches='tight')

In [None]:
# Save the annotated PDO-27 AnnData object to disk
pdo27_adata.write_h5ad('pdo27_epi_adata_v4.h5ad')