# RPM Basal Organoid and Allograft Analysis

## Ireland et al 2024 BioRxiv

### Extended Data Fig 4

In [None]:
#Import other relevant packages
import numpy as np
import pandas as pd
from matplotlib import rcParams
import os
import scanpy as sc

import matplotlib as mpl
import matplotlib.pyplot as plt

#For nice color schemes
import cmocean

#For barplots
import seaborn as sns

#Import scVI
import scvi
from scvi.model.utils import mde

scvi.settings.verbosity = 40

#Import scVI
import scvi
from scvi.model.utils import mde

scvi.settings.verbosity = 40

In [None]:
# Configure scanpy logging and the default figure appearance for this notebook
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Set the working directory; the relative data paths used below resolve against it
os.chdir('/work/asi16')

## 1. Read in all in vitro RPM organoid data

In [None]:
# Read in RPM TBOs in vitro + Cre 10X matrices aligned to CellTag/GFP.CDS genome + Luciferase/Venus
# Genes are indexed by symbol; cache=True lets scanpy reuse a cached copy on later reads
rpm_org_cre=sc.read_10x_mtx('042024_custom_count_RPMTBO_CMV_Pool1/outs/per_sample_outs/RPM/count/sample_filtered_feature_bc_matrix/', var_names='gene_symbols', cache=True)


In [None]:
# Annotate every cell of the Cre-treated RPM organoid sample with its sample-level labels
rpm_org_cre_labels = {
    'Genotype': 'RPM',
    'Model': 'Organoid',
    'Cre': 'Cre',
    'UnID': 'RPM_Org_Cre',
    'Batch': 'Org_Cre',
}
for obs_key, obs_value in rpm_org_cre_labels.items():
    rpm_org_cre.obs[obs_key] = obs_value


In [None]:
# Display the AnnData summary to check dimensions and the metadata columns just added
rpm_org_cre

In [None]:
# Read in the pooled no-Cre organoid 10X matrix (not CellPlexed), indexed by gene symbol
orgs_nocre=sc.read_10x_mtx('TBO_Pool_NoCre_NotCellPlexed/042024_custom_count_TBOpoolNoCellPlex/outs/filtered_feature_bc_matrix/', var_names='gene_symbols', cache=True)


In [None]:
# Subsample because this sample has >20k cells; keep 3,000 cells.
# copy=True returns a new AnnData and leaves orgs_nocre itself unchanged.
orgs_nocre_subset=sc.pp.subsample(orgs_nocre, n_obs=3000, copy=True)

In [None]:
# Annotate every cell of the subsampled no-Cre organoid sample with its sample-level labels
orgs_nocre_labels = {
    'Genotype': 'WT',
    'Model': 'Organoid',
    'Cre': 'No_Cre',
    'UnID': 'WT_Org_NoCre',
    'Batch': 'Org_No_Cre',
}
for obs_key, obs_value in orgs_nocre_labels.items():
    orgs_nocre_subset.obs[obs_key] = obs_value

## 2. Concatenate WT and transformed organoid datasets

In [None]:
# Combine the Cre-treated RPM organoids with the subsampled no-Cre organoids.
# join="outer" keeps the union of genes across samples; index_unique=None leaves
# the cell barcodes unsuffixed.
# NOTE(review): AnnData.concatenate is deprecated in recent anndata releases in
# favour of anndata.concat — confirm against the installed version.
orgs_all= rpm_org_cre.concatenate([orgs_nocre_subset], index_unique=None, join="outer")

## 3. Perform QC

In [None]:
# QC step 1: drop barcodes with fewer than 200 detected genes.
# No gene-level filtering is applied here.
sc.pp.filter_cells(orgs_all, min_genes=200)

# QC step 2: flag mitochondrial genes (symbols starting with 'mt-') in var['mito']
# and compute per-cell QC metrics once, on the retained cells. This adds
# n_genes_by_counts, total_counts and pct_counts_mito to .obs, which the plots
# and threshold filters below rely on.
orgs_all.var['mito'] = orgs_all.var_names.str.startswith('mt-')
sc.pp.calculate_qc_metrics(orgs_all, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Inspect QC distributions to choose filtering thresholds:
# violins of genes/cell, counts/cell and % mitochondrial counts, then pairwise scatters
sc.pl.violin(orgs_all, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito'],
             jitter=0.4, multi_panel=True)
sc.pl.scatter(orgs_all, x='total_counts', y='pct_counts_mito')
sc.pl.scatter(orgs_all, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells passing all QC thresholds: 500 < genes detected < 7000,
# total counts > 2000 and < 15% mitochondrial counts.
qc_pass = (
    (orgs_all.obs["n_genes_by_counts"] > 500)
    & (orgs_all.obs["n_genes_by_counts"] < 7000)
    & (orgs_all.obs["total_counts"] > 2000)
    & (orgs_all.obs["pct_counts_mito"] < 15)
)
# Slicing an AnnData yields a view; copy so that later writes to .obs/.layers
# act on a real object instead of triggering an implicit copy of the view.
orgs_all = orgs_all[qc_pass, :].copy()

In [None]:
# Number of cells per sample (UnID) remaining after QC filtering
orgs_all.obs.groupby(["UnID"]).apply(len)

In [None]:
# Prep for HVG selection and scVI.
# Store the log1p of each cell's total counts in .obs (used in the QC plots below)
orgs_all.obs["log1p_total_counts"] = np.log1p(orgs_all.obs["total_counts"])

# 'counts' layer: untouched copy of X for HVG selection and scVI
orgs_all.layers["counts"] = orgs_all.X.copy()

# 'norm' layer: copy of X scaled to 10,000 total counts per cell (not log-transformed);
# used as the expression source for plotting
orgs_all.layers["norm"] = orgs_all.X.copy()
sc.pp.normalize_total(orgs_all, target_sum=1e4, layer="norm")

In [None]:
# Flag the top 10,000 highly variable genes with scanpy (seurat_v3 flavor),
# computed from the 'counts' layer with 'Batch' as the batch key.
# subset=False only annotates .var; no genes are removed from the object.
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    orgs_all,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)


## 4. Set up and train model (scvi)

In [None]:
# Register the object with scVI: expression comes from the 'counts' layer,
# 'Batch' is the batch key and pct_counts_mito is a continuous covariate
scvi.model.SCVI.setup_anndata(
    orgs_all,
    layer="counts",
    batch_key="Batch",
    continuous_covariate_keys=["pct_counts_mito"]
)


In [None]:
# Build an scVI model on the registered AnnData and train it with default settings
model = scvi.model.SCVI(orgs_all)
model.train()

In [None]:
# Extract the trained model's latent representation and store it in .obsm,
# where the neighbor graph / UMAP step reads it
latent = model.get_latent_representation()
orgs_all.obsm["X_scVI_1.1"] = latent

## 5. Perform leiden clustering

In [None]:
# Build the neighbor graph on the scVI latent space, then embed with UMAP
sc.pp.neighbors(orgs_all, use_rep="X_scVI_1.1")
sc.tl.umap(orgs_all, min_dist=0.5)

# Leiden clustering on that neighbor graph; labels are stored in obs['leiden_scVI_1.1']
sc.tl.leiden(orgs_all, key_added="leiden_scVI_1.1", resolution=0.5)

## 6. Visualize clustering

In [None]:
# UMAPs coloured by Cre status, Leiden cluster and sample ID (one figure each)
fig, ax = plt.subplots(figsize=(8, 6))
sc.pl.umap(orgs_all, color="Cre", cmap="cmo.matter", s=30, ax=ax, vmax="p99.99", frameon=False, save=False)
fig, ax = plt.subplots(figsize=(8, 6))
sc.pl.umap(orgs_all, color="leiden_scVI_1.1", legend_loc="on data", legend_fontsize='large',ax=ax, s=30, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(8, 6))
sc.pl.umap(orgs_all, color="UnID", legend_loc="right margin", ax=ax, s=30, frameon=False, save=False)

# Additional per-cluster QC plots (boxen plots); 'cluster' is a copy of the Leiden labels
orgs_all.obs['cluster'] = orgs_all.obs["leiden_scVI_1.1"].copy()

# log1p total counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=orgs_all.obs, x="cluster", y="log1p_total_counts", ax=ax)

# Percent mitochondrial counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=orgs_all.obs, x="cluster", y="pct_counts_mito", ax=ax)


In [None]:
# Exclude cluster 7 (judged to be doublets)
bad_clust=['7']

# Boolean mask of cells that are NOT in an excluded cluster
to_keep=(~orgs_all.obs['leiden_scVI_1.1'].isin(bad_clust))

# Copy the retained cells into a new, independent AnnData object
orgs_all2 = orgs_all[to_keep].copy()

### From here, continue iterating through runs of scvi modeling until no clear low quality cell clusters or non-tumor cells are observed.
### Start back up at "set up and train scvi model" and run through subsetting out "bad clusters".
### Each time clusters are removed, model is run again to recluster.

## ITERATION 2 (Final iteration)

In [None]:
# Re-flag the top 10,000 highly variable genes on the cleaned object
# (seurat_v3 flavor, 'counts' layer, 'Batch' as batch key; .var is annotated only).
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    orgs_all2,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)

In [None]:
# Register the cleaned object with scVI ('counts' layer, 'Batch' batch key,
# pct_counts_mito as continuous covariate), then build and train a fresh model
scvi.model.SCVI.setup_anndata(
    orgs_all2,
    layer="counts",
    batch_key='Batch',
    continuous_covariate_keys=["pct_counts_mito"]
)
model = scvi.model.SCVI(orgs_all2)
model.train()

In [None]:
# Store the retrained model's latent representation under a new .obsm key
latent = model.get_latent_representation()
orgs_all2.obsm["X_scVI_1.2"] = latent

In [None]:
# Build the neighbor graph on the iteration-2 scVI latent space, then embed with UMAP
sc.pp.neighbors(orgs_all2, use_rep="X_scVI_1.2")
sc.tl.umap(orgs_all2, min_dist=0.5)

# Leiden clustering on that neighbor graph; labels are stored in obs['leiden_scVI_1.2']
sc.tl.leiden(orgs_all2, key_added="leiden_scVI_1.2", resolution=0.5)



In [None]:
# Visualize (Extended Data Fig 4a): UMAPs coloured by Leiden cluster,
# Cre status (fixed two-colour palette) and sample ID

fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(orgs_all2, color="leiden_scVI_1.2", legend_loc="right margin", legend_fontsize='xx-large',ax=ax, s=60, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(orgs_all2, color="Cre", legend_loc="right margin", ax=ax, s=60, frameon=False, save=False, palette=['purple','orange'])
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(orgs_all2, color="UnID", legend_loc="right margin", ax=ax, s=30, frameon=False, save=False)

In [None]:
# Number of cells per sample (UnID) after removing the excluded cluster
orgs_all2.obs.groupby(["UnID"]).apply(len)

In [None]:
# Per-cluster QC plots: 'cluster' is a copy of the iteration-2 Leiden labels
orgs_all2.obs['cluster'] = orgs_all2.obs["leiden_scVI_1.2"].copy()

# One wide boxen plot per QC metric: log1p total counts, then % mitochondrial counts
for qc_metric in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(20,6))
    sns.boxenplot(data=orgs_all2.obs, x="cluster", y=qc_metric, ax=ax)

In [None]:
# Dot plot of key cell type markers (Extended Data Fig. 4b)
# Expression is read from the 'norm' layer and grouped by the iteration-2 Leiden clusters.
# Note: 'Sox2' is listed twice (basal block and stem-like block).

more_types=[ "Pdpn","Cav1","Cav2","Hopx","Timp3","Sema3f","Serpine1", #AT1
              "Abca3","Muc1","Sftpa1","Sftpb","Sftpd","Scd1", #AT2
              "Scgb1a1","Cyp2f2","Scgb3a2", "Scgb3a1","Lypd2",#Club
              "Muc5ac","Muc5b", "Spdef",# Goblet
              "Tubb4a","Foxa3","Foxj1","Rfx2","Rfx3","Trp73", #Ciliated
              'Krt5', 'Krt17','Krt15','Krt8','Trp63','Sox2','Id1','Epas1','Aqp3',
            'Sfn','Perp','Fxyd3','Sdc1','Gstm2','F3','Adh7', # Basal
              'Ascl1','Bex1','Insm1','Chga','Chgb','Myt1','Sez6','Foxa2','Sox11','Syp',#NE/neuronal
            "Neurod1","Nhlh1","Nhlh2",
              'Pou2f3','Trpm5','Ascl2',
              'Lrmp','Gng13','Ltc4s','Alox5ap','Avil','Alox5','Atp2a3', #tuft
              "Cftr",'Foxi1', "Ascl3", 'Stap1','Atp6v1c2','Pparg','Rasd1', #ionocyte
              'Yap1','Wwtr1','Sox2','Cd44','Hes1', # Stem-like
             "Top2a","Mki67", "Ube2c","Aspm",
            'Myc',"fLuc"] # Tumor markers

# Larger font for this figure; this changes scanpy's global figure settings
sc.set_figure_params(scanpy=True, fontsize=20) 
# NOTE(review): dendrogram=True is requested without an explicit sc.tl.dendrogram call
# in this notebook; scanpy presumably computes one with default settings — confirm.
sc.pl.dotplot(
    orgs_all2,figsize=[28,3],
    var_names=more_types,
    groupby='leiden_scVI_1.2',
    use_raw=False,
    layer="norm",show=False,
    color_map="cmo.dense", var_group_rotation=35,
    save=False, dendrogram=True) #var_group_labels=["AT1","AT2",'Club','Goblet','Ciliated','Basal','Neuroendocrine','Tuft','Ionocyte','Lung lineage','Stem-like','Tumor markers','CellTag'], 
    #var_group_positions=[(0,6),(7,12),(13,17),(18,19),(20,25),(26,41),(42,50),(51,62),(63,69),(70,72),(73,77),(78,81),(82,83)]

In [None]:
# Save the cleaned organoid-only object (written relative to the working directory)
orgs_all2.write_h5ad("061824_RPMTBO_OrgsOnly_Fig3c.h5ad")

In [None]:
# Reload the saved organoid-only object (entry point for resuming without re-training)
orgs_all2 = sc.read_h5ad("061824_RPMTBO_OrgsOnly_Fig3c.h5ad")

In [None]:
# Generate signatures from Extended Data Fig. 4a data.
# normalize_total and log1p overwrite orgs_all2.X in place; the raw counts
# remain available in layers["counts"].
# NOTE(review): re-running this cell would normalise/log-transform X a second time —
# run it once per load of the object.
sc.pp.normalize_total(orgs_all2)
sc.pp.log1p(orgs_all2)
# NOTE(review): the following cell calls rank_genes_groups again with method='wilcoxon',
# which appears to replace this t-test result in .uns — confirm this call is still needed.
sc.tl.rank_genes_groups(orgs_all2, 'leiden_scVI_1.2', method='t-test')


In [None]:
# Rank genes per Leiden cluster (Wilcoxon), keeping the top 500 per cluster
sc.tl.rank_genes_groups(orgs_all2,'leiden_scVI_1.2', method='wilcoxon', n_genes=500)

result = orgs_all2.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Build one '<cluster>_n' (gene names) and one '<cluster>_p' (p-values) column per cluster
marker_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        marker_columns[group + '_' + key[:1]] = result[key][group]
markergenes = pd.DataFrame(marker_columns)


In [None]:
# Export the organoid cluster marker table to CSV (absolute path, independent of the working directory)
markergenes.to_csv('/hpc/home/asi16/RPM_WTvsCre_Leiden_scRNAseq_100724.csv' )

# New analysis to include organoids and allograft tumor: RPM

## Fig 2 and Extended Data Fig 4

## 1. Read in RPM basal-organoid-derived allograft data

In [None]:
######### Now cluster in with RPM TBO Allograft tumor! ############

In [None]:
# 040824 Read in re-aligned RPM TBO Allo samples (two separate 10X outputs), indexed by gene symbol.
# RPM_allo is labelled 'RPM_Allo_New' and RPM_allo2 'RPM_Allo_Old' in the metadata step below.
RPM_allo=sc.read_10x_mtx('MedGenome_FASTQ_123123/RPM_TBO_Allo/042024_RPM_TBO_Allo_CustomCount/outs/filtered_feature_bc_matrix', var_names='gene_symbols', cache=True)
RPM_allo2=sc.read_10x_mtx('02_2024_TBO_Analyses/Old_Xeno/042024_custom_RPMTBO_FXeno_Old/outs/filtered_feature_bc_matrix', var_names='gene_symbols', cache=True)


In [None]:
# Both allograft samples share every sample-level label except 'Batch',
# which distinguishes the new sample from the old one
allo_shared_labels = {
    'Genotype': 'RPM',
    'Model': 'Allograft',
    'Cre': 'Cre',
    'UnID': 'RPM_Allo',
}
for allo_sample, batch_label in ((RPM_allo, 'RPM_Allo_New'), (RPM_allo2, 'RPM_Allo_Old')):
    for obs_key, obs_value in allo_shared_labels.items():
        allo_sample.obs[obs_key] = obs_value
    allo_sample.obs['Batch'] = batch_label


## 2. Concatenate allograft data with organoid data

In [None]:
# Combine the cleaned organoid cells with both allograft samples.
# Start from the organoids' stored raw counts rather than whatever is currently in X:
# the signature step above normalises and log-transforms orgs_all2.X in place, while the
# allograft matrices are raw counts. The steps that follow (QC metrics, the 'counts'
# layer, HVG selection and scVI) all read adata.X as counts, so the two sources must
# be on the same scale. If X was never modified, this is a no-op.
orgs_raw = orgs_all2.copy()
orgs_raw.X = orgs_raw.layers["counts"].copy()

# join="outer" keeps the union of genes; index_unique=None leaves barcodes unsuffixed
adata= orgs_raw.concatenate([RPM_allo, RPM_allo2], index_unique=None, join="outer")
del orgs_raw  # temporary copy is no longer needed

In [None]:
# Number of cells per Batch in the merged object
adata.obs.groupby(["Batch"]).apply(len)

In [None]:
# Number of cells per sample (UnID) in the merged object
adata.obs.groupby(["UnID"]).apply(len)

## 3. Perform QC

In [None]:
# QC metrics before any filtering:
# flag mitochondrial genes (symbols starting with 'mt-') in var['mito'], then compute per-cell metrics
adata.var['mito'] = adata.var_names.str.startswith('mt-')  # boolean flag stored under the key 'mito'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Drop barcodes with fewer than 200 detected genes
sc.pp.filter_cells(adata, min_genes=200)

# Recompute the mito flag and QC metrics on the retained cells
adata.var['mito'] = adata.var_names.str.startswith('mt-')  # boolean flag stored under the key 'mito'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mito'], percent_top=None, log1p=False, inplace=True)

In [None]:
# Inspect QC distributions to choose filtering thresholds:
# violins of genes/cell, counts/cell and % mitochondrial counts, then pairwise scatters
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mito'],
             jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

In [None]:
# Keep cells passing all QC thresholds: 500 < genes detected < 9000,
# total counts > 2000 and < 15% mitochondrial counts.
qc_pass = (
    (adata.obs["n_genes_by_counts"] > 500)
    & (adata.obs["n_genes_by_counts"] < 9000)
    & (adata.obs["total_counts"] > 2000)
    & (adata.obs["pct_counts_mito"] < 15)
)
# Slicing an AnnData yields a view; copy so that later writes to .obs/.layers
# act on a real object instead of triggering an implicit copy of the view.
adata = adata[qc_pass, :].copy()


In [None]:
# Prep for HVG selection and scVI.
# Store the log1p of each cell's total counts in .obs (used in QC plots); X itself is not transformed
adata.obs["log1p_total_counts"] = np.log1p(adata.obs["total_counts"])

In [None]:
# 'counts' layer: untouched copy of X for HVG selection and scVI
adata.layers["counts"] = adata.X.copy()

# 'norm' layer: copy of X scaled to 10,000 total counts per cell (not log-transformed);
# used as the expression source for plotting
adata.layers["norm"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4, layer="norm")

In [None]:
# Flag the top 10,000 highly variable genes (seurat_v3 flavor, 'counts' layer,
# 'Batch' as batch key). subset=False only annotates .var; no genes are removed.
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    adata,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)

## 4. Set up and train model (scvi)

In [None]:
# Register the merged object with scVI: expression comes from the 'counts' layer,
# 'Batch' is the batch key and pct_counts_mito is a continuous covariate
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    batch_key='Batch',
    continuous_covariate_keys=["pct_counts_mito"]
)


In [None]:
# Build an scVI model on the registered merged object (replaces the earlier `model`)
model = scvi.model.SCVI(adata)

In [None]:
# Train the model with default settings
model.train()

## 5. Perform leiden clustering

In [None]:
# Extract the trained model's latent representation and store it in .obsm
latent = model.get_latent_representation()

adata.obsm["X_scVI_1.1"] = latent

# Build the neighbor graph on the scVI latent space, then embed with UMAP
sc.pp.neighbors(adata, use_rep="X_scVI_1.1")
sc.tl.umap(adata, min_dist=0.5)

# Leiden clustering on that neighbor graph (resolution 1.0); labels go to obs['leiden_scVI_1.1']
sc.tl.leiden(adata, key_added="leiden_scVI_1.1", resolution=1.0)

## 6. Visualize data

In [None]:
# QC UMAPs: colour the embedding by each per-cell QC metric (2 panels per row)
sc.pl.umap(
    adata,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
# UMAPs coloured by genotype, Leiden cluster and sample ID (fixed palette per UnID)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata, color="Genotype", cmap="cmo.matter", s=10, ax=ax, vmax="p99.99", frameon=False, save=False)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata, color="leiden_scVI_1.1", legend_loc="on data", legend_fontsize='xx-large',ax=ax, s=10, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata, color="UnID", legend_loc="right margin", ax=ax, s=10, frameon=False, save=False,
           palette={'RPM_Org_Cre' :'purple','WT_Org_NoCre':'orange', 'RPM_Allo' :'teal'}, title='Condition')

# Additional per-cluster QC plots (boxen plots); 'cluster' is a copy of the Leiden labels
adata.obs['cluster'] = adata.obs["leiden_scVI_1.1"].copy()

# log1p total counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=adata.obs, x="cluster", y="log1p_total_counts", ax=ax)

# Percent mitochondrial counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=adata.obs, x="cluster", y="pct_counts_mito", ax=ax)

In [None]:
# Feature plots: non-tumor lineage markers used to spot stromal/immune/endothelial
# clusters, plus a few tumor fate markers and custom-reference features
more_types=["Col14a1", "Acta2","Myh11","Tagln","Mustn1", #fibroblast
              "Lpl","Lipa","Pparg","Plin2","Ear1","Fabp1","Spp1", #lipofibroblast/osteoblastic
              "Ptprc","Mertk","Marco","Mrc1","Ly75","Adgre1",
            "Itgax","Cd68","Csf1r","Mafb","Msr1","Arg1","Adgre4","Clec4a1", #Macs/Myeloid
              "Cx3cr1","Itgam","Cd14", #Monocytes
              "S100a9","S100a8","Mmp9","Csf3r","Cxcr2","Ly6g", #Neuts
              "Batf3","Xcr1","Clec9a","Ccl17","Ccl22", #DC
              "Cd3d","Cd3g","Cd28","Cd8a","Cd4","Foxp3", # Tcell
              "Gzma","Ncr1","Gzmb", #NK
              "Fcmr","Fcer2a","Pax5","Cd22","Cd79b","Cd79a", #B cells
              "Slamf7", "Prdm1", #Plasma
              "Mcam","Pecam1","Icam2","Cd36","Cd93", "Ascl1","Neurod1","Pou2f3", "fLuc", "GFP.CDS","CellTag.UTR"] #Endothelial (first five), then Ascl1/Neurod1/Pou2f3 and the fLuc/GFP.CDS/CellTag.UTR features

# One UMAP panel per gene (4 per row), expression read from the 'norm' layer,
# colour scale capped at the 99.5th percentile
sc.pl.umap(
    adata,
    color=more_types,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.dense",
    ncols=4,s=30,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
# Find marker genes for each Leiden cluster (Wilcoxon on the 'norm' layer) to aid filtering,
# then show the top 50 gene names per cluster
sc.tl.rank_genes_groups(adata, 'leiden_scVI_1.1', method='wilcoxon', layer='norm', use_raw=False)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(50)

In [None]:
# Identify and remove low-quality clusters, doublets and obvious non-tumor clusters (Ptprc+)
# clusters 14, 21, 20 = immune
# cluster 18 = stroma, cluster 19 = endothelial

bad_clust=['14','21','20','18','19']

# Boolean mask of cells that are NOT in an excluded cluster
to_keep=(~adata.obs['leiden_scVI_1.1'].isin(bad_clust))

# Copy the retained cells into a new, independent AnnData object
adata_2 = adata[to_keep].copy()

### From here, continue iterating through runs of scvi modeling until no clear low quality cell clusters or non-tumor cells are observed.
### Start back up at "set up and train scvi model" and run through subsetting out "bad clusters".
### Each time clusters are removed, model is run again to recluster.

## ITERATION 2 (Final iteration for Fig. 2d)

In [None]:
# Re-flag the top 10,000 highly variable genes on the cleaned object
# (seurat_v3 flavor, 'counts' layer, 'Batch' as batch key; .var is annotated only).
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    adata_2,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)


In [None]:

# Register the cleaned object with scVI ('counts' layer, 'Batch' batch key,
# pct_counts_mito as continuous covariate), then build and train a fresh model
scvi.model.SCVI.setup_anndata(
    adata_2,
    layer="counts",
    batch_key='Batch',
    continuous_covariate_keys=["pct_counts_mito"]
)
model = scvi.model.SCVI(adata_2)
model.train()

In [None]:
# Extract the retrained model's latent representation and store it under a new .obsm key
latent = model.get_latent_representation()

adata_2.obsm["X_scVI_1.2"] = latent

In [None]:
# Build the neighbor graph on the iteration-2 scVI latent space, then embed with UMAP
sc.pp.neighbors(adata_2, use_rep="X_scVI_1.2")
sc.tl.umap(adata_2, min_dist=0.5)

# Leiden clustering on that neighbor graph (resolution 0.75); labels go to obs['leiden_scVI_1.2']
sc.tl.leiden(adata_2, key_added="leiden_scVI_1.2", resolution=0.75)

In [None]:
# QC UMAPs: colour the embedding by each per-cell QC metric (2 panels per row)
sc.pl.umap(
    adata_2,
    color=["n_genes_by_counts", "total_counts", "pct_counts_mito", "log1p_total_counts"],
    cmap="cubehelix_r",
    s=3,
    ncols=2,
)

In [None]:
# UMAPs coloured by genotype, Leiden cluster and sample ID (fixed palette per UnID)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata_2, color="Genotype", cmap="cmo.matter", s=10, ax=ax, vmax="p99.99", frameon=False, save=False)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata_2, color="leiden_scVI_1.2", legend_loc="on data", legend_fontsize='xx-large',ax=ax, s=10, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(6, 6))
sc.pl.umap(adata_2, color="UnID", legend_loc="right margin", ax=ax, s=10, frameon=False, save=False,
           palette={'RPM_Org_Cre' :'purple','WT_Org_NoCre':'orange', 'RPM_Allo' :'teal'}, title='Condition')

# Additional per-cluster QC plots (boxen plots); 'cluster' is a copy of the Leiden labels
adata_2.obs['cluster'] = adata_2.obs["leiden_scVI_1.2"].copy()

# log1p total counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=adata_2.obs, x="cluster", y="log1p_total_counts", ax=ax)

# Percent mitochondrial counts per cluster
fig, ax = plt.subplots(figsize=(20,6))
sns.boxenplot(data=adata_2.obs, x="cluster", y="pct_counts_mito", ax=ax)

In [None]:
# Feature plots on the cleaned object: same non-tumor lineage marker panel as before,
# to check that no stromal/immune/endothelial clusters remain
more_types=["Col14a1", "Acta2","Myh11","Tagln","Mustn1", #fibroblast
              "Lpl","Lipa","Pparg","Plin2","Ear1","Fabp1","Spp1", #lipofibroblast/osteoblastic
              "Ptprc","Mertk","Marco","Mrc1","Ly75","Adgre1",
            "Itgax","Cd68","Csf1r","Mafb","Msr1","Arg1","Adgre4","Clec4a1", #Macs/Myeloid
              "Cx3cr1","Itgam","Cd14", #Monocytes
              "S100a9","S100a8","Mmp9","Csf3r","Cxcr2","Ly6g", #Neuts
              "Batf3","Xcr1","Clec9a","Ccl17","Ccl22", #DC
              "Cd3d","Cd3g","Cd28","Cd8a","Cd4","Foxp3", # Tcell
              "Gzma","Ncr1","Gzmb", #NK
              "Fcmr","Fcer2a","Pax5","Cd22","Cd79b","Cd79a", #B cells
              "Slamf7", "Prdm1", #Plasma
              "Mcam","Pecam1","Icam2","Cd36","Cd93", "Ascl1","Neurod1","Pou2f3", "fLuc", "GFP.CDS","CellTag.UTR"] #Endothelial (first five), then Ascl1/Neurod1/Pou2f3 and the fLuc/GFP.CDS/CellTag.UTR features

# One UMAP panel per gene (4 per row), expression read from the 'norm' layer,
# colour scale capped at the 99.5th percentile
sc.pl.umap(
    adata_2,
    color=more_types,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.dense",
    ncols=4,s=30,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
# De-duplicate cell barcodes: the samples were concatenated with index_unique=None,
# so identical barcodes from different samples were not suffixed
adata_2.obs_names_make_unique()

In [None]:
# Export the cleaned organoid + allograft object; it is imported into R/Seurat for signature analysis
adata_2.write_h5ad("090624_RPM_WT_Org_Allo_Fig2d.h5ad")

In [None]:
# Reload the saved organoid + allograft object (entry point for resuming without re-training)
adata_2=sc.read_h5ad("090624_RPM_WT_Org_Allo_Fig2d.h5ad")

# New analysis of just RPM basal-derived allograft (tumor cells only)

## Fig 2e and Extended Data Fig. 4e

In [None]:
# Subset to the RPM allograft cells only, for downstream analysis and Leiden clustering

keep=['RPM_Allo']

# Boolean mask of cells whose sample ID (UnID) is in the keep list
to_keep=(adata_2.obs['UnID'].isin(keep))

# Copy the selected cells into a new, independent AnnData object
adata_3 = adata_2[to_keep].copy()


In [None]:
# Number of allograft cells per Batch
adata_3.obs.groupby(["Batch"]).apply(len)

In [None]:
# Flag the top 10,000 highly variable genes within the allograft-only object
# (seurat_v3 flavor, 'counts' layer, 'Batch' as batch key; .var is annotated only).
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    adata_3,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)


## 1. Set up and train model (scvi)

In [None]:
# Register the allograft-only object with scVI ('counts' layer, 'Batch' batch key,
# pct_counts_mito as continuous covariate), then build and train a fresh model
scvi.model.SCVI.setup_anndata(
    adata_3,
    layer="counts",
    batch_key='Batch',
    continuous_covariate_keys=["pct_counts_mito"]
)
model = scvi.model.SCVI(adata_3)
model.train()


In [None]:
# Extract the trained model's latent representation and store it under a new .obsm key
latent = model.get_latent_representation()

adata_3.obsm["X_scVI_1.3"] = latent


## 2. Perform leiden clustering

In [None]:
# Build the neighbor graph on the allograft-only scVI latent space, then embed with UMAP
sc.pp.neighbors(adata_3, use_rep="X_scVI_1.3")
sc.tl.umap(adata_3, min_dist=0.5)

# Leiden clustering on that neighbor graph (resolution 0.5); labels go to obs['leiden_scVI_1.3']
sc.tl.leiden(adata_3, key_added="leiden_scVI_1.3", resolution=0.5)

In [None]:
# UMAPs coloured by Leiden cluster and by Batch (titled 'Sample ID')
fig, ax = plt.subplots(figsize=(8, 6))
sc.pl.umap(adata_3, color="leiden_scVI_1.3", legend_loc="right margin", legend_fontsize="large",ax=ax, s=40, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(8, 8))
sc.pl.umap(adata_3, color="Batch", legend_loc="right margin", ax=ax, s=30, frameon=False, save=False, title='Sample ID')


In [None]:
# Save the allograft-only object (first clustering iteration)
adata_3.write_h5ad("092424_RPM_AlloOnly_Fig2.h5ad")

In [None]:
# Reload the saved allograft-only object (entry point for resuming without re-training)
adata_3=sc.read_h5ad("092424_RPM_AlloOnly_Fig2.h5ad")

In [None]:
# Per-cluster QC plots: 'cluster' is a copy of the allograft-only Leiden labels
adata_3.obs['cluster'] = adata_3.obs["leiden_scVI_1.3"].copy()

# One wide boxen plot per QC metric: log1p total counts, then % mitochondrial counts
for qc_metric in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(20,6))
    sns.boxenplot(data=adata_3.obs, x="cluster", y=qc_metric, ax=ax)

In [None]:
# Remove clusters 4 and 9 (judged low quality)

bad_clust=['4','9']

# Boolean mask of cells that are NOT in an excluded cluster
to_keep=(~adata_3.obs['leiden_scVI_1.3'].isin(bad_clust))

# Copy the retained cells into a new, independent AnnData object
adata_4 = adata_3[to_keep].copy()


## ITERATION 2 (Final iteration for Fig 2e, RPM allograft tumor cells only)

In [None]:
# Re-flag the top 10,000 highly variable genes on the cleaned allograft object
# (seurat_v3 flavor, 'counts' layer, 'Batch' as batch key; .var is annotated only).
# Note: running with a batch_key group that has few cells can raise a
# b'reciprocal condition number' error.
sc.pp.highly_variable_genes(
    adata_4,n_top_genes=10000,
    subset=False,
    layer="counts",
    flavor="seurat_v3",
    batch_key="Batch"
)

In [None]:
# Register the cleaned allograft object with scVI: expression comes from the 'counts' layer,
# 'Batch' is the batch key and pct_counts_mito is a continuous covariate
scvi.model.SCVI.setup_anndata(
    adata_4,
    layer="counts",
    batch_key='Batch',
    continuous_covariate_keys=["pct_counts_mito"]
)

In [None]:
# Build a fresh scVI model on the cleaned allograft object and train it with default settings
model = scvi.model.SCVI(adata_4)
model.train()

In [None]:
# Extract the retrained model's latent representation and store it under a new .obsm key
latent = model.get_latent_representation()

adata_4.obsm["X_scVI_1.4"] = latent


In [None]:
# Build the neighbor graph on the final scVI latent space, then embed with UMAP
# (min_dist=0.75 here, versus 0.5 in the earlier iterations)
sc.pp.neighbors(adata_4, use_rep="X_scVI_1.4")
sc.tl.umap(adata_4, min_dist=.75)

# Leiden clustering on that neighbor graph (resolution 0.5); labels go to obs['leiden_scVI_1.4']
sc.tl.leiden(adata_4, key_added="leiden_scVI_1.4", resolution=0.5)

In [None]:
# UMAPs coloured by final Leiden cluster and by Batch (titled 'Sample ID')
fig, ax = plt.subplots(figsize=(8, 6))
sc.pl.umap(adata_4, color="leiden_scVI_1.4", legend_loc="right margin", legend_fontsize="large",ax=ax, s=40, frameon=False, save=False)
fig, ax = plt.subplots(figsize=(8, 8))
sc.pl.umap(adata_4, color="Batch", legend_loc="right margin", ax=ax, s=30, frameon=False, save=False, title='Sample ID')


In [None]:
# Number of cells in each final Leiden cluster
adata_4.obs.groupby(["leiden_scVI_1.4"]).apply(len)

In [None]:
# Save the final allograft-only object; it is exported for use in R/Seurat (signature analysis, etc.)
adata_4.write_h5ad("092824_RPM_Allo_Only_StringentQC_new.h5ad")

In [None]:
# Reload the saved final allograft-only object (entry point for resuming without re-training)
adata_4=sc.read_h5ad("092824_RPM_Allo_Only_StringentQC_new.h5ad")

In [None]:
# Per-cluster QC plots: 'cluster' is a copy of the final Leiden labels
adata_4.obs['cluster'] = adata_4.obs["leiden_scVI_1.4"].copy()

# One wide boxen plot per QC metric: log1p total counts, then % mitochondrial counts
for qc_metric in ("log1p_total_counts", "pct_counts_mito"):
    fig, ax = plt.subplots(figsize=(20,6))
    sns.boxenplot(data=adata_4.obs, x="cluster", y=qc_metric, ax=ax)

In [None]:
# Feature plots of tumor cell-state markers on the final allograft UMAP
more_types=["Ascl1", "Syp","Chga","Insm1", #NE
              "Neurod1","Nhlh1","Nhlh2","Neurod2", #neuronal
              "Pou2f3","Ascl2","Avil","Gng13", #tuft
           "Trp63","Krt5","Krt15","Krt17",#basal
           "Yap1","Vim","Cd44","Wwtr1"] #Mesenchymal/Yap

# One UMAP panel per gene (4 per row), expression read from the 'norm' layer,
# colour scale capped at the 99.5th percentile
sc.pl.umap(
    adata_4,
    color=more_types,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.dense",
    ncols=4,s=50,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
# Dot plot for Extended Data Fig. 4e: cell-state marker panel grouped by final Leiden cluster,
# with expression read from the 'norm' layer
more_types=["Trp63","Krt5","Krt15","Krt17",#basal
            "Ascl1", "Syp","Chga","Insm1","Chgb","Myt1","Sez6","Foxa2","Mycl", #NE
              "Neurod1","Nhlh1","Nhlh2","Neurod2", #neuronal
            'Pou2f3','Trpm5','Ascl2','Lrmp','Gng13','Avil','Alox5','Atp2a3', #tuft
              "Cftr",'Foxi1', "Ascl3", 'Stap1','Pparg', #ionocyte
              'Yap1','Wwtr1','Sox2','Cd44','Hes1',"Vim", # Stem-like/#Mesenchymal/Yap
             "Top2a","Mki67", "Ube2c","Aspm", #Proliferation
            'Myc',"fLuc"] #tumor markers

# Font size for this figure; this changes scanpy's global figure settings
sc.set_figure_params(scanpy=True, fontsize=17) 
sc.pl.dotplot(
    adata_4,figsize=[15,2.5],
    var_names=more_types,
    groupby='leiden_scVI_1.4',
    use_raw=False,
    layer="norm",show=False,
    color_map="cmo.dense", var_group_rotation=35,smallest_dot=10,
    save=False) 

In [None]:
# Feature plots for five key markers on the final allograft UMAP

more_types=["Ascl1","Neurod1","Pou2f3","Yap1","Trp63"]

# Set the default matplotlib figure size (applies globally from here on)
FIGSIZE = (4, 3)
rcParams["figure.figsize"] = FIGSIZE

# One UMAP panel per gene (3 per row), expression read from the 'norm' layer,
# colour scale capped at the 99.5th percentile
sc.pl.umap(
    adata_4,
    color=more_types,sort_order=True,
    use_raw=False,
    legend_loc= "on data",
    color_map="cmo.dense",
    ncols=3,s=40,
    frameon=False,
    vmax="p99.5",
    layer="norm",
    save=False
)

In [None]:
# Generate per-cluster signatures from these data for the supplemental table.
# X is normalised and log1p-transformed in place before testing.
sc.pp.normalize_total(adata_4)
sc.pp.log1p(adata_4)

# Rank genes per Leiden cluster (Wilcoxon), keeping the top 500 per cluster
sc.tl.rank_genes_groups(adata_4,'leiden_scVI_1.4', method='wilcoxon', n_genes=500)

result = adata_4.uns['rank_genes_groups']
groups = result['names'].dtype.names

# Build one '<cluster>_n' (gene names) and one '<cluster>_p' (p-values) column per cluster
marker_columns = {}
for group in groups:
    for key in ('names', 'pvals'):
        marker_columns[group + '_' + key[:1]] = result[key][group]
markergenes = pd.DataFrame(marker_columns)

# Export the marker table to CSV (absolute path, independent of the working directory)
markergenes.to_csv('/hpc/home/asi16/RPM_AlloONLY_Leiden_scRNAseq_100724_2.csv' )