### Load and plot resident cell types

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import scvi
import seaborn as sns
import os,sys
import anndata
import scvi
import anndata as ad

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show all DataFrame columns and up to 100 rows. The full option name is spelled
# out: pandas resolves abbreviated names only while they match a single option.
pd.set_option('display.max_columns', None)
pd.options.display.max_rows = 100

from datetime import date

# ISO-formatted run date, used below to stamp output file names.
today = str(date.today())
sc.logging.print_header()

In [None]:
import matplotlib.pyplot as plt
from matplotlib import font_manager
from matplotlib import rcParams

# Register the Arial font file with matplotlib and print the path it resolves to.
font_manager.fontManager.addfont("...software/Arial.ttf")
print(font_manager.findfont("Arial"))

# Use Arial for sans-serif text and font type 42 for PDF output.
rcParams.update({"font.sans-serif": ["Arial"], "pdf.fonttype": 42})

# Scanpy-wide figure defaults: on-screen/saved resolution, colour map, PDF output.
sc.settings.set_figure_params(
    dpi=150,
    color_map='RdPu',
    dpi_save=300,
    vector_friendly=True,
    format='pdf',
)

In [None]:
# Output locations used by this notebook (all three end with a path separator).
model_folder = "...Figure4/scvi_models/"
figures = "...Figure_4_nat/figures/"
results = "...Figure_4_nat/"

In [None]:
# Folder with the cleaned AnnData objects; note there is no trailing separator,
# so paths below are built as f"{clean_folder}/...".
clean_folder = "...Figure_1_nat/clean"

In [None]:
# Marker genes plotted on the UMAP to orient the major populations.
# Trailing comments are the author's own group labels for that line.
major_mrkrs = [
    "PTPRC", "CD3G",
    "HOXA9",
    "HES1",  # DN
    "RORC",  # DP
    "CCR9",
    "PDCD1",  # CD8aa
    "CD8A", "CD8B",
    "CD4", "CD40LG",
    "CRTAM", "ANXA1",  # memory
    "FOXP3", "DHRS3",
    "TRDC",  # gamma-delta
    "EOMES", "KLRD1",  # NK cells
    "TNFSF11",  # ILC
    "CD19", "VPREB1",  # pre-pro B cells
    "IGHD", "IGHA1", "IGHG1",
    "CLEC9A", "CLEC10A",
    "LAMP3",
    "LILRA4", "S100A9", "C1QA", "TPSB2", "ITGA2B", "GYPA",
    "PDGFRA", "COLEC11", "FBN1",
    "RGS5", "CDH5",
    "LYVE1", "EPCAM", "PSMB11", "DLK2", "KRT14",
    "CCL19", "AIRE", "KRT1", "POU2F3",
    "MYOD1", "NEUROG1",
]

In [None]:
def run_scvi2(adata, batch_hv="age_group", batch_scvi="sample",
              cat_cov_scvi=("DonorID", "10X_version", "Sex", "Age_group"),
              cont_cov_scvi=("percent_mito",),
              include_genes=(), exclude_cc_genes=True, exclude_mt_genes=True,
              exclude_vdjgenes=True, remove_cite=False, hvg=3000, max_epochs=350,
              vae_name="", **kwargs):
    """Train an scVI model on a filtered gene set and return an embedded copy of `adata`.

    Steps: build a counts-only AnnData from ``adata.layers['counts']``, drop the
    excluded gene families, select highly variable genes, train scVI, compute
    neighbours / Leiden clusters / UMAP on the latent space, then copy those
    results onto a copy of `adata` whose ``X`` is normalised and log1p-transformed.

    Parameters
    ----------
    adata
        AnnData with a ``'counts'`` layer.
    batch_hv
        ``obs`` column passed as ``batch_key`` to highly-variable-gene selection.
    batch_scvi
        ``obs`` column passed as ``batch_key`` to ``SCVI.setup_anndata``.
    cat_cov_scvi, cont_cov_scvi
        Categorical / continuous covariate keys for ``setup_anndata``. ``None``
        is passed through unchanged (no covariates of that kind).
    include_genes
        Genes added to the highly variable set. They must survive the exclusion
        filters, otherwise the subsetting step raises.
    exclude_cc_genes, exclude_mt_genes, exclude_vdjgenes
        Drop cell-cycle genes (read from a fixed file), genes starting with
        ``MT-``, and genes matching ``^TR[AB][VDJ]|^IG[HKL][VDJC]``.
    remove_cite
        Drop genes flagged in ``var['cite']`` before anything else.
    hvg
        Number of highly variable genes to select.
    max_epochs
        Forwarded to ``vae.train``. Because it is a named parameter it never
        appears in ``**kwargs``, so it has to be passed explicitly.
    vae_name
        Unused; kept so existing calls keep working.
    **kwargs
        Each item is forwarded to ``scvi.model.SCVI(...)`` and/or ``vae.train(...)``
        according to which signature names it. Items matching neither are
        ignored with a warning.

    Returns
    -------
    dict
        ``{'data': AnnData with X_scVI, X_umap, graphs and leiden columns,
        'vae': trained model}``.

    Notes
    -----
    The UMAP plots at the end colour by fixed ``obs`` columns
    (``cell_type_level_4``, ``age_group``, ``sex``, ``donor``, ``study``,
    ``chemistry_simple``, ``doublet_score``), which must exist in `adata`.
    """
    import inspect
    import re
    import warnings

    # Work on private lists so neither the defaults nor caller lists are shared.
    cat_cov_scvi = None if cat_cov_scvi is None else list(cat_cov_scvi)
    cont_cov_scvi = None if cont_cov_scvi is None else list(cont_cov_scvi)
    include_genes = [] if include_genes is None else list(include_genes)

    adata_scvi = sc.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.var.copy())
    if remove_cite:
        print('removing CITEseq genes pre SCVI')
        adata_scvi = adata_scvi[:, ~adata_scvi.var['cite']].copy()  # remove cite genes

    # Each filter builds a new list: removing from a list while iterating over
    # it skips the element that follows every removal.
    gene_list = adata_scvi.var_names.tolist()
    if exclude_cc_genes:
        with open('/nfs/team205/vk8/processed_data/regev_lab_cell_cycle_genes.txt') as handle:
            cell_cycle_genes = {line.strip() for line in handle}
        gene_list = [gene for gene in gene_list if gene not in cell_cycle_genes]
    if exclude_mt_genes:
        gene_list = [gene for gene in gene_list if not gene.startswith('MT-')]
    if exclude_vdjgenes:
        vdj_pattern = re.compile(r'^TR[AB][VDJ]|^IG[HKL][VDJC]')
        gene_list = [gene for gene in gene_list if not vdj_pattern.search(gene)]

    print('Removed excluded genes')
    adata_scvi = adata_scvi[:, gene_list].copy()
    sc.pp.highly_variable_genes(adata_scvi, flavor="seurat_v3", n_top_genes=hvg, batch_key=batch_hv)
    hv_genes = adata_scvi.var.loc[adata_scvi.var['highly_variable']].index.tolist()
    # dict.fromkeys de-duplicates while keeping a reproducible gene order.
    selected_genes = list(dict.fromkeys(hv_genes + include_genes))
    adata_scvi = adata_scvi[:, selected_genes].copy()
    print(f'Highly variable genes selected in total {adata_scvi.shape}')

    scvi.model.SCVI.setup_anndata(adata_scvi, batch_key=batch_scvi,
                                  categorical_covariate_keys=cat_cov_scvi,
                                  continuous_covariate_keys=cont_cov_scvi)

    # Route extra keyword arguments by the declared parameters of each callable.
    init_params = inspect.signature(scvi.model.SCVI.__init__).parameters
    train_params = inspect.signature(scvi.model.SCVI.train).parameters
    scvi_kwargs = {k: v for k, v in kwargs.items() if k in init_params}
    train_kwargs = {k: v for k, v in kwargs.items() if k in train_params}
    ignored = [k for k in kwargs if k not in init_params and k not in train_params]
    if ignored:
        warnings.warn(f"run_scvi2: ignoring unrecognised keyword arguments {ignored}")

    vae = scvi.model.SCVI(adata_scvi, **scvi_kwargs)
    vae.train(max_epochs=max_epochs, **train_kwargs)
    print('scvi model trained')

    adata_scvi.obsm["X_scVI"] = vae.get_latent_representation()
    sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
    leiden_keys = []
    for resolution in (1.0, 1.2, 1.4, 1.6):
        key = f"leiden_r{resolution}"
        sc.tl.leiden(adata_scvi, resolution=resolution, key_added=key)
        leiden_keys.append(key)
    sc.tl.umap(adata_scvi)
    print('DR and clustering performed')

    # Carry the embedding, graphs and clusters over to a full-gene copy of the input.
    adata_raw_scvi = adata.copy()
    adata_raw_scvi.obsm['X_scVI'] = adata_scvi.obsm['X_scVI'].copy()
    adata_raw_scvi.obsm['X_umap'] = adata_scvi.obsm['X_umap'].copy()
    adata_raw_scvi.obsp = adata_scvi.obsp.copy()
    adata_raw_scvi.uns = adata_scvi.uns.copy()
    adata_raw_scvi.obs[leiden_keys] = adata_scvi.obs[leiden_keys].copy()
    # NOTE(review): this normalises X of the copy, so it presumably expects
    # adata.X to hold raw counts on entry - confirm against the caller.
    sc.pp.normalize_total(adata_raw_scvi, target_sum=1e4)
    sc.pp.log1p(adata_raw_scvi)

    sc.pl.umap(
        adata_raw_scvi,
        color=['cell_type_level_4'], legend_loc="on data", legend_fontsize=4, frameon=False,
        ncols=2,
    )
    sc.pl.umap(
        adata_raw_scvi,
        color=['age_group', 'sex', 'donor'], frameon=False, ncols=2)
    sc.pl.umap(
        adata_raw_scvi,
        color=['study', 'chemistry_simple', 'doublet_score'])

    return {'data': adata_raw_scvi, 'vae': vae}

In [None]:
# Vascular marker genes plotted on the UMAP; trailing comments are the author's
# group labels. NOTE(review): 'CLU' and 'ATF3' each appear twice, so they are
# plotted twice - confirm whether that is intended.
vasc_mrkrs = [
    "PECAM1", "CLDN5", "VWF", "CLU",  # pan-endothelial
    "IGFBP3", "HEY1", "SEMA3G", "MGP", "GJA5",  # arterial
    "RGCC", "FABP4", "LGALS1", "CD36", "CA4", "LPL",  # capillary
    "ACKR1", "PLVAP", "CCL14", "ICAM1", "SELE", "RND1", "RAMP3",  # venous
    "CCL2", "GADD45B", "SLC2A3", "IRF1", "ATF3", "CDKN1A",  # response
    "CD8A", "LYVE1",  # lymphatic ec
    "ACTA2", "TAGLN", "TPM2", "MYH11", "PLN", "SORBS2",
    "LBH", "ATF3", "IGFBP6", "CNN1", "RERGL",
    "CLU", "RAMP1",  # smc
    "RGS5", "APOD", "ABCC9", "STEAP4",
]

In [None]:
def population_plots(adata_scvi, mrkrs_dict, save_name, figures, cat_order, anno = 'cell_type_level_4_upd',
                     anno_cov = ['study', 'sex', 'age_group', 'chemistry_simple']):
    """Plot annotation and covariate UMAPs plus a marker dotplot for one population.

    The categories of ``adata_scvi.obs[anno]`` are first reordered to
    `cat_order`; this modifies `adata_scvi` in place.

    With a truthy `save_name` the two UMAPs are saved through scanpy's ``save``
    argument (scanpy chooses the directory, `figures` is not used for them) and
    the dotplot is written to ``{figures}{save_name}_HTSA_scvi_mrkrs.pdf``.
    Otherwise nothing is written and the dotplot is shown instead.
    """
    adata_scvi.obs[anno] = adata_scvi.obs[anno].cat.reorder_categories(cat_order).copy()

    # save=None is scanpy's default, i.e. the same as not passing `save` at all.
    write_files = bool(save_name)
    celltype_target = f"{save_name}_scvi_celltype.pdf" if write_files else None
    covariate_target = f"{save_name}_scvi_covariates.pdf" if write_files else None

    sc.pl.umap(adata_scvi, color=anno, legend_loc="on data", legend_fontsize=5,
               frameon=False, save=celltype_target)
    sc.pl.umap(adata_scvi, color=anno_cov, wspace=0.3, frameon=False,
               save=covariate_target)

    marker_plot = sc.pl.dotplot(adata_scvi, groupby=anno, var_names=mrkrs_dict,
                                standard_scale="var", return_fig=True)
    marker_plot = marker_plot.add_totals().style(dot_edge_color='black', dot_edge_lw=0.5, cmap="Reds")

    if write_files:
        marker_plot.savefig(f"{figures}{save_name}_HTSA_scvi_mrkrs.pdf")
    else:
        marker_plot.show()

### Analyze endothelial cells (ECs)

In [None]:
# EC annotation labels, in the order used for subsetting and for plot categories.
ecs_ordered = [
    "EC-Art",
    "EC-Art-ELN",
    "EC-Cap",
    "EC-Cap-Prolif",
    "EC-Ven",
    "EC-Ven-ELN",
    "EC-Lymphatic",
]

In [None]:
# Load the full cleaned dataset from the clean folder.
adata_all = sc.read(f"{clean_folder}/adata_full_rev_4_clean.h5ad")

In [None]:
# Keep only cells whose 'cell_type_level_4_explore' label is one of the EC types,
# as an independent copy rather than a view of adata_all.
adata_vasc = adata_all[adata_all.obs['cell_type_level_4_explore'].isin(ecs_ordered),:].copy()

In [None]:
# Number of EC cells per age group.
adata_vasc.obs['age_group'].value_counts()

In [None]:
# Number of EC cells per sex.
adata_vasc.obs['sex'].value_counts()

In [None]:
# Samples that contribute more than 5 EC cells; only these are passed to scVI.
cells_per_sample = adata_vasc.obs['sample'].value_counts()
samples = cells_per_sample.index[cells_per_sample > 5].tolist()

In [None]:
# Per-sample True/False: does the sample have more than 5 EC cells?
adata_vasc.obs['sample'].value_counts()>5

In [None]:
# Store a copy of X as the 'counts' layer, which run_scvi2 reads.
# NOTE(review): presumably X still holds raw counts at this point - confirm.
adata_vasc.layers['counts'] = adata_vasc.X.copy()

### Note: `study` is excluded from the scVI covariates

In [None]:
# Train scVI on ECs from the retained samples: 'sample' is the batch key,
# with chemistry, age group, donor and sex as categorical covariates and no
# continuous covariates.
adata_vasc_scvi_run = run_scvi2(
    adata_vasc[adata_vasc.obs['sample'].isin(samples), :],
    batch_hv="age_group",
    hvg=5000,
    batch_scvi='sample',
    cat_cov_scvi=['chemistry_simple', 'age_group', 'donor', 'sex'],
    cont_cov_scvi=None,
    include_genes=[],
    exclude_cc_genes=True,
    exclude_vdjgenes=True,
    exclude_mt_genes=True,
    n_layers=2,
    n_latent=30,
    max_epochs=350,
    batch_size=2000,
)

In [None]:
# Independent copy of the AnnData returned under the 'data' key of run_scvi2's result.
adata_vasc_scvi = adata_vasc_scvi_run['data'].copy()

In [None]:
# UMAP coloured by each gene in major_mrkrs.
sc.pl.umap(adata_vasc_scvi, color = major_mrkrs)

In [None]:
# UMAP coloured by each gene in vasc_mrkrs.
sc.pl.umap(adata_vasc_scvi, color = vasc_mrkrs)

In [None]:
# Marker genes grouped by vascular population. The keys are passed to the
# dotplot as `var_names`, so they are the group labels of that figure.
vasc_mrkrs_dict = {
    'pan-vasc': ['PECAM1', 'CDH5', 'VWF'],
    'arteria': ['CXCL12', 'SEMA3G', 'HEY1'],
    'large vessels': ['SULF1', 'ELN'],  # label spelling corrected from 'large vassels'
    'capillary': ['RGCC', 'PLVAP'],
    'prolif': ['MKI67'],
    'venous': ['ACKR1', 'ICAM1', 'CCL2', 'SELE'],
    'lymphatic': ['PROX1', 'TFF3', 'CCL21'],
}

In [None]:
# Show (without saving, since save_name is None) the EC annotation UMAP,
# the covariate UMAPs and the marker dotplot.
population_plots(
    adata_vasc_scvi,
    mrkrs_dict=vasc_mrkrs_dict,
    save_name=None,
    figures='...Figure_4_nat/figures/',
    cat_order=ecs_ordered,
    anno='cell_type_level_4_explore',
    anno_cov=['study', 'sex', 'age_group', 'chemistry_simple'],
)

In [None]:
# Write the embedded EC AnnData, stamped with the run date.
adata_vasc_scvi.write(f'{clean_folder}/vasc_scvi_updated_{today}.h5ad')

# Save the trained model returned under the 'vae' key.
# NOTE(review): scvi-tools' save() is understood to write a directory, so the
# ".h5ad" suffix here would name a folder rather than an h5ad file - confirm.
adata_vasc_scvi_run['vae'].save(f'{clean_folder}/models/vasc_scvi_updated_{today}.h5ad')