### Load and plot resident cell types

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import scvi
import seaborn as sns
import os,sys
import anndata
import scvi
os.chdir('/nfs/team205/ny1/ThymusSpatialAtlas/software/ImageSpot/')
import scvi_wrapper as sv
import anndata as ad

# Notebook display and plotting configuration. Everything below mutates
# process-wide settings (IPython, pandas, matplotlib, scanpy).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # show the output of every expression in a cell, not only the last one
pd.set_option('display.max_column',None) # display all the columns in pandas
pd.options.display.max_rows = 100

from datetime import date
# ISO date string of the current day; used further down to date-stamp the saved .h5ad file name.
today = str(date.today())
sc.logging.print_header()
# Register Arial and switch the PDF font type so that text in saved PDFs is
# correctly recognised as text. The font is added before it is looked up
# and before it is selected in rcParams.
import matplotlib.pyplot as plt 
from matplotlib import font_manager 
font_manager.fontManager.addfont("/nfs/team205/ny1/ThymusSpatialAtlas/software/Arial.ttf") 
print(font_manager.findfont("Arial")) 
plt.rcParams["font.sans-serif"] = ["Arial"] 
plt.rcParams["pdf.fonttype"] = 42
# scanpy figure defaults: PDF output, RdPu colour map, 150 dpi on screen / 300 dpi when saved.
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')

In [None]:
%cd /nfs/team205/vk8/projects/thymus_atlas/ThymusSpatialAtlas

In [None]:
# Output locations (absolute paths on the cluster file system).
# NOTE(review): `figures` and `results` are not read again in the visible part of
# this notebook - population_plots() is given its own `figures` argument and
# run_scvi2() uses a local variable called `results`.
#model_folder = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure4/scvi_models/'
figures = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure_4_nat/figures/'
results = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure_4_nat/'

In [None]:
# Folder that receives the cleaned / re-annotated TEC object written further down.
clean_folder = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure_1_nat/clean'

In [None]:
# Flat list of TEC marker genes, grouped by the labels given in the original list.
# Repeated genes (KRT5, KRT8, AIRE, SLPI, CDKN2A) are kept exactly as they were.
# NOTE: the name `tec_mrkrs` is rebound to a dict of marker groups later in this notebook.
tec_mrkrs = [
    # surface markers
    'FOXN1', 'ENPEP', 'LY75',
    # homing
    'CXCL12', 'CCL25',
    # commitment, proliferation and survival
    'DLL4', 'IL7', 'KITLG',
    # processing machinery
    'PSMB11', 'PRSS16', 'CD83',
    # cTEC markers
    'PAX9', 'SIX1', 'HLA-DQB1', 'KRT8', 'KRT18',
    # mcTEC markers ('CTGF' stays out of the list, as in the original)
    'DLK2', 'PDPN', 'ZBED2', 'IGFBP5', 'IGFBP6', 'MAOA', 'KRT5', 'KRT8', 'KRT15', 'CCL2',
    # the next four groups carry no label in the original list
    'EPCAM',
    'ASCL1', 'CCL21', 'KRT5', 'KRT14',
    'AIRE', 'FEZF2', 'CDKN2A', 'AIRE', 'SLPI', 'CRIP1',
    'SLPI', 'IVL', 'KRT1', 'KRT7', 'KRT10', 'CDKN2A', 'SPINK5',
    # ionocytes
    'FOXI1', 'ASCL3', 'CFTR', 'CLCNKB',
    # thymic tuft cells
    'SOX9', 'POU2F3', 'DCLK1', 'IL25',
    # thymic tuft cells, taste signalling pathway
    'PLCB2', 'TRPM5', 'GNB3', 'GNG13',
    # neuro TEC (all)
    'BEX1', 'NEUROD1',
    # unlabelled in the original list
    'OLIG1', 'OLIG2', 'NEUROG1', 'NEUROD4',
    # ciliated cells
    'PCP4', 'FOXJ1',
    # unlabelled in the original list
    'CHRNA1', 'MYOG', 'TTN',
]

In [None]:
# Marker genes for the major cell populations. Group labels are the ones given
# in the original list; groups without a label were unlabelled there as well.
major_mrkrs = [
    'PTPRC', 'CD3G',
    'HOXA9',
    'HES1',  # DN
    'RORC',  # DP
    'CCR9',
    'PDCD1',  # CD8aa
    'CD8A', 'CD8B',
    'CD4', 'CD40LG',
    'CRTAM', 'ANXA1',  # memory
    'FOXP3', 'DHRS3',
    'TRDC',  # gamma-delta
    'EOMES', 'KLRD1',  # NK cells
    'TNFSF11',  # ILC
    'CD19', 'VPREB1',  # pre-pro B cells
    'IGHD', 'IGHA1', 'IGHG1',
    'CLEC9A', 'CLEC10A',
    'LAMP3',
    'LILRA4', 'S100A9', 'C1QA', 'TPSB2', 'ITGA2B', 'GYPA',
    'PDGFRA', 'COLEC11', 'FBN1',
    'RGS5', 'CDH5',
    'LYVE1', 'EPCAM', 'PSMB11', 'DLK2', 'KRT14',
    'CCL19', 'AIRE', 'KRT1', 'POU2F3',
    'MYOD1', 'NEUROG1',
]

In [None]:
def run_scvi2(adata, batch_hv="age_group", batch_scvi="sample", \
             cat_cov_scvi=["DonorID", "10X_version", "Sex", "Age_group"], cont_cov_scvi=["percent_mito"], \
             include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
             exclude_vdjgenes = True, remove_cite = False, hvg = 3000, max_epochs = 350, vae_name="", **kwargs):
    """Train an scVI model on a filtered gene set and return the embedded data.

    Steps: build a counts-only copy of ``adata``, drop unwanted genes, select
    highly variable genes, train scVI, compute neighbours / leiden / UMAP on the
    latent space, copy the results onto a full-gene copy of ``adata``,
    normalise and log-transform that copy, and draw overview UMAPs.

    Parameters
    ----------
    adata
        AnnData with raw counts in ``adata.layers['counts']``. It is not
        modified. For the overview plots its ``obs`` must contain
        ``cell_type_level_4``, ``age_group``, ``sex``, ``donor``, ``study``,
        ``chemistry_simple`` and ``doublet_score``.
    batch_hv
        ``obs`` column used as ``batch_key`` for highly-variable-gene selection.
    batch_scvi
        ``obs`` column used as the scVI ``batch_key``.
    cat_cov_scvi, cont_cov_scvi
        Categorical / continuous covariate keys handed to
        ``scvi.model.SCVI.setup_anndata`` (``None`` is passed through as is).
        The default lists are never mutated here.
    include_genes
        Genes that are always kept in addition to the highly variable ones.
        They must still be present after the exclusion step.
    exclude_cc_genes
        Drop the genes listed in the cell-cycle gene file (one gene per line).
    exclude_mt_genes
        Drop genes whose name starts with ``MT-``.
    exclude_vdjgenes
        Drop genes matching ``^TR[AB][VDJ]|^IG[HKL][VDJC]``.
    remove_cite
        Drop the variables flagged in ``var['cite']`` before anything else
        (assumes a boolean column - TODO confirm).
    hvg
        Number of highly variable genes to select.
    max_epochs
        Number of training epochs passed to ``vae.train``.
    vae_name
        Currently unused; kept for backward compatibility.
    **kwargs
        Split by name between the ``scvi.model.SCVI`` constructor and
        ``vae.train``; names matching neither are silently ignored.

    Returns
    -------
    dict
        ``{'data': <full-gene AnnData with X_scVI, X_umap, graph and leiden
        columns, normalised and log1p-transformed>, 'vae': <trained model>}``.
    """
    import re

    # Counts-only working copy used for gene selection and model training.
    adata_scvi = sc.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.var.copy())
    if remove_cite:
        print('removing CITEseq genes pre SCVI')
        adata_scvi = adata_scvi[:, ~adata_scvi.var['cite']].copy()

    # Collect every gene to exclude first, then filter in a single pass. The
    # previous implementation removed items from the list it was iterating over,
    # which skips the element following each removal, so runs of adjacent
    # TCR/Ig genes were only partially removed.
    excluded = set()
    if exclude_cc_genes:
        with open('/nfs/team205/vk8/processed_data/regev_lab_cell_cycle_genes.txt') as handle:
            excluded.update(line.strip() for line in handle)
    if exclude_mt_genes:
        excluded.update(adata.var_names[adata.var_names.str.startswith('MT-')])
    vdj_pattern = re.compile(r'^TR[AB][VDJ]|^IG[HKL][VDJC]') if exclude_vdjgenes else None
    gene_list = [
        gene for gene in adata_scvi.var_names
        if gene not in excluded
        and not (vdj_pattern is not None and vdj_pattern.search(gene))
    ]

    print('Removed excluded genes')
    adata_scvi = adata_scvi[:, gene_list].copy()
    sc.pp.highly_variable_genes(adata_scvi, flavor="seurat_v3", n_top_genes=hvg, batch_key=batch_hv)
    hv_genes = adata_scvi.var.loc[adata_scvi.var['highly_variable']].index.tolist()
    # dict.fromkeys de-duplicates while keeping a stable order, so the gene
    # order no longer depends on set iteration order.
    selected_genes = list(dict.fromkeys(hv_genes + list(include_genes or [])))
    adata_scvi = adata_scvi[:, selected_genes].copy()
    print(f'Highly variable genes selected in total {adata_scvi.shape}')

    scvi.model.SCVI.setup_anndata(adata_scvi, batch_key=batch_scvi,
                                  categorical_covariate_keys=cat_cov_scvi,
                                  continuous_covariate_keys=cont_cov_scvi)
    # Route the extra keyword arguments to the constructor or to train() by name.
    scvi_kwargs = {k: v for k, v in kwargs.items() if k in scvi.model.SCVI.__init__.__code__.co_varnames}
    vae = scvi.model.SCVI(adata_scvi, **scvi_kwargs)
    train_kwargs = {k: v for k, v in kwargs.items() if k in vae.train.__code__.co_varnames}
    # `max_epochs` is a named parameter of this function, so it never appears in
    # **kwargs; it has to be forwarded explicitly or the requested value is ignored.
    vae.train(max_epochs=max_epochs, **train_kwargs)
    print('scvi model trained')

    adata_scvi.obsm["X_scVI"] = vae.get_latent_representation()
    sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
    leiden_resolutions = {"leiden_r1.0": 1, "leiden_r1.2": 1.2, "leiden_r1.4": 1.4, "leiden_r1.6": 1.6}
    for key, resolution in leiden_resolutions.items():
        sc.tl.leiden(adata_scvi, resolution=resolution, key_added=key)
    sc.tl.umap(adata_scvi)
    print('DR and clustering performed')

    # Carry embedding, graph and clusterings over to a copy that keeps all genes.
    adata_raw_scvi = adata.copy()
    adata_raw_scvi.obsm['X_scVI'] = adata_scvi.obsm['X_scVI'].copy()
    adata_raw_scvi.obsm['X_umap'] = adata_scvi.obsm['X_umap'].copy()
    adata_raw_scvi.obsp = adata_scvi.obsp.copy()
    adata_raw_scvi.uns = adata_scvi.uns.copy()
    cluster_cols = list(leiden_resolutions)
    adata_raw_scvi.obs[cluster_cols] = adata_scvi.obs[cluster_cols].copy()
    sc.pp.normalize_total(adata_raw_scvi, target_sum=1e4)
    sc.pp.log1p(adata_raw_scvi)

    # Overview plots: existing annotation, then biological and technical covariates.
    sc.pl.umap(
        adata_raw_scvi,
        color=['cell_type_level_4'], legend_loc="on data", legend_fontsize=4, frameon=False,
        ncols=2,
    )
    sc.pl.umap(
        adata_raw_scvi,
        color=['age_group', 'sex', 'donor'], frameon=False, ncols=2)
    sc.pl.umap(
        adata_raw_scvi,
        color=['study', 'chemistry_simple', 'doublet_score'])

    output = {}
    output['data'] = adata_raw_scvi
    output['vae'] = vae
    return output

## Load TECs 

In [None]:
# Load the full atlas object. The path is relative, so it resolves against the
# working directory set by the `%cd` cell near the top of the notebook.
adata_all = sc.read("Figure_1_nat/clean/adata_full_rev_4_clean.h5ad")

In [None]:
# Keep cells annotated as 'Epithelial' at level 0, plus any cell labelled
# 'TEC-tuft' in the level-4 "explore" annotation.
adata_tec = adata_all[(adata_all.obs['cell_type_level_0'] == 'Epithelial') |
                      (adata_all.obs['cell_type_level_4_explore'] == 'TEC-tuft'),:].copy()

In [None]:
# Number of TECs per age group.
adata_tec.obs['age_group'].value_counts()

In [None]:
# Number of TECs per sex.
adata_tec.obs['sex'].value_counts()

In [None]:
#adata_tec.obs['age_group2'] = adata_tec.obs['age_group'].apply(lambda x: 'prenatal' if x == 'fetal' else 'postnatal') 
# Samples contributing more than 5 TECs.
# NOTE(review): `samples` is not used again in the visible part of this notebook.
samples = adata_tec.obs['sample'].value_counts().index[adata_tec.obs['sample'].value_counts()>5].tolist()

In [None]:
# run_scvi2() reads its input from layers['counts'].
# NOTE(review): assumes .X still holds raw counts at this point - confirm.
adata_tec.layers['counts'] = adata_tec.X.copy()

### Note: `study` is excluded from the scVI covariates because some studies enriched for particular TEC subtypes

In [None]:
# Integrate the TEC subset with scVI: 5000 HVGs selected per age group, batch = sample,
# no continuous covariates. The call returns a dict with the keys 'data' and 'vae'.
adata_tec_scvi = run_scvi2(
    adata_tec,
    batch_hv="age_group",
    hvg=5000,
    batch_scvi='sample',
    cat_cov_scvi=['chemistry_simple', 'age_group', 'donor', 'sex'],
    cont_cov_scvi=None,
    include_genes=[],
    exclude_cc_genes=True,
    exclude_vdjgenes=True,
    exclude_mt_genes=True,
    n_layers=2,
    n_latent=30,
    max_epochs=350,
    batch_size=2000,
)

In [None]:
# Work on a copy of the integrated AnnData returned by run_scvi2().
adata_scvi = adata_tec_scvi['data'].copy()

In [None]:
# Compare the four leiden resolutions computed inside run_scvi2().
sc.pl.umap(adata_scvi, color = ["leiden_r1.0", "leiden_r1.2", "leiden_r1.4", "leiden_r1.6"], 
           legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
# Sub-cluster single clusters with `restrict_to`. The resulting labels are
# comma-joined ("<parent>,<sub>"), which is how they are referenced further
# down (e.g. '9,4', '7,4,0').
# Split cluster '9' of leiden_r1.0 -> stored as "leiden_r1.0R".
sc.tl.leiden(adata_scvi, resolution = 0.4, restrict_to=['leiden_r1.0', ['9']], key_added = "leiden_r1.0R")

# Split cluster '7' of leiden_r1.2 -> stored as "leiden_r1.2R".
sc.tl.leiden(adata_scvi, resolution = 0.4, restrict_to=['leiden_r1.2', ['7']], key_added = "leiden_r1.2R")

In [None]:
# Split sub-cluster '7,4' once more; the result overwrites "leiden_r1.2R".
# NOTE(review): because the column is overwritten, re-running this cell on its own
# will presumably no longer find a '7,4' category - re-run the previous cell first.
sc.tl.leiden(adata_scvi, resolution = 0.3, restrict_to=['leiden_r1.2R', ['7,4']], key_added = "leiden_r1.2R")

In [None]:
# Cluster sizes after sub-clustering of leiden_r1.0.
adata_scvi.obs["leiden_r1.0R"].value_counts()

In [None]:
# Cluster sizes after sub-clustering of leiden_r1.2.
adata_scvi.obs["leiden_r1.2R"].value_counts()

In [None]:
# Refined clusterings next to a handful of genes on the UMAP.
sc.pl.umap(adata_scvi, color = ["leiden_r1.0R","leiden_r1.2R", "PTPRC", "CD3G", "CD4", "CD8A", 'MKI67'], 
           legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
#T cell doublet
sc.pl.dotplot(adata_scvi, var_names = ['PTPRC', 'CD3E', 'CD3G', 'C1QA', 'CD4'], groupby = "leiden_r1.0R", swap_axes = True)

In [None]:
#cTEC-mTEC doublet?
# NOTE(review): 'PRSS12' is used here whereas the marker list above contains 'PRSS16' - confirm which gene is intended.
sc.pl.dotplot(adata_scvi, var_names = ['EPCAM', 'AIRE', 'CCL25', 'PRSS12', 'PSMB11', 'LY75'], groupby = "leiden_r1.0R", swap_axes = True)

In [None]:
# Highlight sub-clusters '7,4,0' and '7,4,1' (labelled 'TEC-ionocytes' further down).
sc.pl.umap(adata_scvi, color = ["leiden_r1.2R"], groups = ['7,4,0', '7,4,1'],
           legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
# MKI67 expression per refined cluster.
sc.pl.dotplot(adata_scvi, var_names = ['MKI67'], groupby = "leiden_r1.2R", swap_axes = True)

In [None]:
# Ionocyte / tuft / taste-signalling genes per refined cluster.
sc.pl.dotplot(adata_scvi, var_names = ['FOXI1',  'ASCL3', 'CFTR', 'CLCNKB', #ionocytes
'SOX9', 'POU2F3', 'DCLK1', 'IL25', # thymic tuft cells
'PLCB2', 'TRPM5', 'GNB3', 'GNG13'], groupby = "leiden_r1.2R", swap_axes = True)

In [None]:
# Sub-cluster '7,1' together with MKI67.
sc.pl.umap(adata_scvi, color = ["leiden_r1.2R", "MKI67"], groups = ['7,1'], legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
# Sub-cluster '7,2' together with MKI67 (labelled 'mTECI-Prolif' further down).
sc.pl.umap(adata_scvi, color = ["leiden_r1.2R", "MKI67"], groups = ['7,2'], legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
# The same ionocyte / tuft genes on the UMAP.
sc.pl.umap(adata_scvi, color = ['FOXI1',  'ASCL3', 'CFTR', 'CLCNKB', #ionocytes
'SOX9', 'POU2F3', 'DCLK1', 'IL25', # thymic tuft cells
'PLCB2', 'TRPM5', 'GNB3', 'GNG13'],  legend_loc = "on data", legend_fontsize = 5, frameon = False)

In [None]:
# Cluster sizes again, ahead of removing sub-clusters of cluster '9'.
adata_scvi.obs["leiden_r1.0R"].value_counts()

In [None]:
# Number of cells in sub-cluster '9,4'. Labels produced by `restrict_to` are
# comma-joined (the removal step below uses "9,4" / "9,5"); the previous
# comparison against "9.4" could never match and always gave 0.
(adata_scvi.obs["leiden_r1.0R"] == "9,4").sum()

In [None]:
# Drop sub-clusters '9,4' and '9,5' of leiden_r1.0R and keep an independent copy.
# NOTE(review): presumably these are the doublet clusters inspected with the dot plots above ("dbrm") - confirm.
adata_scvi_dbrm = adata_scvi[~adata_scvi.obs["leiden_r1.0R"].isin(["9,4", "9,5"]),:].copy()

In [None]:
def _updated_tec_label(row):
    """Return the updated level-4 label for one row of ``obs``.

    Refined leiden sub-clusters take precedence; otherwise the existing
    ``cell_type_level_4`` label is kept, with the placeholder
    'see_lv4_explore' replaced by 'mTECI-trans'.
    """
    if row["leiden_r1.2R"] in ['7,4,0', '7,4,1']:
        return 'TEC-ionocytes'
    if row["leiden_r1.2R"] == '7,4,2':
        return 'TEC-tuft'
    if row["leiden_r1.2R"] == '7,2':
        return 'mTECI-Prolif'
    if row['cell_type_level_4'] == 'see_lv4_explore':
        return 'mTECI-trans'
    return row['cell_type_level_4']


# Row-wise relabelling into a new annotation column.
adata_scvi_dbrm.obs['cell_type_level_4_upd'] = adata_scvi_dbrm.obs.apply(_updated_tec_label, axis = 1)

In [None]:
# Save the doublet-filtered, re-annotated object; the file name carries today's date.
adata_scvi_dbrm.write(f'{clean_folder}/TEC_scvi_dbrm_updated_{today}.h5ad')

## Re-load and save with new annos

In [None]:
# Reads a fixed, dated snapshot (2024-01-06), not necessarily the file written above.
# NOTE: this rebinds `adata_tec_scvi`, which earlier held the dict returned by run_scvi2().
adata_tec_scvi = sc.read('/nfs/team205/ny1/ThymusSpatialAtlas/Figure_1_nat/clean/TEC_scvi_dbrm_updated_2024-01-06.h5ad')

In [None]:
# "Explore" annotation next to the updated annotation on the UMAP.
sc.pl.umap(adata_tec_scvi, color = ['cell_type_level_4_explore', 'cell_type_level_4_upd'], legend_loc = "on data",
           legend_fontsize = 6)

In [None]:
# Marker genes grouped by TEC population; passed as `var_names` to the dot plot
# in population_plots(). This replaces the flat `tec_mrkrs` list defined earlier.
tec_mrkrs = {
    'pan-cTEC': ['PSMB11', 'LY75', 'CCL25', 'HLA-DRA'],
    'cTEC\nsubtypes': ['TBATA', 'TP53AIP1', 'DLL4'],
    'mcTEC': [
        'DLK2', 'IGFBP5', 'IGFBP6', 'CCN2', 'CCL2',
        'KRT15', 'ITGA6', 'MKI67',
    ],
    'pan-mTEC': ['EPCAM'],
    'mTECI': ['ASCL1', 'CCL21'],
    'mTECII': ['AIRE', 'FEZF2', 'CRIP1'],
    'mTECIII': ['SLPI', 'IVL', 'KRT10', 'CDKN2A'],
    'pan-neuro TEC': ['BEX1', 'NEUROD1'],
    'neuroTEC': ['NEUROG1', 'NEUROD4'],
    'cilliated TEC': ['PCP4', 'FOXJ1'],
    'myoTEC': ['CHRNA1', 'MYOG', 'TTN'],
    'ionocytes/tuft': ['FOXI1', 'CFTR', 'POU2F3', 'PLCB2'],
}

In [None]:
def population_plots(adata_scvi, mrkrs_dict, save_name, figures, cat_order, anno = 'cell_type_level_4_upd', 
                     anno_cov = ['study', 'sex', 'age_group', 'chemistry_simple']):
    """Draw the standard figure set for one annotated population.

    Produces a UMAP coloured by ``anno``, a UMAP panel coloured by each entry
    of ``anno_cov``, and a marker dot plot grouped by ``anno``.

    Parameters
    ----------
    adata_scvi : AnnData whose ``obs[anno]`` is categorical; the column is
        reordered in place to follow ``cat_order``.
    mrkrs_dict : marker genes handed to ``sc.pl.dotplot`` as ``var_names``.
    save_name : prefix used in all three output file names.
    figures : path prefix for the dot-plot PDF only; the two UMAPs are saved
        through scanpy's own ``save=`` option.
    cat_order : category order passed to ``reorder_categories``.
    anno : ``obs`` column holding the annotation to plot.
    anno_cov : ``obs`` columns shown in the covariate UMAP panel.
    """
    # Fix the category order first so that legends and dot-plot rows follow it.
    reordered = adata_scvi.obs[anno].cat.reorder_categories(cat_order)
    adata_scvi.obs[anno] = reordered.copy()

    celltype_pdf = f"{save_name}_scvi_celltype.pdf"
    sc.pl.umap(
        adata_scvi,
        color = anno,
        legend_loc = "on data",
        legend_fontsize = 8,
        frameon = False,
        save = celltype_pdf,
        title = "",
    )

    covariates_pdf = f"{save_name}_scvi_covariates.pdf"
    sc.pl.umap(adata_scvi, color = anno_cov, wspace = 0.3, frameon = False, save = covariates_pdf)

    # Marker dot plot: scaled per gene, with group totals and outlined dots.
    marker_plot = sc.pl.dotplot(
        adata_scvi,
        groupby = anno,
        var_names = mrkrs_dict,
        standard_scale = "var",
        return_fig = True,
    )
    marker_plot = marker_plot.add_totals()
    marker_plot = marker_plot.style(dot_edge_color='black', dot_edge_lw=0.5, cmap = "Reds")
    marker_plot.savefig(f"{figures}{save_name}_HTSA_scvi_mrkrs.pdf")

In [None]:
# Display order of the TEC populations; passed as `cat_order` to population_plots().
tec_cells = [
    'cTECIII',
    'cTECII',
    'cTECI',
    'mcTEC',
    'mcTEC-Prolif',
    'mTECI',
    'mTECII',
    'mTECIII',
    'mTECI-trans',
    'TEC-neuro',
    'TEC-cilliated',
    'TEC-myo',
    'TEC-tuft',
]

In [None]:
# TEC figure set: plots the 'cell_type_level_4_explore' annotation in the
# `tec_cells` order and writes the dot plot under figures_vk8/.
population_plots(
    adata_scvi = adata_tec_scvi,
    mrkrs_dict = tec_mrkrs,
    save_name = "TEC",
    figures = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure_4_nat/figures_vk8/',
    cat_order = tec_cells,
    anno = 'cell_type_level_4_explore',
    anno_cov = ['study', 'sex', 'age_group', 'chemistry_simple'],
)