# Thymus human spatial atlas: fine annotation of B cells


In [1]:
# Notebook setup: analysis/plotting imports plus display and figure defaults.
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import scanpy as sc
import scvi

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # show the output of every expression in a cell, not only the last one
# NOTE(review): the option is spelled 'display.max_column'; presumably pandas resolves it
# to 'display.max_columns' by pattern matching - confirm.
pd.set_option('display.max_column',None) # display all the columns in pandas
pd.options.display.max_rows = 100

from datetime import date
# Date string used to time-stamp the model and .h5ad files written by later cells.
today = str(date.today())

import matplotlib
from matplotlib import rcParams
# Font type 42 (TrueType) for PDF output.
rcParams['pdf.fonttype'] = 42
sc.settings.set_figure_params(dpi = 150, color_map = 'RdPu', dpi_save = 300, vector_friendly = True, format = 'pdf')

In [3]:
# Output locations used by later cells: saved scVI models, figures and annotated AnnData files.
# Each path ends with '/', because later cells build file names by plain string concatenation.
model_folder = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/scvi_models/'
figures = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/figures/'
results = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/annotation_done/'

In [4]:
# Broad panel of lineage marker genes; plotted on the B-cell UMAP in a later cell.
major_mrkrs = ['PTPRC', 'CD3G', 
               'HOXA9', 
               'HES1', # DN
               'RORC', # DP
               'CCR9',
              'PDCD1', # CD8aa
              'CD8A', 'CD8B', 
              'CD4', 'CD40LG', 
              'CRTAM', 'ANXA1', # memory
             'FOXP3', 'DHRS3', 
               'TRDC', # gamma-delta
              'EOMES', 'KLRD1', # NK cells
              'TNFSF11', # ILC
              'CD19', 'VPREB1', # pre-pro B cells
              'IGHD', 'IGHA1', 'IGHG1', 
              'CLEC9A', 'CLEC10A', 
              'LAMP3', 
             'LILRA4','S100A9', 'C1QA', 'TPSB2', 'ITGA2B', 'GYPA', 
              'PDGFRA', 'COLEC11', 'FBN1', 
              'RGS5', 'CDH5', 
               'LYVE1', 'EPCAM', 'PSMB11', 'DLK2', 'KRT14', 
               'CCL19', 'AIRE', 'KRT1', 'POU2F3',
               'MYOD1', 'NEUROG1']

In [13]:
# Immunoglobulin heavy-chain genes; plotted on the UMAP in later cells.
ig_genes = ["IGHM","IGHD", "IGHG1", "IGHG2", "IGHG3", "IGHG4",
                                  "IGHA1", "IGHA2","IGHE"]

In [14]:
# Marker genes keyed by B-cell state.
# NOTE: this dict is redefined in a later cell (In [189]) before it is passed to the dot plot.
B_mrkrs = {'B_cells': ["CD79A", "TCL1A"],
'B_IFN': ["MX1","IFI44L", "STAT1"], 
'B_naive': ["FCER2", "BANK1", "FCMR"], 
'B_activated': ['CD69','FOS','FOSB','DUSP1','CD83'], 
'B_preGC': ["MIR155HG", "HIVEP3", "PARVB"],
'B_GC': ["GMDS", "LMO2", "LPP", "BCL6", "SUGCT", "AICDA", "H2AFZ", "MKI67"], 
'B_prePB': ["FRZB", "BTNL9", "HOPX"], 
'B_plasma': ["XBP1", "PRDM1", "FKBP11"], 
'B_mem': ["TNFRSF13B", "FCRL4", "CLECL1"]}

# fine annotate every lineage

In [5]:
# Load the full annotated atlas object from disk.
h5ad_file = '/nfs/team205/ny1/ThymusSpatialAtlas/Figure1/annotation_done/HTSA_v17.h5ad'
adata_full = sc.read_h5ad(h5ad_file)
# adata_full.write_h5ad(h5ad_file)

In [15]:
# Values of `annotation_level_3` that define the B-cell subset analysed in this notebook.
Bcells = ['B(P)', 'B(Q)', 'B-pro/pre', 'B-stimulated', 'plasma-cells']

In [16]:
# List the available level-3 labels (to check the names used in `Bcells`).
adata_full.obs['annotation_level_3'].cat.categories

In [17]:
# Keep only the selected cells; .copy() turns the view into an independent AnnData.
adata = adata_full[adata_full.obs['annotation_level_3'].isin(Bcells),:].copy()

In [18]:
# Keep the existing "X_scVI" embedding under a separate key, then build the neighbour
# graph and UMAP of the subset on that embedding.
adata.obsm["X_scVI_broad"] = adata.obsm["X_scVI"].copy()
sc.pp.neighbors(adata, use_rep="X_scVI_broad")
sc.tl.umap(adata)
# Inspect existing annotations and doublet scores, then technical/biological covariates.
sc.pl.umap(adata, color = ['annotation_level_2', 'annotation_level_3', 'doublet_score'], legend_loc = "on data", legend_fontsize = 5)
sc.pl.umap(adata, color = ['age_group','study', 'chemistry_simple', 'donor',], legend_loc = "on data", legend_fontsize = 5)

In [19]:
def get_cluster_proportions(adata,
                            cluster_key="cluster_final",
                            sample_key="replicate",
                            drop_values=None):
    """
    Compute, for every sample, the percentage of its cells that fall in each cluster.

    Input
    =====
    adata : AnnData object (only `adata.obs` is read; the object is neither copied nor modified)
    cluster_key : key of `adata.obs` storing cluster info
    sample_key : key of `adata.obs` storing sample/replicate info
    drop_values : list/iterable of values of `sample_key` to remove from the result

    Returns
    =======
    pd.DataFrame with samples as the index and clusters as the columns and 0-100 floats
    as values. Each row sums to 100, except samples without any cell (unused categorical
    levels), whose row is all zeros.

    Raises
    ======
    KeyError if `cluster_key`/`sample_key` is not a column of `adata.obs`, or if an entry of
    `drop_values` is not a sample present in the result.
    """
    obs = adata.obs

    # Count cells per (sample, cluster). observed=False keeps unused categorical levels so
    # the table has the same rows/columns whatever the pandas default for `observed` is.
    sizes = obs.groupby([sample_key, cluster_key], observed=False).size()
    counts = sizes.unstack(level=1, fill_value=0)

    # Row-wise percentages. This avoids groupby().apply() followed by reset_index(), which
    # fails on pandas >= 2.0 because the group key is added to the index a second time.
    props = counts.mul(100).div(counts.sum(axis=1), axis=0)
    # A sample with zero cells yields 0/0 = NaN; report it as 0 percent.
    props = props.fillna(0)

    if drop_values is not None:
        props = props.drop(index=list(drop_values))
    return props


def plot_cluster_proportions(cluster_props, 
                             cluster_palette=None,
                             xlabel_rotation=1): 
    """
    Draw a stacked bar chart from a table of cluster proportions.

    Input
    =====
    cluster_props : pd.DataFrame to plot; one stacked bar per row, one segment per column
    cluster_palette : optional sequence of colours, blended into a colormap for the segments
    xlabel_rotation : accepted for compatibility; currently not used by the function

    Returns
    =======
    The matplotlib Figure holding the plot.
    """
    figure, axis = plt.subplots(dpi=300)
    figure.patch.set_facecolor("white")

    # Without a palette, pandas falls back to its default colours (colormap=None).
    if cluster_palette is None:
        colormap = None
    else:
        colormap = sns.palettes.blend_palette(
            cluster_palette,
            n_colors=len(cluster_palette),
            as_cmap=True,
        )

    bar_options = dict(
        kind="bar",
        stacked=True,
        ax=axis,
        legend=None,
        colormap=colormap,
        grid=False,
    )
    cluster_props.plot(**bar_options)

    # The legend is drawn separately so it can sit just outside the right edge of the axes.
    axis.legend(bbox_to_anchor=(1.01, 1), frameon=False, title="Cluster")
    axis.set_ylabel("Proportion")

    return figure

In [20]:
# Fraction of B cells contributed by each study, as a one-row DataFrame.
norm_prop = pd.DataFrame(adata.obs['study'].value_counts()/adata.obs['study'].value_counts().sum()).T

In [21]:
# Absolute number of B cells per study.
counts = adata.obs['study'].value_counts()

In [22]:
# Horizontal bar chart of cells per study, with the count written at the end of each bar.
plt.figure(figsize=(5, 1.5))
counts.plot( kind="barh", 
        stacked=True) 
for i, v in enumerate(counts):
    plt.text(v, i, " "+str(v), color='black', va='center')


In [23]:
# Single stacked bar: one segment per study, stacked by accumulating `bottom`.
plt.figure(figsize=(1,4))

bottom = 0
for item, count in counts.items():
    plt.bar('Bcells cells', count, bottom=bottom, label=item)
    bottom += count
plt.grid(False)
plt.ylabel('Studies')
plt.legend(loc = "lower right", fancybox = False)

In [24]:
# Two-level age covariate: 'fetal' becomes 'prenatal', every other value 'postnatal'.
adata.obs['age_group2'] = adata.obs['age_group'].apply(lambda x: 'prenatal' if x == 'fetal' else
                                                      'postnatal')
# Number of samples in total, and number of samples with more than 5 cells.
len(adata.obs['sample'].value_counts().index)
sum(adata.obs['sample'].value_counts()>5)

# Names of the samples that have more than 5 cells.
filt_samples = adata.obs['sample'].value_counts().index[adata.obs['sample'].value_counts()>5]

# Keep only cells from those samples; the shape is shown before and after filtering.
adata.shape
adata = adata[adata.obs['sample'].isin(filt_samples),:].copy()
adata.shape

In [25]:
def run_scvi2(adata, batch_hv="age_group", batch_scvi="sample", \
             cat_cov_scvi=["DonorID", "10X_version", "Sex", "Age_group"], cont_cov_scvi=["percent_mito"], \
             include_genes=[], exclude_cc_genes=True, exclude_mt_genes=True, 
             exclude_vdjgenes = True, max_epochs = 350, vae_name="", **kwargs):
    """
    Select features, train an scVI model on the raw counts and embed/cluster the cells.

    Input
    =====
    adata : AnnData object with raw counts in `adata.layers['counts']`
    batch_hv : key of `adata.obs` used as `batch_key` for highly-variable-gene selection
    batch_scvi : key of `adata.obs` used as the scVI batch key
    cat_cov_scvi : categorical covariate keys passed to `SCVI.setup_anndata` (or None)
    cont_cov_scvi : continuous covariate keys passed to `SCVI.setup_anndata` (or None)
    include_genes : extra genes added to the highly variable genes
    exclude_cc_genes : drop the genes listed in the cell-cycle gene file
    exclude_mt_genes : drop genes whose name starts with 'MT-'
    exclude_vdjgenes : drop genes matching '^TR[AB][VDJ]|^IG[HKL][VDJC]'
    max_epochs : number of training epochs, forwarded to `vae.train`
    vae_name : currently unused
    **kwargs : routed by name to the `scvi.model.SCVI` constructor (e.g. n_layers, n_latent)
        and/or to `vae.train` (e.g. batch_size); names matching neither are ignored

    Returns
    =======
    dict with
      'data' : copy of `adata` carrying the new 'X_scVI' and 'X_umap' embeddings, the
               neighbour graph (`obsp`), `uns` and the 'leiden' clustering
      'vae'  : the trained scVI model

    Side effects: prints the number of selected genes and the training matrix shape, and
    draws two UMAP panels.
    """
    import re

    # The list defaults in the signature are only read, never mutated, so sharing them
    # between calls is safe.
    adata_scvi = sc.AnnData(X=adata.layers['counts'].copy(), obs=adata.obs.copy(), var=adata.var.copy())
    # Remove cite genes. Assumes they are the last 150 variables - TODO confirm.
    # .copy() gives a real object, so the HVG call below does not write into a view.
    adata_scvi = adata_scvi[:, 0:-150].copy()

    sc.pp.highly_variable_genes(adata_scvi, flavor="seurat_v3", n_top_genes=3000, batch_key=batch_hv)
    hv_genes = adata_scvi.var.loc[adata_scvi.var['highly_variable']].index.tolist()
    # dict.fromkeys de-duplicates while keeping a deterministic order; a set would order
    # the genes differently from run to run.
    selected_genes = list(dict.fromkeys(hv_genes + list(include_genes)))
    print(len(selected_genes))

    # Collect everything to exclude first and filter once. Removing items from the list
    # while iterating over it skips the element that follows each removal.
    excluded = set()
    if exclude_cc_genes:
        with open('/nfs/team205/vk8/processed_data/regev_lab_cell_cycle_genes.txt') as handle:
            excluded.update(line.strip() for line in handle)
    if exclude_mt_genes:
        excluded.update(adata.var_names[adata.var_names.str.startswith('MT-')])
    if exclude_vdjgenes:
        vdj_pattern = re.compile('^TR[AB][VDJ]|^IG[HKL][VDJC]')
        excluded.update(gene for gene in selected_genes if vdj_pattern.search(gene))
    selected_genes = [gene for gene in selected_genes if gene not in excluded]

    adata_scvi = adata_scvi[:, selected_genes].copy()
    print(adata_scvi.shape)
    scvi.model.SCVI.setup_anndata(adata_scvi, batch_key=batch_scvi,
                                  categorical_covariate_keys=cat_cov_scvi,
                                  continuous_covariate_keys=cont_cov_scvi)

    # Route the extra keyword arguments by name. co_varnames holds parameter names (and
    # local variable names) of the respective function.
    scvi_kwargs = {k: v for k, v in kwargs.items() if k in scvi.model.SCVI.__init__.__code__.co_varnames}
    vae = scvi.model.SCVI(adata_scvi, **scvi_kwargs)
    train_kwargs = {k: v for k, v in kwargs.items() if k in vae.train.__code__.co_varnames}
    # max_epochs is a named parameter of this function, so it never appears in **kwargs
    # and has to be forwarded explicitly; otherwise the requested value is ignored.
    vae.train(max_epochs=max_epochs, **train_kwargs)

    adata_scvi.obsm["X_scVI"] = vae.get_latent_representation()
    sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
    sc.tl.leiden(adata_scvi)
    sc.tl.umap(adata_scvi)

    # Put the results back on a copy of the input, which keeps all genes and layers.
    adata_raw_scvi = adata.copy()
    adata_raw_scvi.obsm['X_scVI'] = adata_scvi.obsm['X_scVI'].copy()
    adata_raw_scvi.obsm['X_umap'] = adata_scvi.obsm['X_umap'].copy()
    adata_raw_scvi.obsp = adata_scvi.obsp.copy()
    adata_raw_scvi.uns = adata_scvi.uns.copy()
    adata_raw_scvi.obs['leiden'] = adata_scvi.obs['leiden'].copy()

    sc.pl.umap(
        adata_raw_scvi,
        color=['annotation_level_2', 'annotation_level_3', 'doublet_score'],
        frameon=False,
        ncols=2,
    )
    sc.pl.umap(
        adata_raw_scvi,
        color=['age_group','study', 'chemistry_simple', 'donor'])

    return {'data': adata_raw_scvi, 'vae': vae}

In [26]:
# First scVI integration of the B-cell subset: donor as batch key, with chemistry,
# age group and study as categorical covariates and no continuous covariates.
adata_Bcells_scvi_run = run_scvi2(adata,  batch_hv="age_group2", batch_scvi="donor", 
          cat_cov_scvi=['chemistry_simple','age_group2','study'], cont_cov_scvi = None, include_genes=[], 
          exclude_cc_genes=True, exclude_vdjgenes=True, exclude_mt_genes = True, 
          n_layers=2, n_latent=30, max_epochs=350, batch_size=2000)

In [27]:
# Work on a copy of the returned AnnData ('data'); the model stays in the result dict under 'vae'.
adata_scvi = adata_Bcells_scvi_run['data'].copy()

In [28]:
# Leiden clustering at three higher resolutions, each stored under its own obs key.
sc.tl.leiden(adata_scvi, resolution = 1.4, key_added = "leiden_r1.4")

sc.tl.leiden(adata_scvi, resolution = 1.8, key_added = "leiden_r1.8")

sc.tl.leiden(adata_scvi, resolution = 2.2, key_added = "leiden_r2.2")

In [29]:
# Compare the clusterings with the doublet score and the existing level-3 annotation.
sc.pl.umap(adata_scvi, color = ['doublet_score', 'leiden',"leiden_r1.4", "leiden_r1.8", "leiden_r2.2", 'annotation_level_3'], 
          legend_loc = "on data", legend_fontsize = 6)

In [35]:
# Display the marker dictionary for reference.
B_mrkrs

In [36]:
# Expression of selected B-cell genes on the UMAP.
sc.pl.umap(adata_scvi, color = ['CD79A', 'TCL1A','TNFRSF13B', 'FCRL4', 'CLECL1', 
                                'CD69', 'FOS', 'FOSB', 'DUSP1', 'CD83',
                               'XBP1', 'PRDM1', 'FKBP11'])

In [37]:
# Immunoglobulin heavy-chain gene expression.
sc.pl.umap(adata_scvi, color = ig_genes)

In [38]:
# Broad lineage markers.
sc.pl.umap(adata_scvi, color = major_mrkrs, 
          legend_loc = "on data", legend_fontsize = 6)

In [39]:
# Clusters next to doublet score, MKI67 and the T-cell genes CD3E/CD3G/CD8A.
sc.pl.umap(adata_scvi, color = ["leiden_r1.8", "annotation_level_3", "doublet_score", "MKI67", "CD3E", "CD3G", "CD8A"], 
          legend_loc = "on data", legend_fontsize = 6, frameon = False)

In [49]:
# Drop the cells of leiden_r1.8 clusters '20' and '21'.
# NOTE(review): the 'dbrm' suffix suggests these were judged to be doublets - confirm.
adata_scvi_dbrm1 = adata_scvi[~adata_scvi.obs["leiden_r1.8"].isin(['20', '21']),:].copy()

In [42]:
# Shapes before and after the removal.
adata_scvi.shape
adata_scvi_dbrm1.shape

In [48]:
# CD3E/CD3G/CD8A expression per leiden_r1.8 cluster (on the object before removal).
sc.pl.dotplot(adata_scvi, var_names = ["CD3E", "CD3G", "CD8A"], groupby = "leiden_r1.8")

In [50]:
# Same overview as before, on the filtered object (still the embedding of the first run).
sc.pl.umap(adata_scvi_dbrm1, color = ["leiden_r1.8", "annotation_level_3", "doublet_score", "MKI67", "CD3E", "CD3G", "CD8A"], 
          legend_loc = "on data", legend_fontsize = 6, frameon = False)

In [51]:
# Second scVI run on the filtered cells, with the same settings as the first run.
adata_Bcells_scvi_dbrm = run_scvi2(adata_scvi_dbrm1,  batch_hv="age_group2", batch_scvi="donor", 
          cat_cov_scvi=['chemistry_simple','age_group2','study'], cont_cov_scvi = None, include_genes=[], 
          exclude_cc_genes=True, exclude_vdjgenes=True, exclude_mt_genes = True, 
          n_layers=2, n_latent=30, max_epochs=350, batch_size=2000)

In [52]:
# Copy of the AnnData returned by the second run.
adata_scvi_dbrm = adata_Bcells_scvi_dbrm['data'].copy()

In [53]:
# 'leiden' comes from the second run; 'leiden_r1.8' is carried over in obs from the first run.
sc.pl.umap(adata_scvi_dbrm, color = ["leiden", "leiden_r1.8", "annotation_level_3", "doublet_score", "MKI67", "CD3E", "CD3G", "CD8A"], 
          legend_loc = "on data", legend_fontsize = 6, frameon = False)

In [57]:
sc.pl.umap(adata_scvi_dbrm, color = ["leiden_r1.8", "annotation_level_3", "MKI67"], 
          legend_loc = "on data", legend_fontsize = 6, frameon = False)

In [58]:
# Selected B-cell genes on the new embedding.
sc.pl.umap(adata_scvi_dbrm, color = ["VPREB3", "CD79A", "CD27", "TCL1A", 'CD69', \
                                "XBP1", "PRDM1", "FKBP11",  "TNFRSF13B", "FCRL4", "CLECL1", "MKI67"])

In [59]:
# Immunoglobulin heavy-chain gene expression on the new embedding.
sc.pl.umap(adata_scvi_dbrm, color = ig_genes)

In [70]:
# Leiden clustering of the filtered object at three resolutions (keys 'leiden_res*').
sc.tl.leiden(adata_scvi_dbrm, resolution = 0.8, key_added = "leiden_res0.8")
sc.tl.leiden(adata_scvi_dbrm, resolution = 1.8, key_added = "leiden_res1.8")
sc.tl.leiden(adata_scvi_dbrm, resolution = 1.4, key_added = "leiden_res1.4")

In [73]:
# Sub-cluster only cluster '6' of leiden_res0.8; the result is stored as 'leiden_res0.8R',
# which the final label assignment later reads.
sc.tl.leiden(adata_scvi_dbrm, resolution = 0.4, restrict_to=["leiden_res0.8", ['6']],  key_added = "leiden_res0.8R")

In [74]:
# Compare all clusterings with the existing level-3 annotation.
sc.pl.umap(adata_scvi_dbrm, color = ["annotation_level_3", "leiden", "leiden_res0.8", "leiden_res0.8R", "leiden_res1.4", "leiden_res1.8"], legend_loc = "on data", ncols = 2, frameon = False)

In [75]:
# CellTypist is used below to transfer labels from three reference models.
import celltypist
from celltypist import models

In [76]:
# Download the CellTypist models, forcing an update of already downloaded ones.
models.download_models(force_update = True)

In [77]:
# Loaded model object; the annotate() calls below pass model names instead of this object.
model = models.Model.load(model = 'Pan_Fetal_Human.pkl')

In [92]:
# Gene-only object selected through the boolean `cite` column of var.
# NOTE: this variable is reassigned in the cell below (In [117]).
adata_scvi_dbrm_citerm = adata_scvi_dbrm[:, adata_scvi_dbrm.var['cite'] == False].copy()

In [102]:
adata_scvi_dbrm

In [117]:
# Rebuild the object with the raw counts layer as X, carrying over obs, var, embeddings,
# graphs and uns, then drop the last 150 variables.
# NOTE(review): the last 150 variables are presumably the CITE-seq features - confirm.
adata_scvi_dbrm_citerm = sc.AnnData(X=adata_scvi_dbrm.layers['counts'].copy(), obs=adata_scvi_dbrm.obs.copy(), var=adata_scvi_dbrm.var.copy(),
                                   obsm = adata_scvi_dbrm.obsm, obsp = adata_scvi_dbrm.obsp, uns = adata_scvi_dbrm.uns)
adata_scvi_dbrm_citerm = adata_scvi_dbrm_citerm[:,0:-150]

In [121]:
# Per-cell totals before normalisation.
adata_scvi_dbrm_citerm.X.sum(1)

In [122]:
# Scale every cell to a total of 10,000 counts.
# NOTE(review): no log1p step is visible in this notebook; CellTypist presumably expects
# log1p-normalised input - confirm.
sc.pp.normalize_total(adata_scvi_dbrm_citerm, target_sum = 1e4)

In [110]:
# Per-cell totals of the normalised object and of the original object, for comparison.
adata_scvi_dbrm_citerm.X.sum(1)
adata_scvi_dbrm.X.sum(1)

In [111]:
# Not run; predict cell identities using this loaded model.
#predictions = celltypist.annotate(adata_2000, model = model, majority_voting = True)
# Alternatively, just specify the model name (recommended as this ensures the model is intact every time it is loaded).
predictions = celltypist.annotate(adata_scvi_dbrm_citerm, model = 'Pan_Fetal_Human.pkl', majority_voting = True)

In [149]:
# Not run; predict cell identities using this loaded model.
#predictions = celltypist.annotate(adata_2000, model = model, majority_voting = True)
# Alternatively, just specify the model name (recommended as this ensures the model is intact every time it is loaded).
predictions2 = celltypist.annotate(adata_scvi_dbrm_citerm, model = 'Developing_Human_Thymus.pkl', majority_voting = True)

In [152]:
# Not run; predict cell identities using this loaded model.
#predictions = celltypist.annotate(adata_2000, model = model, majority_voting = True)
# Alternatively, just specify the model name (recommended as this ensures the model is intact every time it is loaded).
predictions3 = celltypist.annotate(adata_scvi_dbrm_citerm, model = 'Immune_All_High.pkl', majority_voting = True)

In [141]:
# Each to_adata() call below reassigns adata_scvi_dbrm_citerm with prefixed prediction columns.
# The prefix "pan-fetal" has no trailing dash, hence column names such as
# 'pan-fetalmajority_voting' in the plot further down.
# NOTE(review): that plot uses columns from all three prefixes on the final object, so
# to_adata() presumably adds its columns to a shared underlying AnnData - confirm.
adata_scvi_dbrm_citerm = predictions.to_adata(prefix = "pan-fetal")

In [150]:
adata_scvi_dbrm_citerm = predictions2.to_adata(prefix = "fet-thymus-")

In [153]:
adata_scvi_dbrm_citerm = predictions3.to_adata(prefix = "pip-")

In [139]:
# Copy the Pan_Fetal_Human predictions into the full object as three obs columns;
# 'majority_voting' is what the final label assignment reads.
adata_scvi_dbrm.obs[['predicted_labels', 'over_clustering', 'majority_voting']] = predictions.predicted_labels

In [144]:
adata_scvi_dbrm_citerm.obs.columns

In [154]:
# Majority-voting labels and confidence scores of the three models, side by side.
sc.pl.umap(adata_scvi_dbrm_citerm, color = ['pan-fetalmajority_voting', 'pan-fetalconf_score',
                                           'fet-thymus-majority_voting', 'fet-thymus-conf_score',
                                           'pip-majority_voting', 'pip-conf_score'],  
           legend_loc = "on data", legend_fontsize = 5, cmap = "jet", ncols = 2)

In [157]:
# Previous annotation, refined clusters and the Pan_Fetal_Human majority vote.
sc.pl.umap(adata_scvi_dbrm, color = ['annotation_level_3',"leiden_res0.8R", 'majority_voting'], frameon = False, legend_loc = "on data", 
          legend_fontsize = 7)

In [158]:
# Labels available from the CellTypist majority vote.
adata_scvi_dbrm.obs['majority_voting'].cat.categories

In [161]:
# Final fine annotation, decided per cell in this order of priority:
#   1. the CellTypist majority vote, for the PRO_B / LATE_PRO_B / LARGE_PRE_B /
#      SMALL_PRE_B / CYCLING_B labels;
#   2. otherwise the refined cluster in 'leiden_res0.8R' (naive, memory or plasma);
#   3. otherwise 'B-naive' as the fallback.
# Cluster ids are compared as strings.
adata_scvi_dbrm.obs['annotation_level_3_new'] = adata_scvi_dbrm.obs.apply(lambda x:
                                                'pro_B' if x['majority_voting'] == 'PRO_B' else
                                                 'late_pro_B' if x['majority_voting'] == 'LATE_PRO_B' else
                                                'large_pre_B' if x['majority_voting'] == 'LARGE_PRE_B' else
                                                'small_pre_B' if x['majority_voting'] == 'SMALL_PRE_B' else
                                                'B-Prolif' if x['majority_voting'] ==  'CYCLING_B' else
                                                'B-naive' if x["leiden_res0.8R"] in ['2','4', '5', '9'] else
                                                'B-memory' if x["leiden_res0.8R"] in ['0', '1', '3', '7'] else
                                                'B-plasma' if x["leiden_res0.8R"]  == '10' else 'B-naive', axis = 1
                                                 )

In [181]:
# Put the labels in developmental order for legends and dot plots.
# The column comes from obs.apply(), which returns plain strings (object dtype), and the
# .cat accessor raises AttributeError on a non-categorical column. Casting to category
# first makes this cell work whether or not an earlier step already converted the column.
# reorder_categories still raises ValueError if the labels present differ from this list.
adata_scvi_dbrm.obs['annotation_level_3_new'] = (
    adata_scvi_dbrm.obs['annotation_level_3_new']
    .astype('category')
    .cat.reorder_categories(
        ['pro_B', 'late_pro_B', 'large_pre_B', 'small_pre_B',
         'B-naive', 'B-memory', 'B-Prolif', 'B-plasma'])
)

In [169]:
adata_scvi_dbrm

In [182]:
# Final annotation on the UMAP.
sc.pl.umap(adata_scvi_dbrm, color = 'annotation_level_3_new', legend_loc = "on data", 
          legend_fontsize = 6)

In [189]:
# Marker genes for the dot plot, grouped by B-cell stage. This replaces the B_mrkrs
# dictionary defined near the top of the notebook.
# NOTE: 'MME' is listed under two groups ('pro_B' and 'pre->mature') and 'MKI67' under
# 'pro_B' and 'Prolif', so both genes are listed twice.
B_mrkrs = {'B':['CD79A'],
    'pro_B': ['CD19', 'VPREB1', 'MME', 'CDC45', 'MKI67'], 
 'late_pro_B': ['CD27', 'RAG1', 'DNTT'],
 'pro->pre': ['CD24', 'TNFRSF17'],
 'pre->mature' :['MME', 'IDH2', 'SPIB'],
 'B_naive': ['IGHM', 'IGHD', 'MS4A1', 'TCL1A'],
 'B_memory':["IGHG1", "TNFRSF13B", "FCRL4", "CLECL1"],
 'B_plasma':["XBP1", "PRDM1", "JCHAIN"],
 'Prolif': ['MKI67']
}

In [194]:
# Dot plot of the markers per final label, scaled per gene (standard_scale="var"),
# with per-group totals added, then saved to the figures folder.
dp_Bcells = sc.pl.dotplot(adata_scvi_dbrm, groupby = 'annotation_level_3_new', var_names = B_mrkrs, 
                      standard_scale = "var", return_fig = True)
dp_Bcells = dp_Bcells.add_totals().style(dot_edge_color='black', dot_edge_lw=0.5, cmap = "Reds")
dp_Bcells.savefig(f"{figures}Bcells_spatlas_scvi_dbrm_mrkrs.pdf")

In [191]:
# Save the trained model of the second scVI run (with its AnnData) and the annotated
# object; both names carry today's date.
adata_Bcells_scvi_dbrm['vae'].save(f'{model_folder}Bcells_spatlas_scvi_dbrm_{today}', save_anndata=True)

adata_scvi_dbrm.write(f'{results}Bcells_spatlas_scvi_dbrm_{today}.h5ad')

In [192]:
# UMAP coloured by the final annotation, saved as PDF.
umap_ctype = sc.pl.umap(adata_scvi_dbrm, color = ['annotation_level_3_new'], legend_loc = "on data",
          legend_fontsize = 7, frameon = False, title = '', return_fig=True)
umap_ctype.savefig(f"{figures}Bcells_spatlas_scvi_dbrm_umap.pdf", bbox_inches='tight')

In [193]:
# UMAPs coloured by the covariates (age group, study, chemistry, donor), saved as PDF.
umap_cov = sc.pl.umap(adata_scvi_dbrm, color = ['age_group2', 'study', 'chemistry_simple', 'donor'], 
                      frameon = False, title = ['', '', '', ''], ncols = 2, return_fig=True)
umap_cov.savefig(f"{figures}Bcells_spatlas_scvi_dbrm_umap_covariates.pdf", bbox_inches='tight')