In [None]:
# Script for running BBKNN on subsetted T cells 
# By Louise Baldwin
# Takes annotated .h5ad as input

In [None]:
# Script for testing the batch correction of T cells using BBKNN
# By Louise Baldwin
# takes annotated h5ad as input.

###################
# Set up
###################

# import packages
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import os
import bbknn as bb
# import scvelo as scv
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

import anndata
import joblib
import sys

from matplotlib import rcParams
from numpy import sin
from tqdm.auto import tqdm
# from plotnine import *
# from sctoolkit.specificity import get_gene_specificity_metricsv2
# from sctoolkit.sctransform import sctransform
# from sctoolkit.utils import run_spring, dotplot_spring, sort_by_correlation, plot_significance_dotplot
# from sctoolkit.revigo import revigo
# from sctoolkit.proportions import plot_proportion_barplot, dirichletreg, plot_proportion_barplot_with_ncells, get_proportions_per_channel

# Directories.
# The project root defaults to the original cluster location but can be
# overridden with the MSC_PROJECT_DIR environment variable, so the notebook can
# be run on another machine without editing this cell.
project_dir = os.environ.get(
    "MSC_PROJECT_DIR",
    "/share/ScratchGeneral/loubal/projects/MSC/mouse-single-cell",
)
os.chdir(project_dir)

# All paths below are relative to the project root set above.
in_file = "data/processed/annotated.h5ad"
results_file = "data/processed/Subset_Tcells_BBKNN.h5ad"
figdir = "outs/BBKNN_after_Tcellsubset/figures/"
tabdir = "outs/BBKNN_after_Tcellsubset/tables/"
for out_dir in (figdir, tabdir):
    os.makedirs(out_dir, exist_ok=True)

# set parameters for scanpy
# verbosity: errors (0), warnings (1), info (2), hints (3), detailed traceback (4)
sc.settings.verbosity = 3
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
# Figures written through the `save=` argument of scanpy plots go to `figdir`.
sc.settings.figdir = figdir

In [None]:
# Load the annotated dataset (all cell types) from the path set in the config cell.
ad = sc.read(in_file)

In [None]:
# Bare expression: as the last statement of the cell it shows the object's repr.
ad

In [None]:
# UMAP of the full dataset coloured by the existing cell-type annotation.
sc.pl.umap(
    ad,
    color="Cell_type",
)

In [None]:
# Keep only the cells annotated as CD4 or CD8 T cells.
l = ['CD4 T cell', 'CD8 T cell']
# .copy() turns the boolean-indexed view into an independent AnnData, so the
# in-place preprocessing that follows does not operate on a view of `ad`.
adata = ad[ad.obs["Cell_type"].isin(l)].copy()

In [None]:
# NOTE(review): if annotated.h5ad already stores log-transformed values in X,
# this log1p is applied a second time -- confirm what the input file holds.
sc.pp.log1p(adata)

# Restrict to the highly variable genes flagged in `adata.var`; .copy() gives
# sc.pp.scale a real AnnData instead of a view to modify in place.
adata = adata[:, adata.var.highly_variable].copy()
sc.pp.scale(adata, max_value=10)

# NOTE(review): no PCA is computed in this notebook. If the input file already
# carries obsm['X_pca'], presumably that embedding (from before subsetting and
# scaling) is what neighbors uses -- confirm whether sc.pp.pca should be run here.
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=10)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution = 0.8, key_added = "leiden_0.8")
sc.pl.umap(adata,color=['leiden_0.8'],legend_loc='on data')

In [None]:
# One UMAP panel per marker gene, plus the leiden_0.8 clustering, before BBKNN.
umap_panels_before = [
    'Cd8a', 'Cd4', 'Foxp3', 'Ncr1', 'Tigit', 'Pdcd1', 'Tcf7', 'Icos', 'Tox',
    'Cd40lg', 'Sell', 'Ncam1', 'Cxcr5', 'Ifng', 'Gata3', 'Bcl6', 'Mki67',
    "leiden_0.8",
]
sc.pl.umap(
    adata,
    color=umap_panels_before,
    s=10,
    color_map='viridis',
    legend_loc='on data',
    legend_fontsize='small',
    wspace=0.3,
    save="_markers-beforeBBKNN.pdf",
)

In [None]:
# Batch distribution on the UMAP before BBKNN; the figure is also saved.
sc.pl.umap(
    adata,
    color="batch",
    frameon=False,
    save="Tcells_before_BBKNN.pdf",
)

In [None]:
# Run BBKNN across batches; 'batch' is the default batch_key, spelled out here.
bb.bbknn(adata, batch_key="batch")

In [None]:
# Recompute the UMAP and a resolution-1.2 Leiden clustering after BBKNN,
# then colour the embedding by batch.
sc.tl.umap(adata)
sc.tl.leiden(adata, key_added="leiden_1.2", resolution=1.2)
sc.pl.umap(adata, color=["batch"])

In [None]:
# in_file=("data/processed/Subset_Tcells_BBKNN.h5ad")
# adata=sc.read_h5ad(in_file)

In [None]:
# Batch distribution on the UMAP after BBKNN; the figure is also saved.
sc.pl.umap(
    adata,
    color="batch",
    frameon=False,
    save="_batch_afterBBKNN.pdf",
)

In [None]:
# Leiden clustering at resolution 0.8; this writes to the same obs column
# ("leiden_0.8") that the clustering before BBKNN used, replacing it.
sc.tl.leiden(adata, key_added="leiden_0.8", resolution=0.8)

In [None]:
# UMAP coloured by the resolution-0.8 clusters.
sc.pl.umap(
    adata,
    color="leiden_0.8",
)

In [None]:
# Leiden clustering at resolution 1, stored in obs["leiden_1"].
sc.tl.leiden(adata, key_added="leiden_1", resolution=1)

In [None]:
# UMAP coloured by the resolution-1 clusters; the figure is also saved.
sc.pl.umap(
    adata,
    color="leiden_1",
    save="_leiden_1.png",
)

In [None]:
# One UMAP panel per marker gene, plus the three Leiden clusterings, after BBKNN.
umap_panels_after = [
    'Cd8a', 'Cd4', 'Foxp3', 'Ncr1', 'Tigit', 'Pdcd1', 'Tcf7', 'Icos', 'Tox',
    'Cd40lg', 'Sell', 'Ncam1', 'Cxcr5', 'Ifng', 'Gata3', 'Bcl6', 'Mki67',
    "leiden_0.8", "leiden_1", "leiden_1.2",
]
sc.pl.umap(
    adata,
    color=umap_panels_after,
    s=10,
    color_map='viridis',
    legend_loc='on data',
    legend_fontsize='small',
    wspace=0.3,
    save="_markers_afterBBKNN.pdf",
)

In [None]:
# UMAP coloured by ReactionID after BBKNN.
# The save suffix names the plotted variable and carries an explicit extension;
# the previous suffix ("_batch_AfterBBKNN") differed from the batch plot's file
# name only by letter case.
sc.pl.umap(adata, color="ReactionID", frameon=False, save="_ReactionID_afterBBKNN.pdf")

In [None]:
# sc.tl.leiden(adata, resolution=0.8, key_added = "leiden_0.8")

In [None]:
# This seems a bit too much, and doesn't separate the CD4- and CD8-positive cluster (cluster 10 at resolution 1.2)
# sc.tl.leiden(adata, resolution=1.4, key_added = "leiden_1.4")

In [None]:
# Same marker panels with the three Leiden clusterings, saved as a PNG.
umap_panels_png = [
    'Cd8a', 'Cd4', 'Foxp3', 'Ncr1', 'Tigit', 'Pdcd1', 'Tcf7', 'Icos', 'Tox',
    'Cd40lg', 'Sell', 'Ncam1', 'Cxcr5', 'Ifng', 'Gata3', 'Bcl6', 'Mki67',
    'leiden_1', "leiden_0.8", "leiden_1.2",
]
sc.pl.umap(
    adata,
    color=umap_panels_png,
    s=10,
    color_map='viridis',
    legend_loc='on data',
    legend_fontsize='small',
    wspace=0.3,
    save="_markers.png",
)

In [None]:
# UMAP coloured by batch (not saved).
sc.pl.umap(
    adata,
    color="batch",
)

In [None]:
# UMAP coloured by tissue (not saved).
sc.pl.umap(
    adata,
    color="Tissue",
)

In [None]:
def cluster_small_multiples(adata, Tissue, size=60, frameon=False, legend_loc=None, **kwargs):
    """Plot one UMAP panel per category of ``adata.obs[Tissue]``.

    For each category a categorical indicator column (is the cell in this
    category or not) is added to a copy of ``adata``, together with a two-colour
    palette: light grey first, then the category's colour taken from
    ``adata.uns[Tissue + '_colors']``. All indicator columns are then drawn in a
    single ``sc.pl.umap`` call. ``adata`` itself is not modified.

    Parameters
    ----------
    adata
        Object with a categorical column ``obs[Tissue]`` and a matching colour
        list in ``uns[Tissue + '_colors']``.
    Tissue
        Name of the categorical ``obs`` column to split into panels.
    size
        Point size passed to ``sc.pl.umap``.
    frameon
        Passed to ``sc.pl.umap``.
    legend_loc
        Passed to ``sc.pl.umap``.
    **kwargs
        Further keyword arguments for ``sc.pl.umap``. ``ncols`` defaults to 2
        when not supplied.

    Raises
    ------
    KeyError
        If ``adata.uns`` has no ``Tissue + '_colors'`` entry.
    """
    categories = adata.obs[Tissue].cat.categories
    palette = adata.uns[Tissue+'_colors']

    tmp = adata.copy()
    for i, clust in enumerate(categories):
        tmp.obs[clust] = adata.obs[Tissue].isin([clust]).astype('category')
        tmp.uns[clust+'_colors'] = ['#d3d3d3', palette[i]]

    # Previously ncols=2 was hard-coded next to **kwargs, so a caller passing
    # ncols got a duplicate-keyword TypeError; now a caller's value wins.
    kwargs.setdefault('ncols', 2)

    # `groups` is taken from the last indicator column built in the loop above:
    # every category after the first one.
    sc.pl.umap(
        tmp,
        groups=tmp.obs[clust].cat.categories[1:].values,
        color=categories.tolist(),
        size=size,
        frameon=frameon,
        legend_loc=legend_loc,
        **kwargs,
    )


# size=5 is now passed explicitly: the function used to ignore its `size`
# argument and always draw with size=5, so this keeps the figure unchanged.
with rc_context({'figure.figsize': (3, 2.5)}):
  cluster_small_multiples(adata, 'Tissue', size=5)

In [None]:
# Save the T-cell object to the results path set in the config cell.
adata.write(filename=results_file)

In [None]:
# ## This is too much

# # ridge regression mixes the batches a bit better, and the confounder key Cell_type helps preserve known biological variance
# # how to pick the confounder key
# # lets pick 0.8 for now, because it splits up the cd8s
# bb.ridge_regression(adata, batch_key=['batch'], confounder_key="Cell_type")
# sc.pp.pca(adata)
# bb.bbknn(adata, batch_key='batch')
# sc.tl.umap(adata)
# sc.pl.umap(adata, color=['batch','leiden_0.8'])

In [None]:
# sc.pl.umap(adata, color="Cell_type")

In [None]:
#sc.pl.umap(adata, color=['leiden_0.8'], frameon=False, size=2)

In [None]:
# def cluster_small_multiples(adata, Tissue, size=60, frameon=False, legend_loc=None, **kwargs):
#     tmp = adata.copy()
#     for i,clust in enumerate(adata.obs[Tissue].cat.categories):
#         tmp.obs[clust] = adata.obs[Tissue].isin([clust]).astype('category')
#         tmp.uns[clust+'_colors'] = ['#d3d3d3', adata.uns[Tissue+'_colors'][i]]
#     sc.pl.umap(tmp, groups=tmp.obs[clust].cat.categories[1:].values, color=adata.obs[Tissue].cat.categories.tolist(), size=5, ncols=2, frameon=False, legend_loc=legend_loc, **kwargs)


# with rc_context({'figure.figsize': (3, 2.5)}):
#   cluster_small_multiples(adata, 'Tissue')

In [None]:
# Reload the saved T-cell object from disk.
adata = sc.read_h5ad(filename=results_file)

In [None]:
######## Now for the markers
# Make sure uns['log1p'] has a 'base' entry, creating the dict if it is absent.
# NOTE(review): presumably this works around rank_genes_groups failing on a
# reloaded .h5ad whose uns['log1p'] lacks 'base' -- confirm against the
# scanpy version in use.
adata.uns.setdefault('log1p', {})["base"] = None

# Rank genes per leiden_1 cluster; results are stored under a dedicated key.
sc.tl.rank_genes_groups(adata, groupby='leiden_1', key_added='rank_genes_leiden_1')

# Plot from the key written above. Without `key=` the plot reads the default
# 'rank_genes_groups' entry, which this notebook never computes.
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key='rank_genes_leiden_1')

In [None]:
# UMAP of the reloaded object coloured by the resolution-1 clusters; saved
# with the "_leiden_1.png" suffix.
sc.pl.umap(
    adata,
    color='leiden_1',
    frameon=False,
    save="_leiden_1.png",
)

In [None]:
# Flatten the per-cluster ranking results into one 2-D array whose columns are:
# gene name, score, adjusted p-value, log fold change, cluster label.
results = adata.uns['rank_genes_leiden_1']
stat_fields = ('names', 'scores', 'pvals_adj', 'logfoldchanges')

# The first row is a placeholder of zeros that seeds the stack; consumers of
# `out` must skip it (out[1:]).
pieces = [np.array([[0,0,0,0,0]])]
for group in results['names'].dtype.names:
    n_genes = len(results['names'][group])
    cluster_col = np.array([group] * n_genes).astype('object')
    per_group = np.vstack([results[field][group] for field in stat_fields] + [cluster_col]).T
    pieces.append(per_group)

out = np.vstack(pieces)

In [None]:
# `out` begins with a placeholder row of zeros; drop it and name the columns so
# the CSV holds only real marker rows under readable headers.
deg_columns = ['Gene', 'scores', 'pval_adj', 'lfc', 'cluster']
pd.DataFrame(out[1:], columns=deg_columns).to_csv(tabdir+"allDEGS_1.csv")

In [None]:
# Build the marker table from the leiden_1 ranking. `key=` is required because
# the ranking was stored under 'rank_genes_leiden_1', not the default
# 'rank_genes_groups'. group=None returns all clusters in one frame.
# (The earlier DataFrame built from `out` was overwritten immediately and has
# been removed.)
markers = sc.get.rank_genes_groups_df(adata, group=None, key='rank_genes_leiden_1')

# Keep genes with adjusted p-value below 0.05 and log fold change above 0.5.
markers = markers[(markers.pvals_adj < 0.05) & (markers.logfoldchanges > .5)]

markers.to_csv(tabdir+'markers_1.csv')

# Last expression of the cell, so the filtered table is displayed.
markers