# Fetal immune atlas - BBKNN Integration

In this notebook we perform integration of datasets from different organs

In [27]:
import os,sys
import numpy as np 
import pandas as pd
import scanpy as sc
import scanpy.external as sce
import anndata
from bbknn import bbknn

### Load merged dataset 

In [7]:
merged_raw = sc.read_h5ad('/nfs/team205/ed6/data/Fetal_immune/Pan_fetal/working_data/PAN.A01.v01.entire_data_normalised_log.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [14]:
merged_raw.obs['batch'] = [x+y for x,y in zip(merged_raw.obs['organ'],merged_raw.obs['method'])]
merged_raw.obs['bbk'] = [x+y for x,y in zip(merged_raw.obs['donor'],merged_raw.obs['method'])]

In [41]:
set(merged_raw.obs['batch'])

{'BM3GEX',
 'BM5GEX',
 'KI3GEX',
 'KI5GEX',
 'LI3GEX',
 'LI5GEX',
 'SK3GEX',
 'SK5GEX',
 'SP3GEX',
 'SP5GEX',
 'TH(pharyn)3GEX',
 'TH3GEX',
 'TH5GEX',
 'YS3GEX',
 'YS5GEX'}

### Dataset subsetting with Geometric sketching 
To speed up the integration from a large number of cells we will use the geometric sketching approach introduced by [Hie et al.(2019)](https://www.cell.com/cell-systems/fulltext/S2405-4712(19)30152-8). Briefly, this is a strategy to subset a large single-cell dataset by sampling not uniformly across the whole dataset, but sampling in a _plaid covering_ of the dataset, which approximates the geometry of the given single-cell data as a union of equal-sized boxes. As dimensions for plaid covering we use diffusion components and PCs.

In [22]:
## Load cell cycle genes
cwd = '/nfs/team205/ed6/data/Fetal_immune/Pan_fetal/JP_archive/18_Pan_fetal/scjp'
sys.path.append(cwd)
from scjp.genes import cc_genes

In [42]:
from geosketch import gs

def remove_geneset(adata,geneset):
    adata = adata[:,~adata.var_names.isin(list(geneset))].copy()
    return adata

def get_subset(idata, select, cc_genes=cc_genes, log=False,raw=True):
    if raw:
        adata = sc.AnnData(idata[select].raw.X)
        adata.var = idata.raw.var
    else:
        adata = sc.AnnData(idata[select].X)
        adata.var = idata.var
    adata.obs = idata.obs[select]
    adata.raw = adata.copy()
    #adata.X = scipy.sparse.csr_matrix(np.exp(adata.X.todense())-1)
    sc.pp.filter_genes_dispersion(adata,log=log)
    if log:
        sc.pp.log1p(adata)
    sc.pp.scale(adata,max_value=10)
    if len(cc_genes)>0:
        print('removing cc_genes...')
        adata = remove_geneset(adata,cc_genes)
    sc.pp.pca(adata,n_comps = np.min([50,adata.X.shape[0],adata.X.shape[1]]))
    return adata

def get_sketch(adata,key,folds=10,how='pd',min_num_per_key=500,start='filter',raw=True):
    '''
    geometric sketching based on diffusion map and pca
    - key: key for the batch information
    - folds: folds to subsample
    - min_num_per_key: minimun number to sample
    '''
    sketch_index = []
    for smp in set(adata.obs[key]):
        print(smp)
        c = adata.obs[key] == smp
        print('from:',sum(c))
        
        if start=='filter':
            sdata = get_subset(adata,c,raw=raw)
        else:        
            sdata = adata[c]
            sc.pp.filter_genes_dispersion(sdata)
            sc.pp.pca(sdata)
        
        if 'd' in how:
            sc.pp.neighbors(sdata)
            sc.tl.diffmap(sdata)

        N = np.max([np.int(np.sum(c)/folds),np.min([min_num_per_key,np.sum(c)])])
        print('to select:',N)
        if how =='pd':
            set1 = set(sdata.obs_names[gs(sdata.obsm['X_diffmap'],N,replace=False)])
            set2 = set(sdata.obs_names[gs(sdata.obsm['X_pca'][:,:50],N,replace=False)])
            sketch_index.extend(list(set1.union(set2)))
        elif how =='p':
            set2 = set(sdata.obs_names[gs(sdata.obsm['X_pca'][:,:50],N,replace=False)])
            sketch_index.extend(list(set2))
        elif how =='d':
            set1 = set(sdata.obs_names[gs(sdata.obsm['X_diffmap'][:,:20],N,replace=False)])
            sketch_index.extend(list(set1))
        else:
            raise SystemError
        print('length of sketch:',len(sketch_index))

    return(sketch_index)

In [43]:
sketch_index = get_sketch(merged_raw,'batch',folds=5, how='pd', raw=False, min_num_per_key=10000)

TH(pharyn)3GEX
from: 4926


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


to select: 4926
length of sketch: 4926
YS3GEX
from: 17799


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/rp_tree.py", line 135:[0m
[1m@numba.njit(fastmath=True, nogil=True, parallel=True)
[1mdef euclidean_random_projection_split(data, indices, rng_state):
[0m[1m^[0m[0m
[0m
  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/utils.py", line 409:[0m
[1m@numba.njit(parallel=Tr

to select: 10000
length of sketch: 18081
SK5GEX
from: 112963


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 22592
length of sketch: 55306
BM5GEX
from: 61666


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 12333
length of sketch: 75947
LI3GEX
from: 152341


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 30468
length of sketch: 127451
SP3GEX
from: 51171


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10234
length of sketch: 143956
TH3GEX
from: 54925


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10985
length of sketch: 161928
KI5GEX
from: 4356


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


to select: 4356
length of sketch: 166284
SK3GEX
from: 75592


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 15118
length of sketch: 190797
YS5GEX
from: 41230


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10000
length of sketch: 206537
BM3GEX
from: 40621


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10000
length of sketch: 222477
LI5GEX
from: 68571


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 13714
length of sketch: 245601
KI3GEX
from: 22999


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10000
length of sketch: 260572
SP5GEX
from: 85718


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 17143
length of sketch: 288774
TH5GEX
from: 54664


  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


removing cc_genes...


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../../../home/jovyan/.local/lib/python3.7/site-packages/umap/nndescent.py", line 47:[0m
[1m    @numba.njit(parallel=True)
[1m    def nn_descent(
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


to select: 10932
length of sketch: 307113


In [44]:
sketch_adata = merged_raw[sketch_index].copy()

  if not is_categorical(df_full[k]):
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


### Add cell type labels

Made uniform in `notebooks/20201230_UniformCellLabels.ipynb`

In [51]:
annot_dir = '/home/jovyan/mount/gdrive/Pan_fetal/annotations/'
annot_df = pd.read_csv(annot_dir + "uniform_labels_full.csv", index_col=0)

In [59]:
sketch_adata.obs_names[~sketch_adata.obs_names.isin(annot_df.index)]

Index(['FCAImmP7316898-AGAGCGAAGTGCCAGA', 'FCAImmP7316899-TCAACGATCTGCTGCT',
       'FCAImmP7316898-TCATTACAGACAAAGG', 'FCAImmP7316898-TGGACGCTCCGCGTTT',
       'FCAImmP7316899-CATATTCTCCTGTAGA', 'FCAImmP7316899-CTCTACGGTTAAAGAC',
       'FCAImmP7316898-TTTGCGCTCTGTACGA', 'FCAImmP7316899-CTTAACTGTTACGCGC',
       'FCAImmP7316899-CAGCTGGGTAAATGTG', 'FCAImmP7316899-CGGACACCAGTGGGAT',
       ...
       'FCAImmP7851890-TCGCGAGCAGGGTTAG', 'FCAImmP7292034-GCAAACTCACATTAGC',
       'FCAImmP7851896-ACTGCTCGTTCCGTCT', 'FCAImmP7579230-GCACATAAGCAAATCA',
       'FCAImmP7851897-TGTTCCGTCCAAATGC', 'FCAImmP7851897-AACTCTTGTCTTCGTC',
       'FCAImmP7851897-AAATGCCGTGTTCGAT', 'FCAImmP7851896-TTCTTAGGTCCATCCT',
       'FCAImmP7851895-CAACTAGGTGAGGGTT', 'FCAImmP7851897-TGCTGCTGTATCTGCA'],
      dtype='object', length=276092)

In [65]:
len([x for x in merged_raw.obs_names if 'FCA' in x])

849542

In [61]:
[x for x in annot_df.index if "FCAImm" in x]

662971

In [53]:
annot_df.loc[sketch_adata.obs_names]

KeyError: "Passing list-likes to .loc or [] with any missing labels is no longer supported. The following labels were missing: Index(['FCAImmP7316898-AGAGCGAAGTGCCAGA', 'FCAImmP7316899-TCAACGATCTGCTGCT',\n       'FCAImmP7316898-TCATTACAGACAAAGG', 'FCAImmP7316898-TGGACGCTCCGCGTTT',\n       'FCAImmP7316899-CATATTCTCCTGTAGA',\n       ...\n       'FCAImmP7851897-AACTCTTGTCTTCGTC', 'FCAImmP7851897-AAATGCCGTGTTCGAT',\n       'FCAImmP7851896-TTCTTAGGTCCATCCT', 'FCAImmP7851895-CAACTAGGTGAGGGTT',\n       'FCAImmP7851897-TGCTGCTGTATCTGCA'],\n      dtype='object', length=276092). See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike"

### Integration on sketch

### Projection of full data

Source code from [`geosketch/bin/integration.py`](https://github.com/brianhie/geosketch/blob/master/bin/integration.py)

In [None]:
from geosketch import gs, uniform, srs, kmeanspp
import numpy as np
from scanorama import transform
from scipy.sparse import csr_matrix, find
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from subprocess import Popen
import sys
from time import time

from utils import log, mkdir_p

def integrate_sketch(datasets_dimred, integration_fn, integration_fn_args={},
                     sampling_type='geosketch', N=10000):

    if sampling_type == 'geosketch':
        from geosketch import gs
        sampling_fn = gs
    else:
        from geosketch import uniform
        sampling_fn = uniform

    # Sketch each dataset.

    sketch_idxs = [
        sorted(set(sampling_fn(X, N, replace=False)))
        for X in datasets_dimred
    ]
    datasets_sketch = [ X[idx] for X, idx in zip(datasets_dimred, sketch_idxs) ]

    # Integrate the dataset sketches.

    datasets_int = integration_fn(datasets_sketch[:], **integration_fn_args)

    # Apply integrated coordinates back to full data.

    labels = []
    curr_label = 0
    for i, a in enumerate(datasets_sketch):
        labels += list(np.zeros(a.shape[0]) + curr_label)
        curr_label += 1
    labels = np.array(labels, dtype=int)

    for i, (X_dimred, X_sketch) in enumerate(zip(datasets_dimred, datasets_sketch)):
        X_int = datasets_int[i]

        neigh = NearestNeighbors(n_neighbors=3).fit(X_dimred)
        _, neigh_idx = neigh.kneighbors(X_sketch)

        ds_idxs, ref_idxs = [], []
        for ref_idx in range(neigh_idx.shape[0]):
            for k_idx in range(neigh_idx.shape[1]):
                ds_idxs.append(neigh_idx[ref_idx, k_idx])
                ref_idxs.append(ref_idx)

        bias = transform(X_dimred, X_int, ds_idxs, ref_idxs, 15, batch_size=1000)

        datasets_int[i] = X_dimred + bias

    return datasets_int
