In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import scipy as sp
from scipy import sparse, io
import re

Data was downloaded from GEO (accession numbers: GSM3943045 and GSM3980128). It comes as a DGE matrix of raw counts, without annotations. Annotations are available from [this page](https://figshare.com/articles/dataset/HCL_DGE_Data/7235471). The same page also provides h5ad files, but because I'm only interested in the two bone marrow samples, I didn't want to download the file containing the whole dataset.

In [2]:
# read in raw counts: one DGE text file per sample, keyed by the label
# that ends up in the `sample_ID` column of adata.obs
dge_paths = {
    'BoneMarrow_1': '../data/han/GSM3943045_Adult-Bone-Marrow1_dge.txt',
    'BoneMarrow_2': '../data/han/GSM3980128_Adult-Bone-Marrow2_dge.txt',
}
# every matrix is transposed right after reading, before the samples are combined
first_sample, *other_samples = [sc.read(path).T for path in dge_paths.values()]
adata = first_sample.concatenate(
    other_samples,
    batch_key='sample_ID',
    batch_categories=list(dge_paths),
)
# drop the per-sample objects; only the combined `adata` is used from here on
del first_sample, other_samples, dge_paths
adata

AnnData object with n_obs × n_vars = 20000 × 15999
    obs: 'sample_ID'

In [3]:
# read in metadata
meta = pd.read_excel('../data/han/HCL_Fig1_cell_Info.xlsx')
# keep only the adult bone marrow rows; .copy() makes the subset an independent
# frame so that it can be modified later without pandas' SettingWithCopyWarning
meta = meta[meta['sample'] == 'AdultBoneMarrow'].copy()
meta

Unnamed: 0,cellnames,sample,cluster,stage,batch,donor,celltype
25181,BoneMarrow_1.CGGCAGCCAGACAACGCC,AdultBoneMarrow,14,Adult,AdultBoneMarrow1,Donor27,B cell (Plasmocyte)
25182,BoneMarrow_1.CGCACCTCGTAAAGTCGT,AdultBoneMarrow,3,Adult,AdultBoneMarrow1,Donor27,B cell (Plasmocyte)
25183,BoneMarrow_1.GTCCCGGGACATTCACTT,AdultBoneMarrow,3,Adult,AdultBoneMarrow1,Donor27,B cell (Plasmocyte)
25184,BoneMarrow_1.TGCGGAGCGTCCATGGCG,AdultBoneMarrow,3,Adult,AdultBoneMarrow1,Donor27,B cell (Plasmocyte)
25185,BoneMarrow_1.ACCTGACAAAGTCCTTTC,AdultBoneMarrow,14,Adult,AdultBoneMarrow1,Donor27,B cell (Plasmocyte)
...,...,...,...,...,...,...,...
33880,BoneMarrow_2.GGACATAACCTAAACGCC,AdultBoneMarrow,41,Adult,AdultBoneMarrow2,Donor28,Antigen presenting cell (RPS high)
33881,BoneMarrow_2.CCGCTAGGCTGCAGCGAG,AdultBoneMarrow,41,Adult,AdultBoneMarrow2,Donor28,Antigen presenting cell (RPS high)
33882,BoneMarrow_2.CCTAGAGTTGCCGCTCAA,AdultBoneMarrow,41,Adult,AdultBoneMarrow2,Donor28,Antigen presenting cell (RPS high)
33883,BoneMarrow_2.TTCCGCAGTCGTCACAAG,AdultBoneMarrow,41,Adult,AdultBoneMarrow2,Donor28,Antigen presenting cell (RPS high)


The raw counts from GEO have exactly 10,000 cells in each sample, but the metadata file contains far fewer cells, so I assume the data on GEO has not been QC-ed and the metadata file contains only the cells that passed QC. I'll subset to just these cells.

In [4]:
# add metadata to adata object
# `cellnames` have the form '<sample>.<barcode>'; the key built here is
# '<barcode>-<sample>', which is what the join below matches against adata.obs_names
first_field = re.compile('[^.]+')    # text before the first '.'
last_field = re.compile('[^.]+$')    # text after the last '.'
barcodes = [
    last_field.search(name).group() + '-' + first_field.search(name).group()
    for name in meta['cellnames']
]
# build a new frame instead of mutating `meta` (which may be a filtered slice);
# the barcode is kept as a column and also used as the index
meta_by_barcode = meta.assign(barcode=barcodes).set_index('barcode', drop=False)

# fail early with a clear message instead of silently mis-joining
if not meta_by_barcode.index.is_unique:
    raise ValueError('metadata contains duplicated barcodes; cannot join onto adata.obs')
unmatched = meta_by_barcode.index.difference(adata.obs_names)
if len(unmatched) > 0:
    raise ValueError(
        '{} metadata barcodes are not present in adata.obs_names, e.g. {}'.format(
            len(unmatched), list(unmatched[:5])
        )
    )

# left join keeps the row order of adata.obs, which must stay aligned with adata.X;
# pd.concat(..., axis=1) may sort the combined index and thereby shuffle the
# annotations relative to the count matrix
adata.obs = adata.obs.join(meta_by_barcode, how='left')
del meta, meta_by_barcode, barcodes, unmatched

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [None]:
# subset to only cells with metadata and visualise
has_annotation = adata.obs['celltype'].notna()
adata = adata[has_annotation].copy()
adata.X = sparse.csr_matrix(adata.X).astype(dtype='float32')
# store an independent copy of the raw counts: the preprocessing calls below
# update `adata` in place (their return values are not used), so a layer that
# merely references adata.X could end up holding the transformed values
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
sc.pp.pca(adata, n_comps=15)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
# 'sample' equals 'AdultBoneMarrow' for every annotated cell (the metadata was
# filtered on it), so colour by 'sample_ID' to tell the two samples apart
sc.pl.umap(adata, color=['sample_ID', 'celltype'], ncols=1)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../anaconda3/lib/python3.7/site-packages/umap/rp_tree.py", line 135:[0m
[1m@numba.njit(fastmath=True, nogil=True, parallel=True)
[1mdef euclidean_random_projection_split(data, indices, rng_state):
[0m[1m^[0m[0m
[0m
  self.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.
[1m
File "../../../../anaconda3/lib/python3.7/site-packages/umap/utils.py", line 409:[0m
[1m@numba.njit(parallel=True)
[1mdef build_candidates(current_graph, n_vertices, n_neighbors, max_candidates, rng_state):
[0m[1m^[0m[0m
