In [1]:
import scanpy
import pandas as pd 
import numpy as np 

In [1]:
# Load the epithelial atlas (.h5ad) from disk.
epithelial_data = scanpy.read('../BCSC_data/Epithelial-atlas.h5ad')

# Fraction of entries of X that are non-zero: nnz / (rows * columns).
# NOTE: the printed value is a fraction in [0, 1], not a percentage.
n_rows, n_cols = epithelial_data.X.shape
epithelial_nonzero_fraction = epithelial_data.X.count_nonzero() / (n_rows * n_cols)
print("Non zero percentages:", epithelial_nonzero_fraction)

Non zero percentages: 0.06908140801872932


In [7]:
from src.utils.cytotrace import CytoTRACE

# Per-cell labels taken from the atlas' `obs` table.
epithelial_cell_types = list(epithelial_data.obs['author_cell_type'].values)

# The AnnData handed to CytoTRACE is built from the TRANSPOSED matrix, so the
# atlas' `obs` axis becomes the `var` axis here; that is why the labels are
# attached to `.var` rather than `.obs`.
cytotrace_anndata = scanpy.AnnData(epithelial_data.X.T)
cytotrace_anndata.var['annotation'] = epithelial_cell_types

# Run the project-local CytoTRACE wrapper with 30 neighbours and 30 PCs.
cytotrace = CytoTRACE(cytotrace_anndata)
cytotrace.run(n_neighbors=30, n_pcs=30)

computing neighbors
    finished (0:00:30) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
or is corrupted (e.g. due to subsetting). Consider recomputing with `pp.neighbors`.
computing moments based on connectivities
    finished (0:11:20) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


CytoTRACE result on Epithelial data

In [27]:
# Per-annotation counts reported by the CytoTRACE wrapper.
# NOTE(review): presumably counts cells whose score passes `threshold` --
# confirm against src.utils.cytotrace.
counts = cytotrace.get_counts(threshold=0.000001)
for cell_type in counts:
    print(f"{cell_type} : {counts[cell_type]}")

LummHR-major : 2360
LummHR-SCGB : 1147
LummHR-active : 482
Lumsec-major : 473
Lumsec-basal : 2007
Lumsec-myo : 51
Lumsec-prol : 8
Lumsec-KIT : 35
Lumsec-HLA : 154
basal : 613
Lumsec-lac : 8


In [30]:
# Mean of the 'ct_score' column stored on the CytoTRACE AnnData's `obs` table
# (presumably written by `cytotrace.run` -- confirm in src.utils.cytotrace).
ct_scores = cytotrace_anndata.obs['ct_score'].values
ct_scores.mean()

0.0283968

In [2]:
from src.utils.origins import ORIGINS

# Reload the atlas from disk and pull out the labels ORIGINS needs.
epithelial_data = scanpy.read('../BCSC_data/Epithelial-atlas.h5ad')
epithelial_gene_names = list(epithelial_data.var['feature_name'].values)
epithelial_cell_types = list(epithelial_data.obs['author_cell_type'].values)

# Bare AnnData around the (untransposed) matrix, carrying only the gene names
# on `.var['features']` and the cell labels on `.obs['annotation']`.
epithelial_data_origins = scanpy.AnnData(epithelial_data.X)
epithelial_data_origins.var['features'] = epithelial_gene_names
epithelial_data_origins.obs['annotation'] = epithelial_cell_types

# Run the project-local ORIGINS implementation using the "origins_ppi.csv" file.
origins = ORIGINS(
    anndata=epithelial_data_origins,
    path="origins_ppi.csv",
    show_progress=True,
)
result_epithelial = origins.run()

Filtering genes...: 100%|██████████| 3386/3386 [00:00<00:00, 4203.10it/s]
Building mask matrix...: 100%|██████████| 3374/3374 [00:35<00:00, 96.04it/s] 
Computing ORIGINS...: 100%|██████████| 240804/240804 [02:21<00:00, 1700.22it/s]


Time elapsed: 181.53738498687744 seconds


ORIGINS result on Epithelial data

In [4]:
# Per-annotation counts reported by ORIGINS.
# NOTE(review): presumably counts cells whose score passes `threshold` --
# confirm against src.utils.origins.
counts = origins.get_counts(threshold=0.1)
for cell_type in counts:
    print(f"{cell_type} : {counts[cell_type]}")

LummHR-major : 174
LummHR-SCGB : 154
LummHR-active : 20
Lumsec-basal : 280
Lumsec-major : 197
Lumsec-myo : 5
Lumsec-KIT : 493
Lumsec-prol : 104
basal : 311
Lumsec-lac : 368
Lumsec-HLA : 2


Bone Marrow data

In [7]:
# Load the bone-marrow dataset (.h5ad) from disk.
bm_data = scanpy.read('../BCSC_data/20492a4b-0def-457b-9574-60dfdde2a0f2/BM_standard_design.h5ad')

# Fraction of entries of X that are non-zero: nnz / (rows * columns).
# NOTE: the printed value is a fraction in [0, 1], not a percentage.
bm_shape = bm_data.X.shape
bm_total_entries = bm_shape[0] * bm_shape[1]
print("Non zero percentages:", bm_data.X.count_nonzero() / bm_total_entries)

Non zero percentages: 0.036576874845375966


Filter cells using the metadata (keep only the cells that have an annotation)
and show how many cells of each annotated type there are.

In [8]:
from collections import Counter

# Read the cell-level metadata (sheet index 2 of the workbook).
bm_metadata = pd.read_excel('../BCSC_data/BM_metadata.xlsx', sheet_name=2)


def _metadata_name_to_obs_name(trimmed_name):
    """Rebuild a trimmed metadata cell ID in the form used by `bm_data.obs.index`.

    The ID is split on '-'; the first part is shortened to its first four
    characters plus its last character and re-joined with the second part.
    Raises IndexError if the ID contains no '-'.
    """
    parts = trimmed_name.split('-')
    return parts[0][:4] + parts[0][-1:] + "-" + parts[1]


# The identifiers in the metadata differ from those in the data, so they have
# to be rewritten before they can be matched. First drop six characters from
# each end of every metadata ID.
# NOTE(review): presumably a fixed-width prefix/suffix -- confirm against the sheet.
names = [s[6:-6] for s in bm_metadata['Cell'].values]

# Lookup of every rewritten metadata ID (each key maps to itself).
name_dict = {}
for trimmed_name in names:
    obs_name = _metadata_name_to_obs_name(trimmed_name)
    name_dict[obs_name] = obs_name

# (obs name, positional index) for every cell in the data.
name_idx = [(name, idx) for idx, name in enumerate(bm_data.obs.index)]

# Keep only the cells that also appear in the metadata. An explicit membership
# test replaces the former bare `except: pass`, which would also have hidden
# unrelated errors.
name_list = []
idx_list = []
for name, idx in name_idx:
    if name in name_dict:
        name_list.append(name_dict[name])
        idx_list.append(idx)

# Fail loudly if the ID rewriting no longer matches anything; later cells
# would otherwise operate on an empty selection.
if not name_list:
    raise ValueError(
        "No cell in bm_data.obs.index matched a metadata 'Cell' identifier; "
        "check the identifier rewriting rules."
    )

count = Counter(bm_data.obs.loc[name_list, 'anno'].values)

# Count the cells with correct annotation
for cell, n_cells in count.items():
    print(cell, ":", n_cells)

  warn(msg)


NK cells : 4630
CD8+ naive T cells : 5378
T helper cells : 11069
HSCs : 4136
CD4+ naive T cells : 5089
cDCs : 4601
Pre-B cells : 2668
CD14+ monocytes : 20563
Neutrophil progenitors : 2527
Erythroid cells : 8739
Pro-B cells : 2336
Memory B cells : 3268
ANK1-low erythroid cells : 1643
Cytotoxic T cells : 7869
Plasma cells : 1902
pDCs : 2432
Naive B cells : 994
Erythroid progenitors : 2316
CD16+ monocytes : 3293
Megakaryocyte progenitors : 578
MSCs : 185


In [12]:
from src.utils.cytotrace import CytoTRACE

# Labels of the cells that were matched to the metadata.
bm_cell_types = list(bm_data.obs.loc[name_list, 'anno'].values)

# Select the matched rows of X and TRANSPOSE them, so the data's `obs` axis
# becomes the `var` axis here; hence the labels are attached to `.var`.
cytotrace_anndata = scanpy.AnnData(bm_data.X[idx_list, :].T)
cytotrace_anndata.var['annotation'] = bm_cell_types

# Run the project-local CytoTRACE wrapper with 30 neighbours and 30 PCs.
cytotrace = CytoTRACE(cytotrace_anndata)
cytotrace.run(n_neighbors=30, n_pcs=30)

computing neighbors
    finished (0:00:28) --> added 
    'distances' and 'connectivities', weighted adjacency matrices (adata.obsp)
computing moments based on connectivities
    finished (0:01:18) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)


CytoTRACE result on Bone Marrow data

In [13]:
# Per-annotation counts reported by the CytoTRACE wrapper.
# NOTE(review): presumably counts cells whose score passes `threshold` --
# confirm against src.utils.cytotrace.
counts = cytotrace.get_counts(threshold=0.000001)
for cell_type in counts:
    print(f"{cell_type} : {counts[cell_type]}")

T helper cells : 448
HSCs : 46
CD4+ naive T cells : 233
CD8+ naive T cells : 91
Memory B cells : 114
Cytotoxic T cells : 274
Naive B cells : 39
ANK1-low erythroid cells : 51
CD14+ monocytes : 385
Pre-B cells : 115
Erythroid cells : 139
Pro-B cells : 57
Neutrophil progenitors : 54
Erythroid progenitors : 26
pDCs : 44
Megakaryocyte progenitors : 15
cDCs : 48
NK cells : 205
CD16+ monocytes : 76
Plasma cells : 53
MSCs : 6


In [12]:
from src.utils.origins import ORIGINS

# Reload the bone-marrow data and keep only the rows matched to the metadata
# (`idx_list` / `name_list` come from the metadata-matching cell).
bm_data = scanpy.read('../BCSC_data/20492a4b-0def-457b-9574-60dfdde2a0f2/BM_standard_design.h5ad')
bm_data_origins = scanpy.AnnData(bm_data.X[idx_list, :])

# Gene symbols come from sheet index 0 of the metadata workbook.
# NOTE(review): this assumes the symbols line up with the leading columns of X,
# with the remaining columns having no symbol -- confirm against the data.
bm_gene_symbols = list(pd.read_excel('../BCSC_data/BM_metadata.xlsx', sheet_name=0)['Symbol'].values)

# Pad with "-" up to the number of columns. The padding length is derived from
# the data instead of being hard-coded (it was 1500 for this dataset).
n_unnamed_features = bm_data_origins.n_vars - len(bm_gene_symbols)
if n_unnamed_features < 0:
    raise ValueError(
        f"Metadata lists {len(bm_gene_symbols)} gene symbols but the data has only "
        f"{bm_data_origins.n_vars} features."
    )
bm_data_origins.var['features'] = bm_gene_symbols + ["-"] * n_unnamed_features
bm_data_origins.obs['annotation'] = list(bm_data.obs.loc[name_list, 'anno'].values)

# Run the project-local ORIGINS implementation using the "origins_ppi.csv" file.
origins = ORIGINS(anndata = bm_data_origins, path = "origins_ppi.csv", show_progress = True)
result_bm = origins.run()

Filtering genes...: 100%|██████████| 3386/3386 [00:00<00:00, 4583.94it/s]
Building mask matrix...: 100%|██████████| 3024/3024 [00:33<00:00, 90.65it/s]
Computing ORIGINS...: 100%|██████████| 96216/96216 [00:48<00:00, 1992.61it/s]


Time elapsed: 82.95261836051941 seconds


ORIGINS result on Bone Marrow data

In [21]:
# Per-annotation counts reported by ORIGINS.
# NOTE(review): presumably counts cells whose score passes `threshold` --
# confirm against src.utils.origins.
counts = origins.get_counts(threshold=0.05)
for cell_type in counts:
    print(f"{cell_type} : {counts[cell_type]}")

NK cells : 1299
T helper cells : 4622
CD4+ naive T cells : 2785
CD14+ monocytes : 6568
Neutrophil progenitors : 266
Memory B cells : 1454
Cytotoxic T cells : 3064
CD8+ naive T cells : 2689
Naive B cells : 542
Plasma cells : 742
Pre-B cells : 268
ANK1-low erythroid cells : 80
Pro-B cells : 231
CD16+ monocytes : 590
pDCs : 220
Erythroid cells : 36
Megakaryocyte progenitors : 26
cDCs : 134
HSCs : 29
Erythroid progenitors : 13
MSCs : 8
