In [1]:
import scanpy as sc
import numpy as np
import pandas as pd

## Loading the data

In [2]:
adata = sc.read_h5ad("data/Cell_by_cCRE/matrix.h5ad", backed="r")

In [3]:
display(adata.X)
display(adata.obs)
display(adata.var)

<HDF5 sparse dataset: format 'csr', shape (1323041, 1154611), type '<i8'>

LungMap_D122_1+AAACTACCAGCTGCGCTTATCC
LungMap_D122_1+AACTGCGCCATCCACTTGGATA
LungMap_D122_1+AACTTCTGCTCACCTGTAAGAC
LungMap_D122_1+AATTCGGATGAGATCTGTGACG
LungMap_D122_1+AATTCGGATGGTCCGGTCCAAA
...
spleen_sample_57_1+TTGGTTAACCCTTCAGGCCATTGGCCAGGTCCTCGTCATA
spleen_sample_57_1+TTGGTTGGTACGTAGCCGTAGATAGCCGATTTGCTCGATT
spleen_sample_57_1+TTGGTTGGTACTAAGAGTTATACCTTAGCTACCAGTTATT
spleen_sample_57_1+TTGGTTGGTAGCATTAGGCGCCGGTCCTAATTGCTCGATT
thymus_sample_2_1+TAGCATTGATCTGGCAGCGGTTCTGGCGCAACTTAAGATA


chr1:9955-10355
chr1:29163-29563
chr1:79215-79615
chr1:102755-103155
chr1:180580-180980
...
chrY:56676947-56677347
chrY:56677442-56677842
chrY:56678029-56678429
chrY:56678600-56679000
chrY:56707025-56707425


In [4]:
cell_metadata = pd.read_csv('data/Cell_metadata.tsv.gz', sep='\t', compression='gzip')
# Rename cellID to barcodes, to be consistant with adata.obs
cell_metadata.rename(columns={"cellID": "barcodes"}, inplace=True)

cell_metadata['tissue_full'] = cell_metadata['tissue']

# Get tissue names: need some mannual work
cell_metadata['tissue'] = cell_metadata['tissue'].str.rsplit('_', n=1).str[0]
# Remove '_sample'
cell_metadata['tissue'] = cell_metadata['tissue'].str.replace('_sample', '', regex=False)
# Remove '_CARE' followed by any digit
cell_metadata['tissue'] = cell_metadata['tissue'].str.replace('_CARE.*', '', regex=True)
# Remove 'Map'
cell_metadata['tissue'] = cell_metadata['tissue'].str.replace('Map', '', regex=False)
# Change to lower 
cell_metadata['tissue'] = cell_metadata['tissue'].str.lower()


In [5]:
adata.obs = adata.obs.merge(cell_metadata, on='barcodes', how='left')
adata.obs.index = adata.obs.barcodes.astype(str)
adata.obs = adata.obs.drop(columns=['barcodes'])

# change the dtype of columns 
for col in adata.obs.columns:
    if adata.obs[col].dtype != 'float64':
        adata.obs[col] = adata.obs[col].astype('category')
adata.obs

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0_level_0,logUMI,tsse,tissue,cell type,Life stage,tissue_full
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LungMap_D122_1+AAACTACCAGCTGCGCTTATCC,3.396374,16.133163,lung,Cilliated Cell,Adult,LungMap_D122
LungMap_D122_1+AACTGCGCCATCCACTTGGATA,3.008174,21.010101,lung,Cilliated Cell,Adult,LungMap_D122
LungMap_D122_1+AACTTCTGCTCACCTGTAAGAC,3.201670,10.760668,lung,Cilliated Cell,Adult,LungMap_D122
LungMap_D122_1+AATTCGGATGAGATCTGTGACG,3.236789,13.146853,lung,Cilliated Cell,Adult,LungMap_D122
LungMap_D122_1+AATTCGGATGGTCCGGTCCAAA,3.119915,10.211706,lung,Cilliated Cell,Adult,LungMap_D122
...,...,...,...,...,...,...
spleen_sample_57_1+TTGGTTAACCCTTCAGGCCATTGGCCAGGTCCTCGTCATA,3.134177,13.333333,spleen,Fetal Fibroblast (Splenic),Fetal,spleen_sample_57
spleen_sample_57_1+TTGGTTGGTACGTAGCCGTAGATAGCCGATTTGCTCGATT,3.485721,14.117647,spleen,Fetal Fibroblast (Splenic),Fetal,spleen_sample_57
spleen_sample_57_1+TTGGTTGGTACTAAGAGTTATACCTTAGCTACCAGTTATT,4.363687,8.497675,spleen,Fetal Fibroblast (Splenic),Fetal,spleen_sample_57
spleen_sample_57_1+TTGGTTGGTAGCATTAGGCGCCGGTCCTAATTGCTCGATT,3.509471,17.259552,spleen,Fetal Fibroblast (Splenic),Fetal,spleen_sample_57


In [6]:
cCRE_hg38 = pd.read_csv('data/cCRE_hg38.tsv.gz', sep='\t', compression='gzip')
# Add a 'Feature_ID' column so that we can add it to the adata.var
cCRE_hg38['Feature_ID'] = cCRE_hg38['#Chromosome'].astype(str) + ':' + cCRE_hg38['hg38_Start'].astype(str) + '-' + cCRE_hg38['hg38_End'].astype(str)
cCRE_hg38

Unnamed: 0,#Chromosome,hg38_Start,hg38_End,Class,Present in fetal tissues,Present in adult tissues,CRE module,Feature_ID
0,chr1,9955,10355,Promoter Proximal,yes,yes,146,chr1:9955-10355
1,chr1,29163,29563,Promoter,yes,yes,37,chr1:29163-29563
2,chr1,79215,79615,Distal,no,yes,75,chr1:79215-79615
3,chr1,102755,103155,Distal,no,yes,51,chr1:102755-103155
4,chr1,115530,115930,Distal,yes,no,36,chr1:115530-115930
...,...,...,...,...,...,...,...,...
1154606,chrY,56865180,56865580,Distal,no,yes,11,chrY:56865180-56865580
1154607,chrY,56869537,56869937,Distal,no,yes,147,chrY:56869537-56869937
1154608,chrY,56870780,56871180,Distal,yes,yes,147,chrY:56870780-56871180
1154609,chrY,56871319,56871719,Distal,no,yes,147,chrY:56871319-56871719


In [7]:
adata.var = adata.var.merge(cCRE_hg38, on='Feature_ID', how='left')
# also change the index
adata.var.index = adata.var.Feature_ID.astype(str)
adata.var = adata.var.drop(columns = "Feature_ID")
adata.var

AnnData expects .var.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


Unnamed: 0_level_0,#Chromosome,hg38_Start,hg38_End,Class,Present in fetal tissues,Present in adult tissues,CRE module
Feature_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
chr1:9955-10355,chr1,9955,10355,Promoter Proximal,yes,yes,146
chr1:29163-29563,chr1,29163,29563,Promoter,yes,yes,37
chr1:79215-79615,chr1,79215,79615,Distal,no,yes,75
chr1:102755-103155,chr1,102755,103155,Distal,no,yes,51
chr1:180580-180980,chr1,180580,180980,Promoter Proximal,no,yes,146
...,...,...,...,...,...,...,...
chrY:56676947-56677347,chrY,56676947,56677347,Distal,yes,no,37
chrY:56677442-56677842,chrY,56677442,56677842,Distal,yes,no,37
chrY:56678029-56678429,chrY,56678029,56678429,Distal,yes,no,37
chrY:56678600-56679000,chrY,56678600,56679000,Distal,yes,no,37


In [9]:
bdata = adata[adata.obs["Life stage"] == "Adult"]
bdata=bdata.to_memory()

In [13]:
# Step 1: filter cCREs that are in the adult tissue
bdata = bdata[:,bdata.var["Present in adult tissues"] == "yes"]
bdata=bdata.to_memory()
bdata

AnnData object with n_obs × n_vars = 615998 × 890130
    obs: 'logUMI', 'tsse', 'tissue', 'cell type', 'Life stage', 'tissue_full'
    var: '#Chromosome', 'hg38_Start', 'hg38_End', 'Class', 'Present in fetal tissues', 'Present in adult tissues', 'CRE module'

In [14]:
# Step 2: filter by n_count and n_cCRE
# No filtering yet, just add some n_count and n_cCRE information
sc.pp.filter_cells(bdata, min_counts=0)
sc.pp.filter_cells(bdata, min_genes=0)

In [None]:
bdata.obs.rename(columns = {"n_genes" : "n_cCRE"}, inplace = True)
bdata.obs.quantile([0, .25, .5, .75, 1])

In [None]:
# See how many can we filter out 
print("Original number cells", bdata.n_obs)
bdata_copy = bdata[bdata.obs['n_counts'] >= 500]
print("After filtering low counts cells", bdata_copy.n_obs)
bdata_copy = bdata_copy[bdata_copy.obs['n_cCRE'] >= 500]
print("After filtering low cCRE cells", bdata_copy.n_obs)