In [1]:
import scanpy as sc
import numpy as np
import anndata
import sys

sys.path.append('../')
import preprocessing_tools as pt

In [2]:
covariate_key = 'cell_type'

##### Link: 

# Read and preprocess data

In [3]:
data_path = "../../original_datasets/leukemia/3b68ca27-6888-42e1-bf75-b036a178c6db.h5ad"

In [4]:
adata = sc.read_h5ad(data_path)

adata.X = adata.raw.X.copy()
del adata.raw

adata.obs['cell_type'] = adata.obs['author_cell_type']

print(adata.X.max(), adata.X.min())

adata.layers['counts'] = adata.X.copy()
adata.raw = adata.copy()
adata.uns = {}

sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

sc.pp.normalize_total(
    adata, 
    target_sum=1e4, 
    exclude_highly_expressed=True
    )
sc.pp.log1p(adata)

adata.shape, adata.X.max(), adata.X.min()

1757.0 0.0




((37100, 16566), 8.923758, 0.0)

In [5]:
sc.pp.highly_variable_genes(
    adata, 
    n_top_genes=5000, 
    subset=True
)

In [6]:
adata.X = adata.layers['counts'].copy()

In [7]:
adata.X.max(), adata.X.min()

(1757.0, 0.0)

# Create cell ids

In [8]:
adata.obs['sc_cell_ids'] = list(range(adata.shape[0]))

# Write

In [9]:
adata.write_h5ad('../../preprocessed_datasets/leukemia.h5ad')

In [10]:
adata

AnnData object with n_obs × n_vars = 37100 × 5000
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.ribo', 'percent.mito', 'Sample_id', 'Transduction', 'Phase', 'Timepoint', 'Condition', 'CARexpresion', 'cloneType', 'Frequency', 'author_cell_type', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'n_counts', 'sc_cell_ids'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'n_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    obsm: 'HARMONY', 'X_UMAP'
    layers: 'counts'

In [11]:
adata.obs['cell_type']

p1Ipos_AAACCTGAGACAGACC-1    Late prolif: histones enriched MKI67+ T cells
p1Ipos_AAACCTGGTACTTCTT-1            Early prolif: MCM3/5/7+ PCNA+ T cells
p1Ipos_AAACCTGTCAATACCG-1                        CD8+ Eff/Mem T cells (EM)
p1Ipos_AAACCTGTCTTTAGGG-1                        CD8+ Effector T cells (E)
p1Ipos_AAACGGGAGTGGAGAA-1     CD4+ Central/Effector memory T cells (CM/EM)
                                                 ...                      
p5Pneg_TTTGGTTAGCTGCCCA-1     CD4+ Central/Effector memory T cells (CM/EM)
p5Pneg_TTTGGTTCACATGACT-1                           CD8+ Cytotoxic T cells
p5Pneg_TTTGGTTTCGGCATCG-1                           CD8+ Cytotoxic T cells
p5Pneg_TTTGTCAAGACCTAGG-1                           CD8+ Cytotoxic T cells
p5Pneg_TTTGTCAAGTCAATAG-1                               CD4+ Naive T cells
Name: cell_type, Length: 37100, dtype: category
Categories (11, object): ['CD4+ Central/Effector memory T cells (CM/EM)', 'CD4+ Naive T cells', 'CD8+ Cytotoxic T cells', 'CD8+

In [12]:
adata.obs['Sample_id']

p1Ipos_AAACCTGAGACAGACC-1      patient1_IP
p1Ipos_AAACCTGGTACTTCTT-1      patient1_IP
p1Ipos_AAACCTGTCAATACCG-1      patient1_IP
p1Ipos_AAACCTGTCTTTAGGG-1      patient1_IP
p1Ipos_AAACGGGAGTGGAGAA-1      patient1_IP
                                 ...      
p5Pneg_TTTGGTTAGCTGCCCA-1    patient5_Peak
p5Pneg_TTTGGTTCACATGACT-1    patient5_Peak
p5Pneg_TTTGGTTTCGGCATCG-1    patient5_Peak
p5Pneg_TTTGTCAAGACCTAGG-1    patient5_Peak
p5Pneg_TTTGTCAAGTCAATAG-1    patient5_Peak
Name: Sample_id, Length: 37100, dtype: category
Categories (10, object): ['patient1_IP', 'patient1_Peak', 'patient2_IP', 'patient2_Peak', ..., 'patient4_IP', 'patient4_Peak', 'patient5_IP', 'patient5_Peak']