In [1]:
import scanpy as sc
import numpy as np
import anndata
import sys

# Add the parent directory to the module search path before importing the
# project helper module. The path is relative, so it is resolved against the
# kernel's current working directory.
# NOTE(review): presumably `preprocessing_tools` lives one directory up — confirm.
sys.path.append('../')
import preprocessing_tools as pt

In [2]:
# Names of the `adata.obs` columns holding the condition and the covariate.
condition_key, covariate_key = 'condition', 'cell_type'

# Labels used for the control and the stimulated condition.
control_name, stim_name = 'control', 'stimulated'

##### Link: https://drive.google.com/drive/folders/1n1SLbXha4OH7j7zZ0zZAxrj_-2kczgl8

# Read and preprocess data

In [3]:
# Location of the input .h5ad file, relative to the kernel's working directory.
data_path = "../../original_datasets/Kang/kang_count.h5ad"

In [4]:
# --- Load -------------------------------------------------------------------
adata = sc.read_h5ad(data_path)

# Value range of the matrix as loaded from disk.
print(adata.X.max(), adata.X.min())

# --- Snapshot the loaded values ----------------------------------------------
# Keep the loaded matrix both as a layer and in `.raw`, then clear `.uns`.
adata.layers['counts'] = adata.X.copy()
adata.raw = adata.copy()
adata.uns = {}

# --- Filter -------------------------------------------------------------------
# Cells need at least 100 counts, genes at least 5 counts, to be kept.
sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

# --- Normalize + log-transform `adata.X` in place ----------------------------
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata)

# Shape and value range after preprocessing (shown as the cell output).
adata.shape, adata.X.max(), adata.X.min()

3828.0 0.0


((13576, 13404), 9.887986, 0.0)

# Create cov_cond column

In [5]:
# Combined label "<covariate>_<condition>" for every cell, built as strings.
adata.obs['cov_cond'] = (
    adata.obs[covariate_key].astype(str)
    + '_'
    + adata.obs[condition_key].astype(str)
)

In [6]:
# Display the newly created column to eyeball the combined labels.
adata.obs['cov_cond']

index
AAACATACATTTCC.1    CD14 Mono_control
AAACATACCAGAAA.1    CD14 Mono_control
AAACATACCTCGCT.1    CD14 Mono_control
AAACATACGATGAA.1        CD4 T_control
AAACATACGGCATT.1    CD14 Mono_control
                          ...        
TTTGCATGAACGAA.1        DC_stimulated
TTTGCATGACGTAC.1     CD4 T_stimulated
TTTGCATGCCTGTC.1         B_stimulated
TTTGCATGCTAAGC.1     CD4 T_stimulated
TTTGCATGGGACGA.1     CD4 T_stimulated
Name: cov_cond, Length: 13576, dtype: object

# Compute differentially expressed genes (DEGs)

In [7]:
# Run the project's DEG helper with the Wilcoxon method on the current
# (normalized, log-transformed) `adata`.
# NOTE(review): presumably this ranks genes for stimulated vs. control within
# each covariate group — confirm against `preprocessing_tools.compute_degs`.
stim_degs = pt.compute_degs(
    adata,
    cond_key=condition_key,
    cov_key=covariate_key,
    control_name=control_name,
    stim_name=stim_name,
    condition_names=[control_name, stim_name],
    method='wilcoxon',
)



In [8]:
# Store the DEG result in `.uns`, keyed by the stimulated condition's name,
# under an entry named after the condition column.
adata.uns[f'rank_genes_groups_{condition_key}'] = {stim_name: stim_degs}

In [9]:
# Value range of `adata.X` at this point, i.e. after normalize_total + log1p.
adata.X.max(), adata.X.min()

(9.887986, 0.0)

In [10]:
# Put the values saved in the 'counts' layer back into `X`, replacing the
# normalized/log-transformed matrix.
# NOTE(review): the 'log1p' entry that `sc.pp.log1p` left in `.uns` is still
# present in the final object even though `X` no longer holds log values —
# confirm nothing downstream relies on that entry.
adata.X = adata.layers['counts'].copy()

In [11]:
# Value range of `adata.X` after it was reset from the 'counts' layer.
adata.X.max(), adata.X.min()

(3828.0, 0.0)

# Create split column

In [12]:
# Add the split columns through the project helper, with a fixed random state.
# NOTE(review): judging by the resulting `.obs` columns, one
# `split_<stim_name>_<covariate value>` column is created per covariate value —
# confirm against `preprocessing_tools.create_split_cols`.
adata_split = pt.create_split_cols(
    adata=adata,
    cond_key=condition_key,
    cov_key=covariate_key,
    stim_name=stim_name,
    random_state=42,
)

# Create cell ids

In [13]:
# Sequential integer id (0 .. n_obs - 1) for every cell, in current row order.
# The length is taken from `adata_split` itself (not from `adata`) so the
# assignment cannot fail with a length mismatch if the split helper ever
# returns an object with a different number of cells.
adata_split.obs['sc_cell_ids'] = list(range(adata_split.n_obs))

# Write the preprocessed dataset

In [14]:
# Save the annotated object as .h5ad; the path is relative to the kernel's
# working directory.
adata_split.write_h5ad('../../preprocessed_datasets/kang.h5ad')

In [15]:
# Display the summary (obs/var/uns/layers keys) of the object that was written.
adata_split

AnnData object with n_obs × n_vars = 13576 × 13404
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'stim', 'seurat_annotations', 'integrated_snn_res.0.5', 'seurat_clusters', 'condition', 'cell_type', 'n_counts', 'cov_cond', 'split_stimulated_B', 'split_stimulated_CD14 Mono', 'split_stimulated_CD16 Mono', 'split_stimulated_CD4 T', 'split_stimulated_CD8 T', 'split_stimulated_DC', 'split_stimulated_NK', 'split_stimulated_T', 'sc_cell_ids'
    var: 'n_counts'
    uns: 'log1p', 'rank_genes_groups_condition'
    layers: 'counts'

# Sanity checks

In [16]:
# Sanity check: value range of `adata_split.X`.
adata_split.X.max(), adata_split.X.min()

(3828.0, 0.0)

In [19]:
# Number of cells carrying each label in the `split_<stim_name>_B` column.
adata_split.obs[f'split_{stim_name}_B'].value_counts()

split_stimulated_B
train    11521
val       1281
ood        774
Name: count, dtype: int64

In [20]:
# NOTE(review): this import sits in the middle of the notebook; consider moving
# it to the imports cell at the top so all dependencies are declared in one place.
import pandas as pd

In [21]:
# Cross-tabulate the `split_<stim_name>_B` labels against the covariate column.
pd.crosstab(
    index=adata_split.obs[f'split_{stim_name}_B'],
    columns=adata_split.obs[covariate_key],
)

cell_type,B,CD4 T,CD8 T,CD14 Mono,CD16 Mono,DC,NK,T
split_stimulated_B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ood,774,0,0,0,0,0,0,0
train,542,3805,718,3950,944,418,570,574
val,50,461,96,412,100,54,49,59


In [22]:
# Cross-tabulate the `split_<stim_name>_B` labels against the condition column.
pd.crosstab(
    index=adata_split.obs[f'split_{stim_name}_B'],
    columns=adata_split.obs[condition_key],
)

condition,control,stimulated
split_stimulated_B,Unnamed: 1_level_1,Unnamed: 2_level_1
ood,0,774
train,5723,5798
val,636,645
