# Steps

In [1]:
# 1) No leak in split columns
# 2) It's count data
# 3) contains sc_cell_ids (unique = data.shape[0])
# 4) All cell types are present in DEGs
# 5) Split counts (in percentage)

In [2]:
import scanpy as sc

In [13]:
def data_pp_smoke_test(
    data_name,
    stim_name,
    covariate_key,
    condition_key,
    control_name,
):
    adata = sc.read_h5ad(
        f'../../Datasets/preprocessed_datasets/{data_name}.h5ad'
    )

    # 1) Count data
    print('Sanity check counts', adata.X.max(), adata.X.min())

    # 2) No leak in split columns
    unique_covs = list(adata.obs[covariate_key].unique())

    for cov in unique_covs:
        adata_subset = adata[
            (adata.obs[condition_key] == stim_name) & 
            (adata.obs[covariate_key] == cov)
        ]
        
        adata_ood = adata[adata.obs[f'split_{stim_name}_{cov}'] == 'ood']
        adata_train_val = adata[adata.obs[f'split_{stim_name}_{cov}'] != 'ood']
        
        n_ood = adata_subset.shape[0]
        assert n_ood > 0
        assert n_ood == adata_ood.shape[0]

        # Split counts
        print(f'Split counts {cov}', (adata.obs[f'split_{stim_name}_{cov}'].value_counts() / adata_train_val.shape[0]))

    # sc_cell_ids sanity checks
    assert len(adata.obs['sc_cell_ids'].unique()) == adata.shape[0]
    assert adata.obs['sc_cell_ids'].tolist() == list(range(adata.shape[0]))

    # Are all cell types present in rank_genes_groups ? 
    
    rank_genes_groups = sorted(
        list(adata.uns[f'rank_genes_groups_{condition_key}'][stim_name].keys())
    )
    rank_genes_groups.remove('all')
    assert rank_genes_groups == sorted(unique_covs)

    print('All tests succeeded')

In [14]:
data_pp_smoke_test(
    data_name='kang',
    stim_name='stimulated',
    covariate_key='cell_type',
    condition_key='condition',
    control_name='control',
)

Sanity check counts 3828.0 0.0
Split counts CD14 Mono split_stimulated_CD14 Mono
train    0.899991
ood      0.187855
val      0.100009
Name: count, dtype: float64
Split counts CD4 T split_stimulated_CD4 T
train    0.899973
ood      0.217906
val      0.100027
Name: count, dtype: float64
Split counts T split_stimulated_T
train    0.899947
val      0.100053
ood      0.025145
Name: count, dtype: float64
Split counts CD8 T split_stimulated_CD8 T
train    0.899954
val      0.100046
ood      0.035230
Name: count, dtype: float64
Split counts B split_stimulated_B
train    0.899938
val      0.100062
ood      0.060459
Name: count, dtype: float64
Split counts DC split_stimulated_DC
train    0.899940
val      0.100060
ood      0.016016
Name: count, dtype: float64
Split counts CD16 Mono split_stimulated_CD16 Mono
train    0.899992
val      0.100008
ood      0.041184
Name: count, dtype: float64
Split counts NK split_stimulated_NK
train    0.899962
val      0.100038
ood      0.024217
Name: count, dtyp

In [15]:
data_pp_smoke_test(
    data_name='liver',
    stim_name='TRUE',
    covariate_key='zone',
    condition_key='infected',
    control_name='FALSE',
)

Sanity check counts 10738.414 0.0
Split counts Pericentral split_TRUE_Pericentral
train    0.899994
val      0.100006
ood      0.088805
Name: count, dtype: float64
Split counts Periportal split_TRUE_Periportal
train    0.899988
ood      0.154867
val      0.100012
Name: count, dtype: float64
All tests succeeded
