In [1]:
import scanpy as sc
import numpy as np
import anndata
import sys

sys.path.append('../')
import preprocessing_tools as pt
from tqdm import tqdm

#### link: https://figshare.com/ndownloader/files/39375713

In [2]:
condition_key = 'status_control'
covariate_key = 'zone'

control_name = 'Control'
stim_name = 'Infected'

# Read and preprocess data

In [3]:
data_path = "../../original_datasets/Liver/adata_infected.h5ad"

In [4]:
adata = sc.read_h5ad(data_path)
adata.X = adata.layers['counts'].copy()

In [5]:
print(adata.X.max(), adata.X.min())

10738.414 0.0


In [6]:
adata.raw = adata.copy()
adata.uns = {}

sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

sc.pp.normalize_total(
    adata, 
    target_sum=1e4, 
    exclude_highly_expressed=True
    )
sc.pp.log1p(adata)

adata.shape, adata.X.max(), adata.X.min()

((19053, 8203), 8.367348, 0.0)

# Create cov_cond column

In [7]:
adata.obs['cov_cond'] = adata.obs[covariate_key].astype(str) + '_' + adata.obs[condition_key].astype(str)

In [8]:
adata.obs['cov_cond']

AACCTTG_IMM-feb20_M10_P29_24hpi    Pericentral_Infected
AAGACTC_IMM-feb20_M10_P29_24hpi     Periportal_Infected
AAGGCTA_IMM-feb20_M10_P29_24hpi     Periportal_Infected
AATGCCG_IMM-feb20_M10_P29_24hpi    Pericentral_Infected
ACAATCG_IMM-feb20_M10_P29_24hpi     Periportal_Infected
                                           ...         
TTCCTGA_IMM-Jan22_M24_P24_NI         Periportal_Control
TTGCACC_IMM-Jan22_M24_P24_NI         Periportal_Control
TTGCAGA_IMM-Jan22_M24_P24_NI         Periportal_Control
TTGCCTA_IMM-Jan22_M24_P24_NI         Periportal_Control
TTGGTCA_IMM-Jan22_M24_P24_NI         Periportal_Control
Name: cov_cond, Length: 19053, dtype: object

# Compute DEGs

In [9]:
status_control_degs = pt.compute_degs(
    adata, 
    cov_key=covariate_key,
    cond_key=condition_key, 
    stim_name=stim_name, 
    control_name=control_name,
    condition_names=[stim_name, control_name],
    synergy=False,
    method='wilcoxon'
    )

adata.uns[f'rank_genes_groups_{condition_key}'] = {
    stim_name :status_control_degs,
}



In [10]:
infected_degs = pt.compute_degs(
    adata, 
    cov_key=covariate_key,
    cond_key='infected', 
    stim_name='TRUE', 
    control_name='FALSE',
    condition_names=['TRUE', 'FALSE'],
    synergy=False,
    method='wilcoxon'
)

adata.uns['rank_genes_groups_infected'] = {
    'TRUE' :infected_degs,
}

In [11]:
coarse_time_list = adata.obs['coarse_time'].tolist()
status_list = adata.obs['status_control'].tolist()

coarse_status = [status_list[i] + '_' + coarse_time_list[i] for i in range(len(status_list))]
coarse_status = [x.replace('Control_Control', 'Control') for x in coarse_status]

adata.obs['coarse_status'] = coarse_status
adata.obs['coarse_status'] = adata.obs['coarse_status'].astype('category')

In [12]:
rank_genes_groups_hpi = {}

for condition in tqdm(['Infected_2 hpi', 'Infected_12 hpi', 'Infected_24 hpi', 'Infected_30 hpi', 'Infected_36 hpi']):
    hpi_degs = pt.compute_degs(
        adata, 
        cov_key='zone',
        cond_key='coarse_status', 
        stim_name=condition, 
        control_name='Control',
        condition_names=[condition, 'Control'],
        synergy=False,
        method='wilcoxon'
    )
    rank_genes_groups_hpi[condition] = hpi_degs

rank_genes_groups_hpi = {k.replace('Infected_', ''):v for k,v in rank_genes_groups_hpi.items()}
adata.uns['rank_genes_groups_hpi'] = rank_genes_groups_hpi

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.97s/it]


In [13]:
rank_genes_groups_hpi_specific = {}
for condition in tqdm(['Infected_2 hpi', 'Infected_12 hpi', 'Infected_24 hpi', 'Infected_30 hpi', 'Infected_36 hpi']):
    hpi_specific_degs = pt.compute_degs(
        adata, 
        cov_key='zone',
        cond_key='coarse_status', 
        stim_name=condition, 
        control_name='Control',
        condition_names=['Control', 'Infected_2 hpi', 'Infected_12 hpi', 'Infected_24 hpi', 'Infected_30 hpi', 'Infected_36 hpi'],
        synergy=True,
        method='wilcoxon'
    )
    rank_genes_groups_hpi_specific[condition] = hpi_specific_degs
    
rank_genes_groups_hpi_specific = {k.replace('Infected_', ''):v for k,v in rank_genes_groups_hpi_specific.items()}
adata.uns['rank_genes_groups_hpi_specific'] = rank_genes_groups_hpi_specific

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:33<00:00,  6.72s/it]


# Create split column

In [14]:
adata_split = pt.create_split_cols(
    adata=adata, 
    cov_key="zone", 
    cond_key="status_control", 
    stim_name="Infected",
    random_state=42
)

In [15]:
adata_split = pt.create_split_cols(
    adata=adata_split, 
    cov_key="zone", 
    cond_key="infected", 
    stim_name="TRUE",
    random_state=42
)

In [16]:
adata.X.max(), adata.X.min()

(8.367348, 0.0)

In [17]:
adata.X = adata.layers['counts'].copy()

In [18]:
adata.X.max(), adata.X.min()

(10738.414, 0.0)

# Create cell ids

In [19]:
adata_split.obs['sc_cell_ids'] = list(range(adata.shape[0])) 

In [20]:
adata_split.write_h5ad('../../preprocessed_datasets/liver.h5ad')

In [21]:
adata_split

AnnData object with n_obs × n_vars = 19053 × 8203
    obs: 'barcode', 'mouse', 'marker', 'time', 'infected', 'experiment', 'coarse_time', 'MB', 'eta', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'mt_qc', 'SUMPBA', 'n_counts', 'n_genes', 'mus_rRNA', 'pba_rRNA', 'pba_rRNA_fraction', 'nCount_PBA', 'nFeature_PBA', 'nCount_MUS', 'nFeature_MUS', 'RNA_snn_res.0.2', 'seurat_clusters', 'MBinfected', 'cluster_names', 'ident', 'eta_normalized', 'nCounts_tot', 'normalized_PBA', 'coarse_time_orig', 'zone', 'status', 'status_control', 'time_int', 'split_random', 'cov_cond', 'coarse_status', 'split_Infected_Pericentral', 'split_Infected_Periportal', 'split_TRUE_Pericentral', 'split_TRUE_Periportal', 'sc_cell_ids'
    var: 'org', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'n_counts'
    uns: 'log1p', 'rank_genes_groups_status_control', 'rank_genes_groups_infected', 'rank_genes_groups_hpi', 'rank_genes_groups_hpi_specific'
    obsm: '

# Sanity checks

In [22]:
adata_split.X.max(), adata_split.X.min()

(10738.414, 0.0)

In [23]:
import pandas as pd

In [24]:
pd.crosstab(adata_split.obs['split_Infected_Periportal'], adata_split.obs['zone'])

zone,Pericentral,Periportal
split_Infected_Periportal,Unnamed: 1_level_1,Unnamed: 2_level_1
ood,0,2555
train,4988,9860
val,596,1054


In [25]:
pd.crosstab(adata_split.obs['split_Infected_Periportal'], adata_split.obs['status_control'])

status_control,Control,Infected,Uninfected
split_Infected_Periportal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ood,0,2555,0
train,2884,1399,10565
val,321,155,1174


In [26]:
adata.obs['infected']

AACCTTG_IMM-feb20_M10_P29_24hpi     TRUE
AAGACTC_IMM-feb20_M10_P29_24hpi     TRUE
AAGGCTA_IMM-feb20_M10_P29_24hpi     TRUE
AATGCCG_IMM-feb20_M10_P29_24hpi     TRUE
ACAATCG_IMM-feb20_M10_P29_24hpi     TRUE
                                   ...  
TTCCTGA_IMM-Jan22_M24_P24_NI       FALSE
TTGCACC_IMM-Jan22_M24_P24_NI       FALSE
TTGCAGA_IMM-Jan22_M24_P24_NI       FALSE
TTGCCTA_IMM-Jan22_M24_P24_NI       FALSE
TTGGTCA_IMM-Jan22_M24_P24_NI       FALSE
Name: infected, Length: 19053, dtype: category
Categories (2, object): ['FALSE', 'TRUE']