In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

# Read blood & heart original data

In [None]:
# Locations of the original heart-tissue and PBMC .h5ad files.
heart_path = '/data/Experiments/Benchmark/scdisentangle/Datasets/original_datasets/Myocarditis/GSE228597_combined_tissue_data.h5ad'
blood_path = '/data/Experiments/Benchmark/scdisentangle/Datasets/original_datasets/Myocarditis/GSE228597_combined_pbmc_data.h5ad'

adata_heart = sc.read_h5ad(heart_path)
adata_blood = sc.read_h5ad(blood_path)

# Replace X in each object with a dense copy of the matrix stored under `.raw`.
heart_raw_dense = adata_heart.raw.X.toarray()
adata_heart.X = heart_raw_dense.copy()
blood_raw_dense = adata_blood.raw.X.toarray()
adata_blood.X = blood_raw_dense.copy()
del heart_raw_dense, blood_raw_dense

# Rename donor

In [None]:
# Derive a donor id for every blood cell from its sample id: the first two
# underscore-separated tokens joined back with '_' (ids without an underscore
# are kept unchanged).
sample_ids = adata_blood.obs['sample_id'].tolist()
donors = ['_'.join(sample_id.split('_')[:2]) for sample_id in sample_ids]
adata_blood.obs['donor'] = donors

# Subset to CD8 and NK

In [None]:
# Boolean masks selecting the lineage of interest in each tissue; the two
# datasets use different column names and labels for it.
blood_lineage_mask = adata_blood.obs['lineage'] == 'CD8 and NK'
heart_lineage_mask = adata_heart.obs['lineage_names'] == 'T and NK cells'

adata_blood = adata_blood[blood_lineage_mask]
adata_heart = adata_heart[heart_lineage_mask]
adata_blood.shape, adata_heart.shape

In [None]:
# Display the shapes of both objects (repeats the check that ends the previous cell).
adata_blood.shape, adata_heart.shape

# Subset to myocarditis

In [None]:
# Keep only cells whose condition is 'myocarditis' in each tissue.
adata_blood = adata_blood[adata_blood.obs['condition'] == 'myocarditis']
# The heart object is modified in place by the marker-scoring cells that
# follow (new obs columns), so materialise it here with an explicit copy
# instead of relying on AnnData's implicit view-to-copy conversion and the
# ImplicitModificationWarning that comes with it.
adata_heart = adata_heart[adata_heart.obs['condition'] == 'myocarditis'].copy()

adata_blood.shape, adata_heart.shape

# Rename T & NK subsets in heart

In [None]:
# Marker gene sets used to score the heart T/NK cells.
# NKG7 appears in both the CD8 and the NK sets.
cd4_markers = [
    "IL7R",
    "CCR7",
    "LST1",
    "SELL",
]
cd8_markers = [
    "CD8A",
    "CD8B",
    "GZMH",
    "PRF1",
    "NKG7",
]
nk_markers = [
    "NCAM1",
    "KLRD1",
    "KLRB1",
    "FCGR3A",
    "NKG7",
]

In [None]:
# Score every heart cell against each marker set; each call stores its result
# in the obs column given by `score_name`.
# NOTE(review): X was overwritten from `.raw` when the data was loaded, so the
# scores are computed on those values — confirm this is the intended input.
for _score_name, _marker_genes in (
    ("cd4_score", cd4_markers),
    ("cd8_score", cd8_markers),
    ("nk_score", nk_markers),
):
    sc.tl.score_genes(adata_heart, _marker_genes, score_name=_score_name)

In [None]:
# Label each heart cell with the name of its highest-scoring signature column;
# cells whose best score is below `thresh` are labelled "ambiguous" instead.
thresh = 0.05  # adjust by inspecting score histograms
scores = adata_heart.obs[["cd4_score", "cd8_score", "nk_score"]]
best_score = scores.max(axis=1)
labels = scores.idxmax(axis=1).mask(best_score < thresh, "ambiguous")
adata_heart.obs["t_nk_subtype"] = labels

# Subset to annotated CD8 & NK (currently disabled: the code below is commented out)

In [None]:
# Disabled: would keep only heart cells labelled 'cd8_score' or 'nk_score'.
# While this stays commented out, cells labelled 'cd4_score' or 'ambiguous'
# remain in adata_heart.
#adata_heart = adata_heart[adata_heart.obs['t_nk_subtype'].isin(['cd8_score', 'nk_score'])]

In [None]:
# Disabled: per-label cell counts of the heart subtype annotation.
#adata_heart.obs['t_nk_subtype'].value_counts()

# Flag steroid status in blood, then keep only off-steroid cells in blood and heart

In [None]:
# Keep blood cells sampled at the pre- or post-steroid timepoint.
# The following cells write new obs columns on this object, so take an explicit
# copy rather than leaving an AnnData view that gets copied implicitly (with an
# ImplicitModificationWarning) on first modification.
adata_blood = adata_blood[
    adata_blood.obs['timepoint_cat'].isin(['pre_steroid', 'post_steroid'])
].copy()

In [None]:
# Flag every blood cell with the string 'True' when it comes from the
# post-steroid timepoint and 'False' otherwise.
# Built in one vectorised assignment: the previous chained form
# (`obs[col][mask] = value`) triggers SettingWithCopyWarning and silently does
# nothing when pandas copy-on-write is enabled.
adata_blood.obs['on_steroids'] = np.where(
    adata_blood.obs['timepoint_cat'] == 'post_steroid', 'True', 'False'
)

In [None]:
# Store the steroid flag as a categorical column.
adata_blood.obs['on_steroids'] = pd.Categorical(adata_blood.obs['on_steroids'])

In [None]:
# Number of blood cells per steroid flag.
adata_blood.obs['on_steroids'].value_counts()

In [None]:
# Keep only cells flagged as not on steroids, as independent copies.
# NOTE(review): the heart 'on_steroids' column comes straight from the input
# file; this comparison assumes it holds the string 'False' (not a boolean) —
# confirm, otherwise the heart subset is empty.
blood_off_steroids = adata_blood.obs['on_steroids'] == 'False'
heart_off_steroids = adata_heart.obs['on_steroids'] == 'False'

adata_blood = adata_blood[blood_off_steroids].copy()
adata_heart = adata_heart[heart_off_steroids].copy()

# Add tissue label

In [None]:
# Add a constant categorical 'tissue' column to each object so the origin of
# every cell is recorded.
adata_blood.obs['tissue'] = pd.Categorical(['Blood'] * adata_blood.n_obs)

adata_heart.obs['tissue'] = pd.Categorical(['Heart'] * adata_heart.n_obs)

# Create cell type column

In [None]:
# Build the heart 'cell_type' column from the score-based subtype labels.
# Only the CD8 and NK labels are renamed; any other label (e.g. 'cd4_score',
# 'ambiguous') is carried over unchanged.
subtype_to_cell_type = {
    'cd8_score': 'CD8',
    'nk_score': 'NK'
}

adata_heart.obs['cell_type'] = (
    adata_heart.obs['t_nk_subtype']
    .replace(subtype_to_cell_type)
    .astype('category')
)
adata_heart.obs['cell_type'].value_counts()

In [None]:
# Build the blood 'cell_type' column from the cluster names: keep the text
# before the first ':' and remove every 'b-' occurrence from it.
b_ctypes = [
    cluster_name.split(':')[0].replace('b-', '')
    for cluster_name in adata_blood.obs['cluster_name'].tolist()
]

adata_blood.obs['cell_type'] = pd.Categorical(b_ctypes)

adata_blood.obs['cell_type'].value_counts()

# Subset obs columns

In [None]:
# Reduce both obs tables to the same four metadata columns, in the same order.
# NOTE(review): 'donor' was derived above for blood only; the heart object is
# assumed to already carry a 'donor' column — confirm.
obs_columns = ['tissue', 'on_steroids', 'donor', 'cell_type']
adata_blood.obs = adata_blood.obs[obs_columns]
adata_heart.obs = adata_heart.obs[obs_columns]

# Keep common genes

In [None]:
# Restrict both objects to the genes they share.
# The shared genes are listed in the order they appear in the blood object:
# iterating a set of strings gives a different order on every kernel start
# (string hash randomisation), which made the gene order of the saved dataset
# non-reproducible.
heart_gene_set = set(adata_heart.var_names)
intersection_genes = [
    gene for gene in adata_blood.var_names if gene in heart_gene_set
]
adata_blood = adata_blood[:, intersection_genes].copy()
adata_heart = adata_heart[:, intersection_genes].copy()

In [None]:
# Both objects must now list the same genes in the same order.
assert list(adata_blood.var_names) == list(adata_heart.var_names)

# Keep common donors

In [None]:
# Sorted unique donor ids per tissue, and the blood donors that also occur in
# the heart data.
pbmc_donors = np.unique(adata_blood.obs['donor'].tolist())
heart_donors = np.unique(adata_heart.obs['donor'].tolist())
in_both_tissues = np.isin(pbmc_donors, heart_donors)
common_donors = list(pbmc_donors[in_both_tissues])

len(common_donors), len(pbmc_donors), len(heart_donors)

In [None]:
# Keep only cells from donors that are present in both tissues.
blood_donor_mask = adata_blood.obs['donor'].isin(common_donors)
heart_donor_mask = adata_heart.obs['donor'].isin(common_donors)
adata_blood = adata_blood[blood_donor_mask]
adata_heart = adata_heart[heart_donor_mask]

# Concatenate data

In [None]:
# Stack heart cells followed by blood cells into a single object.
# NOTE(review): `AnnData.concatenate` is deprecated in recent anndata releases
# in favour of `anndata.concat`; the two differ in how var columns and the
# batch label are handled, so migrate deliberately — confirm against the
# installed version.
adata_cat = adata_heart.concatenate(adata_blood)

In [None]:
# Display the summary of the concatenated object.
adata_cat

In [None]:
# Largest and smallest value in X after concatenation.
adata_cat.X.max(), adata_cat.X.min()

In [None]:
# Keep a copy of the current X in the 'counts' layer; X itself is normalised
# and log-transformed further down and restored from this layer before saving.
adata_cat.layers['counts'] = adata_cat.X.copy()

# Filter

In [None]:
# Start from an empty `uns`, then drop low-count cells followed by low-count
# genes (the order matters: gene totals are computed on the remaining cells).
adata_cat.uns = {}

min_counts_per_cell = 100
min_counts_per_gene = 5
sc.pp.filter_cells(adata_cat, min_counts=min_counts_per_cell)
sc.pp.filter_genes(adata_cat, min_counts=min_counts_per_gene)

# Normalize

In [None]:
# Normalise each cell to a total of 1e4 (highly expressed genes excluded from
# the size factor), then apply log1p; finish by showing shape and value range.
normalize_kwargs = dict(target_sum=1e4, exclude_highly_expressed=True)
sc.pp.normalize_total(adata_cat, **normalize_kwargs)
sc.pp.log1p(adata_cat)

adata_cat.shape, adata_cat.X.max(), adata_cat.X.min()

In [None]:
# Constant string column with the value '1' for every cell.
adata_cat.obs['placeholder'] = '1'

In [None]:
# Display the summary of the filtered and normalised object.
adata_cat

# HVG

In [None]:
# Flag the 5000 most variable genes and keep only those.
# `.copy()` turns the subset into an independent object: X is reassigned on it
# in later cells, which on a view would trigger an implicit copy and an
# ImplicitModificationWarning.
sc.pp.highly_variable_genes(adata_cat, n_top_genes=5000)
adata_cat = adata_cat[:, adata_cat.var['highly_variable']].copy()

In [None]:
# Make sure X is a dense array.
# X was already replaced by dense arrays when the data was loaded, and a NumPy
# array has no `.toarray()`; only convert when X is still a sparse matrix so
# this cell does not raise AttributeError.
if hasattr(adata_cat.X, 'toarray'):
    adata_cat.X = adata_cat.X.toarray()

In [None]:
# Largest and smallest value in X after gene selection.
adata_cat.X.max(), adata_cat.X.min()

In [None]:
# Add the parent directory to the import path so that the project-local
# `preprocessing_tools` module can be imported (presumably it lives in '../' —
# verify). NOTE(review): consider moving this to the import cell at the top.
import sys
sys.path.append('../')
import preprocessing_tools as pt

# DEGs

In [None]:
# Differential expression between tissues (Heart as stimulated, Blood as
# control) with donor as covariate; the result is stored in `uns` under the
# 'Heart' key.
tissue_deg_kwargs = dict(
    cov_key='donor',
    cond_key='tissue',
    stim_name='Heart',
    control_name='Blood',
    condition_names=['Blood', 'Heart'],
    synergy=False,
)
tissue_degs = pt.compute_degs(adata_cat, **tissue_deg_kwargs)

adata_cat.uns['rank_genes_groups_tissue'] = {'Heart': tissue_degs}

In [None]:
# Differential expression between steroid states ('True' as stimulated,
# 'False' as control) with tissue as covariate; the result is stored in `uns`
# under the 'True' key.
# NOTE(review): both input objects were filtered to on_steroids == 'False'
# earlier in this notebook, so no 'True' cells should remain at this point —
# confirm that this comparison is still meaningful here.
treatment_deg_kwargs = dict(
    cov_key='tissue',
    cond_key='on_steroids',
    stim_name='True',
    control_name='False',
    condition_names=['True', 'False'],
    synergy=False,
)
treatment_degs = pt.compute_degs(adata_cat, **treatment_deg_kwargs)

adata_cat.uns['rank_genes_groups_on_steroids'] = {'True': treatment_degs}

In [None]:
# Put the values saved in the 'counts' layer back into X, replacing the
# normalised / log-transformed matrix.
adata_cat.X = adata_cat.layers['counts'].copy()

In [None]:
# Give every cell a running integer id (0 .. n_obs - 1) in its current order.
adata_cat.obs['sc_cell_ids'] = list(range(adata_cat.n_obs))

In [None]:
# Add the split columns via the project helper, using donor as covariate,
# tissue as condition and 'Heart' as the stimulated condition; the seed is
# fixed at 42.
split_kwargs = dict(
    cov_key='donor',
    cond_key='tissue',
    stim_name='Heart',
    random_state=42,
)
adata_split = pt.create_split_cols(adata=adata_cat, **split_kwargs)

In [None]:
# Save the preprocessed dataset.
output_path = '../../preprocessed_datasets/myocarditis_org_pre_steroid.h5ad'
adata_split.write_h5ad(output_path)

In [None]:
import pandas as pd

In [None]:
# Cell counts per tissue (rows) and donor (columns).
pd.crosstab(adata_cat.obs['tissue'], adata_cat.obs['donor'])

In [None]:
# Largest value in X of the saved object.
adata_split.X.max()

In [None]:
# Show X of the saved object.
adata_split.X

In [None]:
# Number of cells per tissue in the saved object.
adata_split.obs['tissue'].value_counts()