In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

# Read blood & heart original data

In [2]:
adata_heart = sc.read_h5ad('/data/Experiments/Benchmark/scdisentangle/Datasets/original_datasets/Myocarditis/GSE228597_combined_tissue_data.h5ad')
adata_blood = sc.read_h5ad('/data/Experiments/Benchmark/scdisentangle/Datasets/original_datasets/Myocarditis/GSE228597_combined_pbmc_data.h5ad')

adata_heart.X = adata_heart.raw.X.toarray().copy()
adata_blood.X = adata_blood.raw.X.toarray().copy()

# Rename donor

In [3]:
sample_ids = adata_blood.obs['sample_id'].tolist()
donors = [x.split('_')[0] + '_' + x.split('_')[1] if len(x.split('_')) > 1 else x for x in sample_ids]
adata_blood.obs['donor'] = donors

# Subset to CD8 and NK

In [4]:
adata_blood = adata_blood[adata_blood.obs['lineage'] == 'CD8 and NK']
adata_heart = adata_heart[adata_heart.obs['lineage_names'] == 'T and NK cells']

adata_blood.shape, adata_heart.shape

((134325, 26425), (7781, 28034))

In [5]:
adata_blood.shape, adata_heart.shape

((134325, 26425), (7781, 28034))

# Subset to myocarditis

In [6]:
adata_blood = adata_blood[adata_blood.obs['condition'] == 'myocarditis']
adata_heart = adata_heart[adata_heart.obs['condition'] == 'myocarditis']

adata_blood.shape, adata_heart.shape

((83080, 26425), (4133, 28034))

# Rename T & NK subsets in heart

In [7]:
cd4_markers = ["IL7R", "CCR7", "LST1", "SELL"]
cd8_markers = ["CD8A", "CD8B", "GZMH", "PRF1", "NKG7"]
nk_markers = ["NCAM1", "KLRD1", "KLRB1", "FCGR3A", "NKG7"]

In [8]:
sc.tl.score_genes(adata_heart, cd4_markers, score_name="cd4_score")
sc.tl.score_genes(adata_heart, cd8_markers, score_name="cd8_score")
sc.tl.score_genes(adata_heart, nk_markers,  score_name="nk_score")

  adata.obs[score_name] = pd.Series(


In [9]:
thresh = 0.05  # adjust by inspecting score histograms
scores = adata_heart.obs[["cd4_score", "cd8_score", "nk_score"]]
labels = scores.idxmax(axis=1)
labels[scores.max(axis=1) < thresh] = "ambiguous"
adata_heart.obs["t_nk_subtype"] = labels

# Subset to annotated CD8 & NK

In [10]:
#adata_heart = adata_heart[adata_heart.obs['t_nk_subtype'].isin(['cd8_score', 'nk_score'])]

In [11]:
#adata_heart.obs['t_nk_subtype'].value_counts()

# Subset blood to pre steroid

In [12]:
adata_blood = adata_blood[adata_blood.obs['timepoint_cat'].isin(['pre_steroid', 'post_steroid'])]

In [13]:
adata_blood.obs['on_steroids'] = ['False'] * adata_blood.shape[0]
adata_blood.obs['on_steroids'][adata_blood.obs['timepoint_cat'] == 'post_steroid'] = 'True'

  adata_blood.obs['on_steroids'] = ['False'] * adata_blood.shape[0]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  adata_blood.obs['on_steroids'][adata_blood.obs['timepoint_cat'] == 'post_steroid'] = 'True'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:

In [14]:
adata_blood.obs['on_steroids'] = adata_blood.obs['on_steroids'].astype('category')

In [15]:
adata_blood.obs['on_steroids'].value_counts()

on_steroids
True     34690
False    33523
Name: count, dtype: int64

In [16]:
adata_blood = adata_blood[adata_blood.obs['on_steroids'] == 'False'].copy()
adata_heart = adata_heart[adata_heart.obs['on_steroids'] == 'False'].copy()

# Rename tissue

In [17]:
adata_blood.obs['tissue'] = ['Blood'] * adata_blood.shape[0]
adata_blood.obs['tissue'] = adata_blood.obs['tissue'].astype('category')

adata_heart.obs['tissue'] = ['Heart'] * adata_heart.shape[0]
adata_heart.obs['tissue'] = adata_heart.obs['tissue'].astype('category')

# Create cell type column

In [18]:
adata_heart.obs['cell_type'] = adata_heart.obs['t_nk_subtype'].copy()

adata_heart.obs['cell_type'] = adata_heart.obs['cell_type'].replace({
    'cd8_score': 'CD8',
    'nk_score': 'NK'
})

adata_heart.obs['cell_type'] = adata_heart.obs['cell_type'].astype('category')
adata_heart.obs['cell_type'].value_counts()

cell_type
CD8          1280
ambiguous     384
NK            378
cd4_score     266
Name: count, dtype: int64

In [19]:
b_ctypes = adata_blood.obs['cluster_name'].tolist()
b_ctypes = [x.split(':')[0].replace('b-', '') for x in b_ctypes]

adata_blood.obs['cell_type'] = b_ctypes
adata_blood.obs['cell_type'] = adata_blood.obs['cell_type'].astype('category')

adata_blood.obs['cell_type'].value_counts()

cell_type
CD8        21516
NK          8990
MAIT        1727
CD8T/NK     1290
Name: count, dtype: int64

# Subset obs columns

In [20]:
adata_blood.obs = adata_blood.obs[['tissue', 'on_steroids', 'donor', 'cell_type']]
adata_heart.obs = adata_heart.obs[['tissue', 'on_steroids', 'donor', 'cell_type']]

# Keep common genes

In [21]:
intersection_genes = set(adata_blood.var_names) & set(adata_heart.var_names)
adata_blood = adata_blood[:, list(intersection_genes)].copy()
adata_heart = adata_heart[:, list(intersection_genes)].copy()

In [22]:
assert adata_blood.var_names.tolist() == adata_heart.var_names.tolist()

# Keep common donors

In [23]:
pbmc_donors = np.unique(adata_blood.obs['donor'].tolist())
heart_donors = np.unique(adata_heart.obs['donor'].tolist())
common_donors = [x for x in pbmc_donors if x in heart_donors]

len(common_donors), len(pbmc_donors), len(heart_donors)

(9, 17, 11)

In [24]:
adata_blood = adata_blood[adata_blood.obs['donor'].isin(common_donors)]
adata_heart = adata_heart[adata_heart.obs['donor'].isin(common_donors)]

# Concatenate data

In [25]:
adata_cat = adata_heart.concatenate(adata_blood)

  adata_cat = adata_heart.concatenate(adata_blood)


In [26]:
adata_cat

AnnData object with n_obs × n_vars = 11299 × 25196
    obs: 'tissue', 'on_steroids', 'donor', 'cell_type', 'batch'
    var: 'n_cells-0', 'percent_cells-0', 'robust-0', 'highly_variable_features-0', 'n_cells-1', 'percent_cells-1', 'robust-1', 'highly_variable_features-1', 'featureid-1'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'

In [27]:
adata_cat.X.max(), adata_cat.X.min()

(931.0, 0.0)

In [28]:
adata_cat.layers['counts'] = adata_cat.X.copy()

# Filter

In [29]:
adata_cat.uns = {}

sc.pp.filter_cells(adata_cat, min_counts=100)
sc.pp.filter_genes(adata_cat, min_counts=5)

# Normalize

In [30]:
sc.pp.normalize_total(adata_cat, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata_cat)

adata_cat.shape, adata_cat.X.max(), adata_cat.X.min()

((11299, 14561), 7.8459435, 0.0)

In [31]:
adata_cat.obs['placeholder'] = ['1'] * adata_cat.shape[0]

In [32]:
adata_cat

AnnData object with n_obs × n_vars = 11299 × 14561
    obs: 'tissue', 'on_steroids', 'donor', 'cell_type', 'batch', 'n_counts', 'placeholder'
    var: 'n_cells-0', 'percent_cells-0', 'robust-0', 'highly_variable_features-0', 'n_cells-1', 'percent_cells-1', 'robust-1', 'highly_variable_features-1', 'featureid-1', 'n_counts'
    uns: 'log1p'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

# HVG

In [33]:
sc.pp.highly_variable_genes(adata_cat, n_top_genes=5000)
adata_cat = adata_cat[:, adata_cat.var['highly_variable']]

In [34]:
adata_cat.X = adata_cat.X.toarray()

In [35]:
adata_cat.X.max(), adata_cat.X.min()

(array(7.8459435, dtype=float32), array(0., dtype=float32))

In [36]:
import sys
sys.path.append('../')
import preprocessing_tools as pt

# DEGs

In [37]:
tissue_degs = pt.compute_degs(
    adata_cat, 
    cov_key='donor', 
    cond_key='tissue', 
    stim_name='Heart', 
    control_name='Blood',
    condition_names=['Blood', 'Heart'],
    synergy=False
    )

adata_cat.uns['rank_genes_groups_tissue'] = {
    'Heart' :tissue_degs,
}

  adata_cat.uns['rank_genes_groups_tissue'] = {


In [38]:
treatment_degs = pt.compute_degs(
    adata_cat, 
    cov_key='tissue', 
    cond_key='on_steroids', 
    stim_name='True', 
    control_name='False',
    condition_names=['True', 'False'],
    synergy=False
    )

adata_cat.uns['rank_genes_groups_on_steroids'] = {
    'True' :treatment_degs,
}

ERROR in  Heart SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  Blood SKIPPING.. 'NoneType' object has no attribute 'columns'
ERROR in  all SKIPPING.. 'NoneType' object has no attribute 'columns'


In [39]:
adata_cat.X = adata_cat.layers['counts'].copy()

In [40]:
adata_cat.obs['sc_cell_ids'] = list(range(adata_cat.shape[0]))

In [41]:
adata_split = pt.create_split_cols(
    adata=adata_cat, 
    cov_key='donor', 
    cond_key='tissue', 
    stim_name='Heart',
    random_state=42
)

In [42]:
adata_split.write_h5ad('../../preprocessed_datasets/myocarditis_org_pre_steroid.h5ad')

In [43]:
import pandas as pd

In [44]:
pd.crosstab(adata_cat.obs['tissue'], adata_cat.obs['donor'])

donor,SIC_48,SIC_153,SIC_164,SIC_171,SIC_197,SIC_199,SIC_217,SIC_258,SIC_264
tissue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Blood,1624,3816,466,879,440,399,295,1657,158
Heart,59,92,79,500,14,49,67,164,541


In [45]:
adata_split.X.max()

931.0

In [48]:
'","'.join(adata_split.obs['donor'].unique().tolist())

'SIC_258","SIC_199","SIC_264","SIC_48","SIC_197","SIC_164","SIC_217","SIC_153","SIC_171'

In [None]:
adata_split.X

In [None]:
adata_split.obs['tissue'].value_counts()