In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

# Read blood & heart original data

In [2]:
def _read_with_raw_in_x(path):
    """Read an .h5ad file and replace `.X` with the dense form of `.raw.X`.

    Parameters
    ----------
    path : str
        Location of the .h5ad file.

    Returns
    -------
    AnnData
        The loaded object, with `.X` set to a dense array built from `.raw.X`
        (presumably the raw counts -- confirm against the dataset description).
    """
    adata = sc.read_h5ad(path)
    # toarray() already allocates a fresh dense array; a further .copy() would
    # only hold a second full-size matrix in memory for no benefit.
    adata.X = adata.raw.X.toarray()
    return adata


adata_heart = _read_with_raw_in_x(
    '../../original_datasets/Myocarditis/GSE228597_combined_tissue_data.h5ad'
)
adata_blood = _read_with_raw_in_x(
    '../../original_datasets/Myocarditis/GSE228597_combined_pbmc_data.h5ad'
)

# Derive blood donor IDs from `sample_id`

In [3]:
# Donor ID = first two underscore-separated fields of the sample ID
# (a sample ID without an underscore is kept unchanged).
sample_ids = adata_blood.obs['sample_id'].tolist()
donors = ['_'.join(sample_id.split('_')[:2]) for sample_id in sample_ids]
adata_blood.obs['donor'] = donors

# Subset to CD8 and NK

In [4]:
# Boolean masks for the lineages of interest in each tissue
blood_is_cd8_nk = adata_blood.obs['lineage'] == 'CD8 and NK'
heart_is_t_nk = adata_heart.obs['lineage_names'] == 'T and NK cells'

# Materialise the subsets as independent objects rather than views
adata_blood = adata_blood[blood_is_cd8_nk].copy()
adata_heart = adata_heart[heart_is_t_nk].copy()

adata_blood.shape, adata_heart.shape

((134325, 26425), (7781, 28034))

In [5]:
# Display the shapes of the blood and heart objects (same expression that ends the previous cell).
adata_blood.shape, adata_heart.shape

((134325, 26425), (7781, 28034))

# Subset to myocarditis

In [6]:
# Keep only cells whose `condition` is 'myocarditis'.
# `.copy()` turns the subsets into real AnnData objects instead of views, so
# later writes to `.obs` (marker scores, labels) do not trigger an implicit
# view-to-copy conversion and its ImplicitModificationWarning.
adata_blood = adata_blood[adata_blood.obs['condition'] == 'myocarditis'].copy()
adata_heart = adata_heart[adata_heart.obs['condition'] == 'myocarditis'].copy()

adata_blood.shape, adata_heart.shape

((83080, 26425), (4133, 28034))

# Annotate CD8 T and NK subsets in heart

In [7]:
# Marker panels used to score the heart T/NK compartment.
# T vs NK separation relies on core T-cell genes (CD3 / TRAC / LCK / TRBC*) rather than
# IL7R / LTB, because cytotoxic CD8 T cells can otherwise be mis-called as NK.

# Core T program (shared by CD4 and CD8 T cells)
t_markers = "CD3D CD3E TRAC LCK TRBC1 TRBC2".split()

# NK program (CD3-negative)
nk_markers = "GNLY KLRD1 FCER1G TYROBP FCGR3A NKG7".split()

# Optional CD4 vs CD8 split within T cells
cd4_markers = "IL7R CCR7 LTB MAL LEF1".split()
cd8_markers = "CD8A CD8B CTSW GZMB PRF1".split()

In [8]:
# Per-cell marker scores: mean log1p(normalized) expression of each marker panel.
# NOTE: normalize_total runs on a copy restricted to the marker genes, so every cell is
# scaled to 1e4 over the marker genes only (not over its full library size); the gating
# thresholds apply on that scale. `adata_heart.X` itself is never modified.

# Gene names available in this AnnData
var = set(adata_heart.var_names)

def _present(genes):
    """Return the members of `genes` that occur in `var`, keeping the input order."""
    return [g for g in genes if g in var]

t_markers_in, nk_markers_in, cd8_markers_in = (
    _present(panel) for panel in (t_markers, nk_markers, cd8_markers)
)

# Report how many genes of each panel were found, and which are missing
for label, wanted, found in (
    ('T', t_markers, t_markers_in),
    ('NK', nk_markers, nk_markers_in),
    ('CD8', cd8_markers, cd8_markers_in),
):
    print(f'{label} markers present:', len(found), '/', len(wanted), 'missing:', sorted(set(wanted) - set(found)))

use_genes = sorted({*t_markers_in, *nk_markers_in, *cd8_markers_in})

# Normalize + log1p on a marker-only copy
tmp = adata_heart[:, use_genes].copy()
sc.pp.normalize_total(tmp, target_sum=1e4)
sc.pp.log1p(tmp)

# Dense cells x markers table
X = tmp.X.toarray() if hasattr(tmp.X, 'toarray') else tmp.X
expr = pd.DataFrame(X, columns=use_genes, index=tmp.obs_names)

def _mean_score(gs):
    """Row-wise mean of `expr` over the genes in `gs`; all zeros when `gs` is empty."""
    return expr[gs].mean(axis=1) if len(gs) > 0 else pd.Series(0.0, index=expr.index)

for score_col, panel in (
    ('t_score', t_markers_in),
    ('nk_score', nk_markers_in),
    ('cd8_score', cd8_markers_in),
):
    adata_heart.obs[score_col] = _mean_score(panel).values

# Keep CD8A on its own for gating, when the gene is available
if 'CD8A' in expr.columns:
    adata_heart.obs['CD8A_expr'] = expr['CD8A'].values

T markers present: 6 / 6 missing: []
NK markers present: 6 / 6 missing: []
CD8 markers present: 5 / 5 missing: []


  return fn(*args_all, **kw)
  adata_heart.obs['t_score'] = _mean_score(t_markers_in).values


In [9]:
# Gate CD8 vs NK (tune thresholds as needed).
# All thresholds are on the scale of the scores stored in `adata_heart.obs`.
T_SCORE_THRESH = 2.0
NK_SCORE_THRESH = 2.0
# Dedicated threshold for the CD8 program score, so that tuning the NK gate does
# not silently move the fallback CD8 gate (it previously reused NK_SCORE_THRESH).
CD8_SCORE_THRESH = 2.0
CD8A_THRESH = 0.5  # on log1p normalized expression; ignored if CD8A missing

s_t = adata_heart.obs['t_score']
s_nk = adata_heart.obs['nk_score']
s_cd8 = adata_heart.obs['cd8_score']

# Every cell starts as 'ambiguous'; cells exactly on a threshold stay ambiguous
lab = pd.Series('ambiguous', index=adata_heart.obs_names, dtype='object')

# NK: strong NK program AND weak T program
lab.loc[(s_nk > NK_SCORE_THRESH) & (s_t < T_SCORE_THRESH)] = 'NK'

# CD8: strong T program AND CD8A expressed; if CD8A was not scored,
# fall back to requiring a high CD8 program score instead.
if 'CD8A_expr' in adata_heart.obs.columns:
    lab.loc[(s_t > T_SCORE_THRESH) & (adata_heart.obs['CD8A_expr'] > CD8A_THRESH)] = 'CD8'
else:
    lab.loc[(s_t > T_SCORE_THRESH) & (s_cd8 > CD8_SCORE_THRESH)] = 'CD8'

adata_heart.obs['t_nk_subtype'] = lab.astype('category')
adata_heart.obs['t_nk_subtype'].value_counts()

t_nk_subtype
ambiguous    1771
CD8          1646
NK            716
Name: count, dtype: int64

# Subset to annotated CD8 & NK

In [10]:
# Keep only the heart cells that were labelled CD8 or NK (drops 'ambiguous')
adata_heart = adata_heart[adata_heart.obs['t_nk_subtype'].isin(['CD8', 'NK'])].copy()
print(adata_heart.obs['t_nk_subtype'].value_counts())

# Quick sanity: do assigned groups express the expected canonical markers?
markers_to_check = ["CD3D", "TRAC", "CD8A", "CD8B", "NKG7", "GNLY", "KLRD1", "FCGR3A"]
markers_to_check = [g for g in markers_to_check if g in adata_heart.var_names]

if len(markers_to_check) > 0:
    tmp = adata_heart[:, markers_to_check].copy()
    # normalize/log1p just for this diagnostic table
    # (normalization is over the diagnostic markers only, not the full library size)
    sc.pp.normalize_total(tmp, target_sum=1e4)
    sc.pp.log1p(tmp)
    X = tmp.X
    if hasattr(X, 'toarray'):
        X = X.toarray()
    means = pd.DataFrame(X, columns=markers_to_check, index=tmp.obs_names)
    means['t_nk_subtype'] = tmp.obs['t_nk_subtype'].values
    print('\nMarker means by assigned subtype (log1p norm):')
    # observed=True is passed explicitly: the grouping key is categorical, and relying on
    # the default raises a FutureWarning in recent pandas. Only present labels are listed.
    print(means.groupby('t_nk_subtype', observed=True)[markers_to_check].mean().round(3))
else:
    print('No diagnostic markers found in var_names; cannot sanity-check assignments.')

t_nk_subtype
CD8    1646
NK      716
Name: count, dtype: int64

Marker means by assigned subtype (log1p norm):
               CD3D   TRAC   CD8A   CD8B   NKG7   GNLY  KLRD1  FCGR3A
t_nk_subtype                                                         
CD8           6.324  3.265  7.159  4.329  8.156  3.316  2.861   0.686
NK            0.535  0.209  1.305  0.493  8.257  7.209  4.035   2.332


  print(means.groupby('t_nk_subtype')[markers_to_check].mean().round(3))


# Subset blood to pre and post_steroid

In [11]:
# Keep blood cells sampled pre- or post-steroid. `.copy()` makes this a real AnnData
# rather than a view, so adding `.obs` columns afterwards does not write to a view.
adata_blood = adata_blood[adata_blood.obs['timepoint_cat'].isin(['pre_steroid', 'post_steroid'])].copy()

In [12]:
# String flag ('True' / 'False') marking post-steroid cells, stored as a categorical
is_post_steroid = adata_blood.obs['timepoint_cat'] == 'post_steroid'
adata_blood.obs['on_steroids'] = (
    is_post_steroid.map({True: 'True', False: 'False'}).astype('category')
)

adata_blood.obs['on_steroids'].value_counts()

  adata_blood.obs['on_steroids'] = 'False'


on_steroids
True     34690
False    33523
Name: count, dtype: int64

# Add tissue label

In [13]:
# Constant categorical tissue label for every cell of each object
adata_blood.obs['tissue'] = pd.Categorical(['Blood'] * adata_blood.n_obs)
adata_heart.obs['tissue'] = pd.Categorical(['Heart'] * adata_heart.n_obs)

# Create cell type column

In [14]:
# Heart cell type is the gated CD8 / NK label, kept as a categorical column
adata_heart.obs['cell_type'] = adata_heart.obs['t_nk_subtype'].astype('category')
adata_heart.obs['cell_type'].value_counts()

cell_type
CD8    1646
NK      716
Name: count, dtype: int64

In [15]:
# Count cells per blood cluster annotation; these names are parsed into broad labels in the next cell.
adata_blood.obs['cluster_name'].value_counts()

cluster_name
b-CD8: ZNF683, GZMB         14164
b-NK: SPON2, FGFBP2         12711
b-CD8: CCL5, GNLY            8205
b-CD8: GZMK, DUSP2           7224
b-CD8: CCR7, CD45RA-prot     5848
b-MAIT: TRAV1-2, KLRB1       4413
b-CD8: GZMK, TCF7            3968
b-CD8: CX3CR1, TBX21         3487
b-CD8: cycling               2877
b-CD8T/NK: MT-high           2833
b-NK: XCL1, SPTSSB           2261
b-NK: cycling                 222
Name: count, dtype: int64

In [16]:
def _broad_label(prefix):
    """Map a cluster-name prefix to 'NK', 'CD8' or 'other'.

    A prefix starting with 'NK' is NK; otherwise any prefix containing 'CD8' is CD8.
    NOTE(review): the mixed 'CD8T/NK' (MT-high) cluster therefore counts as CD8 -- confirm
    this is intended.
    """
    prefix = str(prefix)
    if prefix.startswith('NK'):
        return 'NK'
    if 'CD8' in prefix:
        return 'CD8'
    return 'other'

# Cluster names look like 'b-<type>: <markers>'; keep the part before ':' without 'b-'
b_ctypes_raw = [
    name.split(':')[0].replace('b-', '')
    for name in adata_blood.obs['cluster_name'].tolist()
]

# Harmonize to the same broad labels as heart
b_ctypes = [_broad_label(prefix) for prefix in b_ctypes_raw]

adata_blood.obs['cell_type'] = pd.Categorical(b_ctypes)

# Keep only the shared evaluation classes
is_shared_class = adata_blood.obs['cell_type'].isin(['CD8', 'NK'])
adata_blood = adata_blood[is_shared_class].copy()

adata_blood.obs['cell_type'].value_counts()

cell_type
CD8    48606
NK     15194
Name: count, dtype: int64

# Optional: focus on a single shared cell type (recommended to debug)

In [17]:
FOCUS_CELL_TYPE = "CD8"  # set to "NK" or None

# When a focus type is set, restrict both tissues to that cell type; None keeps both types
if FOCUS_CELL_TYPE is not None:
    blood_in_focus = adata_blood.obs['cell_type'] == FOCUS_CELL_TYPE
    heart_in_focus = adata_heart.obs['cell_type'] == FOCUS_CELL_TYPE
    adata_blood = adata_blood[blood_in_focus].copy()
    adata_heart = adata_heart[heart_in_focus].copy()

adata_blood.shape, adata_heart.shape

((48606, 26425), (1646, 28034))

# Subset obs columns

In [18]:
# Metadata columns shared by both tissues; everything else is dropped.
# NOTE(review): 'on_steroids' and 'donor' are not created for heart in this notebook,
# so they are presumably already present in the heart input file -- confirm.
obs_cols = ['tissue', 'on_steroids', 'donor', 'cell_type']
adata_blood.obs = adata_blood.obs[obs_cols]
adata_heart.obs = adata_heart.obs[obs_cols]

# Keep common genes

In [19]:
# Genes measured in both tissues, in sorted order so both objects share one gene order
shared_gene_set = set(adata_blood.var_names).intersection(adata_heart.var_names)
intersection_genes = sorted(shared_gene_set)

adata_blood = adata_blood[:, intersection_genes].copy()
adata_heart = adata_heart[:, intersection_genes].copy()

In [20]:
# Both objects must list the same genes in the same order
assert list(adata_blood.var_names) == list(adata_heart.var_names)

# Keep common donors

In [21]:
# Sorted unique donor IDs per tissue
pbmc_donors = np.unique(adata_blood.obs['donor'].tolist())
heart_donors = np.unique(adata_heart.obs['donor'].tolist())

# Donors present in both tissues, in the order of `pbmc_donors`
heart_donor_set = set(heart_donors)
common_donors = [donor for donor in pbmc_donors if donor in heart_donor_set]

len(common_donors), len(pbmc_donors), len(heart_donors)

(12, 24, 14)

In [22]:
# Restrict each tissue to donors sampled in both (the results are AnnData views)
blood_has_common_donor = adata_blood.obs['donor'].isin(common_donors)
heart_has_common_donor = adata_heart.obs['donor'].isin(common_donors)
adata_blood = adata_blood[blood_has_common_donor]
adata_heart = adata_heart[heart_has_common_donor]

In [23]:
# Display the shapes of both objects after restricting to common genes and common donors.
adata_blood.shape, adata_heart.shape

((21472, 25196), (1280, 25196))

# Concatenate data

In [24]:
# Stack heart (first) and blood (second) cells into one AnnData.
# NOTE(review): `AnnData.concatenate` is deprecated in favour of `anndata.concat`; the
# replacement does not reproduce the added 'batch' column and the '-0'/'-1' suffixed var
# columns by default, so migrate deliberately and re-check downstream outputs.
adata_cat = adata_heart.concatenate(adata_blood)

  adata_cat = adata_heart.concatenate(adata_blood)


In [25]:
# Balance Heart vs Blood within each donor (downsample the larger tissue per donor)
import numpy as np

rng = np.random.default_rng(0)  # fixed seed so the subsample is reproducible
keep = []

donor_of = adata_cat.obs["donor"]
is_heart = adata_cat.obs["tissue"] == "Heart"
is_blood = adata_cat.obs["tissue"] == "Blood"

for d in donor_of.unique():
    in_donor = donor_of == d
    heart = adata_cat.obs_names[in_donor & is_heart]
    blood = adata_cat.obs_names[in_donor & is_blood]
    n = min(len(heart), len(blood))
    if n > 0:
        # Draw heart first, then blood, n cells each without replacement
        for names in (heart, blood):
            keep.extend(rng.choice(names, size=n, replace=False))
    # donors missing one tissue contribute no cells

adata_cat = adata_cat[keep].copy()

In [26]:
# Display the max and min of X before it is stored as the 'counts' layer and normalized.
adata_cat.X.max(), adata_cat.X.min()

(670.0, 0.0)

In [27]:
# Keep an untouched copy of the current X in layers['counts']; X itself is normalized later.
adata_cat.layers['counts'] = adata_cat.X.copy()

# Filter

In [28]:
# Discard any unstructured annotations carried over from the inputs.
adata_cat.uns = {}

# Drop cells with fewer than 100 total counts, then genes with fewer than 5 total counts.
# NOTE(review): this runs after the per-donor tissue balancing, so removing cells here
# could in principle unbalance the tissues again -- verify with the donor x tissue crosstab.
sc.pp.filter_cells(adata_cat, min_counts=100)
sc.pp.filter_genes(adata_cat, min_counts=5)

# Normalize

In [29]:
# Scale each cell to a total of 1e4 and log1p-transform X in place.
# exclude_highly_expressed=True asks scanpy to leave very highly expressed genes out of
# the per-cell size factor (see the scanpy normalize_total documentation).
sc.pp.normalize_total(
    adata_cat, 
    target_sum=1e4, 
    exclude_highly_expressed=True
    )
sc.pp.log1p(adata_cat)

# Display the shape and value range after normalization
adata_cat.shape, adata_cat.X.max(), adata_cat.X.min()

((1332, 11491), 8.0222, 0.0)

In [30]:
# Constant string column with the same value '1' for every cell
adata_cat.obs['placeholder'] = '1'

In [31]:
# Display the AnnData summary (dimensions and obs/var/uns/obsm/layers keys).
adata_cat

AnnData object with n_obs × n_vars = 1332 × 11491
    obs: 'tissue', 'on_steroids', 'donor', 'cell_type', 'batch', 'n_counts', 'placeholder'
    var: 'n_cells-0', 'percent_cells-0', 'robust-0', 'highly_variable_features-0', 'n_cells-1', 'percent_cells-1', 'robust-1', 'highly_variable_features-1', 'featureid-1', 'n_counts'
    uns: 'log1p'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

# HVG

In [32]:
# Flag the 5000 most variable genes (computed on the log-normalized X) and keep only those.
sc.pp.highly_variable_genes(
    adata_cat,
    n_top_genes=5000
    )
# `.copy()` makes the HVG subset a real AnnData instead of a view, so the later
# assignments to `.X` and `.uns` act on this object and not on a view of the parent.
adata_cat = adata_cat[:, adata_cat.var['highly_variable']].copy()

In [33]:
# Densify X only when it is still sparse; a dense array has no `.toarray()` and an
# unconditional call would raise AttributeError on re-runs or dense inputs.
if hasattr(adata_cat.X, 'toarray'):
    adata_cat.X = adata_cat.X.toarray()

In [34]:
# Display the max and min of the (log-normalized, HVG-restricted) X.
adata_cat.X.max(), adata_cat.X.min()

(array(8.0222, dtype=float32), array(0., dtype=float32))

In [35]:
import sys
sys.path.append('../')
import preprocessing_tools as pt

# DEGs

In [36]:
# Run the project helper `pt.compute_degs` for Heart (stim) vs Blood (control) with the
# Wilcoxon method, using 'donor' as the covariate key. At this point X holds the
# log-normalized values; counts are restored into X only afterwards.
# NOTE(review): the helper's exact semantics and return type are defined in
# preprocessing_tools and are not visible here -- confirm before relying on them.
tissue_degs = pt.compute_degs(
    adata_cat, 
    cov_key='donor', 
    cond_key='tissue', 
    stim_name='Heart', 
    control_name='Blood',
    condition_names=['Blood', 'Heart'],
    synergy=False,
    method='wilcoxon'
    )

# Store the result under the stimulated condition's name
adata_cat.uns['rank_genes_groups_tissue'] = {
    'Heart' :tissue_degs,
}

  adata_cat.uns['rank_genes_groups_tissue'] = {


In [37]:
# Replace the log-normalized X with a copy of the matrix kept in layers['counts'].
adata_cat.X = adata_cat.layers['counts'].copy()

In [38]:
# Give every cell a running integer ID (0 .. n_obs-1) in its current row order.
adata_cat.obs['sc_cell_ids'] = list(range(adata_cat.shape[0]))

In [39]:
# Call the project helper `pt.create_split_cols` with donor as covariate and tissue as
# condition; in the displayed result it adds one 'split_Heart_<donor>' obs column per donor.
# NOTE(review): the meaning of the split values is defined in preprocessing_tools -- confirm.
adata_split = pt.create_split_cols(
    adata=adata_cat, 
    cov_key='donor', 
    cond_key='tissue', 
    stim_name='Heart',
    random_state=42
)

In [40]:
# Display the AnnData summary of the object returned by `pt.create_split_cols`.
adata_split

AnnData object with n_obs × n_vars = 1332 × 5000
    obs: 'tissue', 'on_steroids', 'donor', 'cell_type', 'batch', 'n_counts', 'placeholder', 'sc_cell_ids', 'split_Heart_SIC_153', 'split_Heart_SIC_164', 'split_Heart_SIC_171', 'split_Heart_SIC_175', 'split_Heart_SIC_177', 'split_Heart_SIC_197', 'split_Heart_SIC_199', 'split_Heart_SIC_217', 'split_Heart_SIC_232', 'split_Heart_SIC_258', 'split_Heart_SIC_264', 'split_Heart_SIC_48'
    var: 'n_cells-0', 'percent_cells-0', 'robust-0', 'highly_variable_features-0', 'n_cells-1', 'percent_cells-1', 'robust-1', 'highly_variable_features-1', 'featureid-1', 'n_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg', 'rank_genes_groups_tissue'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    layers: 'counts'

In [41]:
import pandas as pd

In [42]:
# Cell counts per donor and tissue; the Heart and Blood counts should match for each donor after balancing.
pd.crosstab(adata_split.obs['donor'], adata_split.obs['tissue'])

tissue,Blood,Heart
donor,Unnamed: 1_level_1,Unnamed: 2_level_1
SIC_153,2,2
SIC_164,5,5
SIC_171,116,116
SIC_175,8,8
SIC_177,46,46
SIC_197,4,4
SIC_199,9,9
SIC_217,5,5
SIC_232,14,14
SIC_258,28,28


In [None]:
['SIC_258', 'SIC_264', 'SIC_171', 'SIC_177']

In [43]:
# Destination of the preprocessed dataset, relative to this notebook's directory.
out_path = "../../preprocessed_datasets/myocarditis_balanced_5k.h5ad"

# Write the object with split columns to disk as .h5ad
adata_split.write_h5ad(out_path)

In [None]:
import pandas as pd

In [None]:
pd.crosstab(adata_cat.obs['tissue'], adata_cat.obs['donor'])

In [None]:
adata_split.X.max()

In [None]:
adata_split.X

In [None]:
adata_split.obs['tissue'].value_counts()

In [None]:
adata_split.obs['donor']