In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata
import sys

sys.path.append('../')
import preprocessing_tools as pt

In [2]:
# Directory (relative to this notebook) from which the source .h5ad files are read.
data_path = "../../original_datasets/Tabula"

In [3]:
# Load the two source AnnData objects from data_path.
# NOTE(review): the files are identified only by UUID; presumably these are the
# human ("sapiens") and mouse ("muris") datasets — confirm provenance.
adata_sapiens = sc.read_h5ad(f'{data_path}/40f8b1a3-9f76-4ac4-8761-32078555ed4e.h5ad')
adata_muris = sc.read_h5ad(f'{data_path}/b65b771c-baae-4b03-befc-3db63e6b9fd0.h5ad')

In [4]:
# Display the structure of the mouse object (dimensions and annotation slots).
adata_muris

AnnData object with n_obs × n_vars = 24540 × 17943
    obs: 'age', 'cell', 'free_annotation', 'method', 'donor_id', 'n_genes', 'subtissue', 'n_counts', 'louvain', 'leiden', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'n_cells', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'age_colors', 'citation', 'leiden', 'louvain', 'neighbors', 'organism', 'organism_ontology_term_id', 'pca', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities'

In [5]:
# Largest value of X in each object.
# NOTE(review): small maxima here would suggest X is already transformed rather
# than raw counts — confirm before relying on it.
adata_muris.X.max(), adata_sapiens.X.max()

(10.0, 8.976758)

In [6]:
# Replace X in both objects with the matrix stored under .raw, so the following
# steps operate on the .raw values instead of the matrix shipped in X.
for _adata in (adata_muris, adata_sapiens):
    _adata.X = _adata.raw.X

In [7]:
# Inspect the matrix now held in X (container type, shape, stored elements).
adata_muris.X

<24540x17943 sparse matrix of type '<class 'numpy.float32'>'
	with 37375133 stored elements in Compressed Sparse Row format>

In [8]:
# Upper-case the gene symbols of both species so that the name-based overlap
# computed from `sapien_genes` and `muris_genes` is case-insensitive.
sapien_genes = [x.upper() for x in adata_sapiens.var['feature_name'].tolist()]
muris_genes = [x.upper() for x in adata_muris.var['feature_name'].tolist()]

# Alias kept for backward compatibility: the upper-cased human list used to be
# stored only under this differently spelled name, while the overlap
# computation read the non-upper-cased `sapien_genes`.
sapiens_genes = sapien_genes

In [9]:
# Human gene symbols that also appear in the mouse list, in human order.
# Membership is tested against a set so the scan is linear instead of the
# quadratic list-vs-list lookup.
muris_gene_set = set(muris_genes)
overlap = [x for x in sapien_genes if x in muris_gene_set]

In [10]:
# Number of human gene symbols that also occur in the mouse gene list.
len(overlap)

14671

In [11]:
! wget https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt

--2025-12-12 02:43:46--  https://www.informatics.jax.org/downloads/reports/HOM_MouseHumanSequence.rpt
Resolving www.informatics.jax.org (www.informatics.jax.org)... 34.102.200.12
Connecting to www.informatics.jax.org (www.informatics.jax.org)|34.102.200.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15081658 (14M)
Saving to: ‘HOM_MouseHumanSequence.rpt’


2025-12-12 02:43:48 (10.3 MB/s) - ‘HOM_MouseHumanSequence.rpt’ saved [15081658/15081658]



In [12]:
from pathlib import Path
# Local path of the homology report saved by the wget cell.
FILE_HOMOLOGY = Path("HOM_MouseHumanSequence.rpt")
# Values compared against the report's taxon_id column to select mouse / human rows.
mouse_tax, human_tax = "10090", "9606"

In [13]:
# ------------------------------------------------------------------
# 1 · read MGI report ----------------------------------------------
# ------------------------------------------------------------------
# All values are read as strings and only the first len(cols) columns are kept.
# NOTE(review): with header=None a header line in the file (if there is one) is
# read as an ordinary row; it would be discarded by the taxon filters below.
# Confirm the file layout and that these column names match it — only
# homology_id, taxon_id and symbol are used in this cell.
cols = [
    "homology_id", "organism", "taxon_id", "symbol", "entrez_id",
    "marker_type", "synonyms", "chromosome", "start", "end", "strand",
    "homology_type"
]
hom = pd.read_csv(FILE_HOMOLOGY, sep="\t", header=None,
                  names=cols, usecols=range(len(cols)), dtype=str)

# ------------------------------------------------------------------
# 2 · keep groups that have BOTH species ---------------------------
# ------------------------------------------------------------------
# A homology group qualifies only when it contains at least one mouse row AND
# at least one human row, hence the intersection of the two id sets (OR-ing the
# two masks would keep groups that contain just one of the species).
has_mouse = hom["taxon_id"] == mouse_tax
has_human = hom["taxon_id"] == human_tax
mouse_ids = pd.Index(hom.loc[has_mouse, "homology_id"].unique())
human_ids = pd.Index(hom.loc[has_human, "homology_id"].unique())
good_ids  = mouse_ids.intersection(human_ids)

mh = hom[hom["homology_id"].isin(good_ids)]

# ------------------------------------------------------------------
# 3 · pick ONE row per species in each group -----------------------
# ------------------------------------------------------------------
# Both results are indexed by homology_id, one row per group.
mouse_rows = (mh[mh["taxon_id"] == mouse_tax]
                .groupby("homology_id").first())  # first = arbitrary but consistent
human_rows = (mh[mh["taxon_id"] == human_tax]
                .groupby("homology_id").first())

In [14]:
# One-column frames holding the symbol chosen for each homology group.
mouse_symbols = mouse_rows[["symbol"]].rename(columns={"symbol": "mouse_symbol"})
human_symbols = human_rows[["symbol"]].rename(columns={"symbol": "human_symbol"})

# Pair mouse and human symbols on the shared homology_id index, then drop
# incomplete and repeated pairs.
orth = (
    mouse_symbols
    .join(human_symbols, how="inner")
    .dropna()
    .drop_duplicates()
)

In [15]:
# Use the gene symbols in var["feature_name"] as var_names for both objects and
# make them unique. The loop variable is deliberately not named `ad`: that name
# is the anndata module alias imported further down, and re-running this cell
# would rebind it to an AnnData object and break the later `ad.concat` call.
for adata_obj in (adata_muris, adata_sapiens):
    adata_obj.var_names = adata_obj.var["feature_name"].tolist()
    adata_obj.var_names_make_unique()



In [16]:
# 1) Filter mouse genes to those present in the ortholog table.
# .copy() materialises the subset, so the var_names renaming applied to
# adata_muris afterwards edits a real AnnData object instead of a view.
keep_mouse = orth["mouse_symbol"]
adata_muris = adata_muris[:, adata_muris.var_names.isin(keep_mouse)].copy()

In [17]:
# 2) Translate each mouse gene symbol to the human symbol paired with it in `orth`.
mapper = {
    mouse_symbol: human_symbol
    for mouse_symbol, human_symbol in zip(orth["mouse_symbol"], orth["human_symbol"])
}
renamed_genes = adata_muris.var_names.map(mapper)
adata_muris.var_names = renamed_genes

In [18]:
# De-duplicate again: the renaming may have produced repeated var_names.
adata_muris.var_names_make_unique()

In [19]:
# 3) Genes the (now-renamed) mouse matrix shares with the human one; used to
#    subset both objects to the same gene set.
common = adata_muris.var_names.intersection(adata_sapiens.var_names)

In [20]:
# Number of genes shared by both objects after ortholog renaming.
len(common)

15383

In [21]:
# Restrict both species to the shared genes, selected in the order of `common`.
adata_muris, adata_sapiens = adata_muris[:, common], adata_sapiens[:, common]

In [22]:
# Index the mouse genes by the human var_names so both objects list the genes
# in the same order.
adata_muris = adata_muris[:, adata_sapiens.var_names]

In [23]:
# obs columns of interest.
# NOTE(review): this list is not referenced by any cell visible in this
# notebook section — confirm whether it is still needed.
obs_names = ['cell_type', 'development_stage', 'sex']

In [24]:
# Sanity check: both objects must list exactly the same genes in the same order.
muris_gene_order = list(adata_muris.var_names)
sapiens_gene_order = list(adata_sapiens.var_names)
assert muris_gene_order == sapiens_gene_order

In [25]:
# Number of mouse cells per annotated cell type.
adata_muris.obs['cell_type'].value_counts()

cell_type
classical monocyte                                      7922
bronchial smooth muscle cell                            2339
intermediate monocyte                                   1749
fibroblast of lung                                      1523
B cell                                                  1501
alveolar macrophage                                     1366
natural killer cell                                     1193
lung macrophage                                         1157
non-classical monocyte                                  1010
CD8-positive, alpha-beta T cell                          870
neutrophil                                               552
CD4-positive, alpha-beta T cell                          551
adventitial cell                                         526
mature NK T cell                                         420
vein endothelial cell                                    320
T cell                                                   251
myeloid dendri

In [26]:
# Number of human cells per annotated cell type.
adata_sapiens.obs['cell_type'].value_counts()

cell_type
macrophage                                16486
pulmonary alveolar type 2 cell            11594
capillary endothelial cell                 7243
basal cell                                 4015
pulmonary alveolar type 1 cell             3116
intermediate monocyte                      2785
CD4-positive, alpha-beta T cell            2124
CD8-positive, alpha-beta T cell            1898
endothelial cell of artery                 1754
club cell                                  1747
classical monocyte                         1569
vein endothelial cell                      1336
basophil                                   1322
lung multiciliated epithelial cell         1209
alveolar adventitial fibroblast            1113
respiratory tract goblet cell              1040
natural killer cell                        1019
pericyte                                    739
B cell                                      663
adventitial cell                            581
non-classical monocyte        

In [27]:
# Distinct cell-type labels annotated in each species.
mice_ctype = adata_muris.obs['cell_type'].unique()
human_ctype = adata_sapiens.obs['cell_type'].unique()

# Labels present in both species. Sorted by their string form so the list order
# is reproducible across kernel sessions instead of following set iteration
# order; a mid-cell `len(...)` expression that displayed nothing was removed.
common_ctypes = sorted(set(mice_ctype) & set(human_ctype), key=str)

In [28]:
# Number of cell types annotated in both species.
len(common_ctypes)

20

In [29]:
# Keep only cells whose cell type is annotated in both species.
# .copy() turns the boolean-indexed views into independent AnnData objects, so
# adding obs columns to them later does not write into a view (which makes
# anndata copy implicitly and emit a warning).
adata_muris = adata_muris[adata_muris.obs['cell_type'].isin(common_ctypes)].copy()
adata_sapiens = adata_sapiens[adata_sapiens.obs['cell_type'].isin(common_ctypes)].copy()

In [30]:
# Display the mouse object after gene and cell-type filtering.
adata_muris

View of AnnData object with n_obs × n_vars = 19694 × 15383
    obs: 'age', 'cell', 'free_annotation', 'method', 'donor_id', 'n_genes', 'subtissue', 'n_counts', 'louvain', 'leiden', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'n_cells', 'means', 'dispersions', 'dispersions_norm', 'highly_variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'age_colors', 'citation', 'leiden', 'louvain', 'neighbors', 'organism', 'organism_ontology_term_id', 'pca', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'
    obsp: 'connect

In [31]:
# Value range of the mouse matrix after the filtering steps.
adata_muris.X.max(), adata_muris.X.min()

(5659.0, 0.0)

In [32]:
# Value range of the human matrix after the filtering steps.
# NOTE(review): compare with the mouse range; a large gap between the two
# maxima presumably reflects different assays/protocols — confirm.
adata_sapiens.X.max(), adata_sapiens.X.min()

(2491817.0, 0.0)

In [33]:
# Tag every cell with its species of origin. The column name 'specie' is the
# value later assigned to condition_key, so its spelling must stay in sync.
adata_sapiens.obs['specie'] = 'sapiens'
adata_muris.obs['specie'] = 'muris'

  adata_sapiens.obs['specie'] = 'sapiens'
  adata_muris.obs['specie'] = 'muris'


In [34]:
import anndata as ad

In [35]:
# obs column names and condition labels passed to the preprocessing_tools helpers.
covariate_key = 'cell_type'   # obs column holding the covariate
condition_key = 'specie'      # obs column holding the condition
stim_name = 'sapiens'         # condition label passed as stim_name
control_name = 'muris'        # condition label passed as control_name

In [36]:
# Stack the human cells followed by the mouse cells into a single object.
# NOTE(review): this relies on the anndata.concat defaults (presumably an inner
# join, so annotations not shared by both inputs are dropped) — confirm that
# this is intended.
adata = ad.concat([adata_sapiens, adata_muris])

In [37]:
# Snapshot the current X as a 'counts' layer and the whole object as .raw, then
# clear uns before any transformation is applied.
adata.layers['counts'] = adata.X.copy()
adata.raw = adata.copy()
adata.uns = {}

# Quality filters: cells need at least 100 counts, genes at least 5.
sc.pp.filter_cells(adata, min_counts=100)
sc.pp.filter_genes(adata, min_counts=5)

# Normalise each cell to a total of 1e4 and apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata)



In [38]:
# Display the combined object after filtering and normalisation.
adata

AnnData object with n_obs × n_vars = 48867 × 15195
    obs: 'donor_id', 'method', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'free_annotation', 'suspension_type', 'tissue_type', 'disease_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'specie', 'n_counts'
    var: 'n_counts'
    uns: 'log1p'
    obsm: 'X_pca', 'X_umap'
    layers: 'counts'

In [39]:
# Select the 5000 most highly variable genes; subset=True restricts adata to them.
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=True)

In [40]:
# Combined "<covariate>_<condition>" label for every cell.
covariate_labels = adata.obs[covariate_key].astype(str)
condition_labels = adata.obs[condition_key].astype(str)
adata.obs['cov_cond'] = covariate_labels + '_' + condition_labels

In [41]:
# Inspect the combined covariate/condition labels.
adata.obs['cov_cond']

TSP2_Lung_proxmedialdistal_SS2_B113373_B134458_Immune_D2     non-classical monocyte_sapiens
TSP2_Lung_proxmedialdistal_SS2_B113373_B134458_Stromal_I2                neutrophil_sapiens
TSP2_Lung_proxmedialdistal_SS2_B113373_B134458_Empty_M12                 neutrophil_sapiens
TSP2_Lung_proxmedialdistal_SS2_B113373_B134458_Stromal_P8                neutrophil_sapiens
TSP2_Lung_proxmedialdistal_SS2_B113373_B134458_Stromal_P2               plasma cell_sapiens
                                                                          ...              
10X_P8_13_TTTATGCCATCCAACA-1                                      natural killer cell_muris
10X_P8_13_TTTCCTCGTAAGGGAA-1                                         adventitial cell_muris
10X_P8_13_TTTGCGCTCAAACCAC-1                                      natural killer cell_muris
10X_P8_13_TTTGGTTTCCTGTACC-1                                      natural killer cell_muris
10X_P8_13_TTTGTCACATATGAGA-1                                         adventitial

In [42]:
# Differential expression between the two conditions via the project helper.
# NOTE(review): compute_degs is defined in preprocessing_tools (not shown
# here); presumably it tests stim_name against control_name within each
# cov_key group using the given method — confirm against its implementation.
stim_degs = pt.compute_degs(
    adata, 
    cov_key=covariate_key, 
    cond_key=condition_key, 
    stim_name=stim_name, 
    control_name=control_name,
    condition_names=[control_name, stim_name],
    method='wilcoxon'
)

In [43]:
# Store the DEG result in uns, keyed first by condition column and then by the
# stimulated condition's name.
degs_key = f'rank_genes_groups_{condition_key}'
adata.uns[degs_key] = {stim_name: stim_degs}

In [44]:
# Value range of X before the counts layer is copied back into it.
adata.X.max(), adata.X.min()

(12.285428, 0.0)

In [45]:
# Put the values saved in the 'counts' layer back into X, replacing the
# normalised / log-transformed values.
# NOTE(review): the 'log1p' entry in adata.uns is presumably still present
# after this step although X no longer holds logged values — confirm that
# downstream code does not rely on it.
adata.X = adata.layers['counts'].copy()

In [46]:
# Value range of X after restoring the 'counts' layer.
adata.X.max(), adata.X.min()

(2491817.0, 0.0)

In [47]:
# Build the split columns with the project helper, seeded with random_state=42.
# NOTE(review): create_split_cols is defined in preprocessing_tools (not shown
# here); presumably it adds per-covariate split columns to obs — confirm what
# it returns and whether it modifies `adata` in place.
adata_split = pt.create_split_cols(
    adata=adata, 
    cov_key=covariate_key, 
    cond_key=condition_key, 
    stim_name=stim_name,
    random_state=42
)

In [48]:
# Sequential integer id per cell. The length comes from adata_split itself, so
# the column always matches the object it is written to, even if the split
# helper returns a different number of cells than `adata` holds.
adata_split.obs['sc_cell_ids'] = list(range(adata_split.n_obs))

In [49]:
# Persist the preprocessed object to disk.
# NOTE(review): assumes the ../../preprocessed_datasets directory already exists.
adata_split.write_h5ad('../../preprocessed_datasets/tabula.h5ad')

In [50]:
# Number of cells per cell type in the final object (both species combined).
adata_split.obs['cell_type'].value_counts()

cell_type
pulmonary alveolar type 2 cell          11719
classical monocyte                       9491
intermediate monocyte                    4534
CD8-positive, alpha-beta T cell          2768
CD4-positive, alpha-beta T cell          2675
bronchial smooth muscle cell             2559
natural killer cell                      2212
B cell                                   2164
club cell                                1762
vein endothelial cell                    1656
non-classical monocyte                   1537
basophil                                 1452
adventitial cell                         1107
neutrophil                                923
pericyte                                  800
mature NK T cell                          583
endothelial cell of lymphatic vessel      356
myeloid dendritic cell                    278
plasma cell                               197
plasmacytoid dendritic cell                94
Name: count, dtype: int64

In [51]:
# Display the final object, including the obs columns added by the split helper.
adata_split

AnnData object with n_obs × n_vars = 48867 × 5000
    obs: 'donor_id', 'method', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'free_annotation', 'suspension_type', 'tissue_type', 'disease_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'specie', 'n_counts', 'cov_cond', 'split_sapiens_B cell', 'split_sapiens_CD4-positive, alpha-beta T cell', 'split_sapiens_CD8-positive, alpha-beta T cell', 'split_sapiens_adventitial cell', 'split_sapiens_basophil', 'split_sapiens_bronchial smooth muscle cell', 'split_sapiens_classical monocyte', 'split_sapiens_club cell', 'split_sapiens_endothelial cell of lymphatic vessel', 'split_sapiens_intermediate monocyte', 'split_sapiens_mature NK T cell', 'split_sapiens_myeloid dendritic cell', 'split_sapiens_nat