In [1]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from pyprojroot import here

# Seed passed to the Harmony integration call further down, so the run is repeatable.
random_seed = 42

### Parameters

In [2]:
# Number of principal components requested from the PCA step.
n_PC = 30

# obs columns handed to Harmony as the batch variables to correct for.
batch_keys = [
    'chemistry',
    'libraryID',
    'studyID',
    'sex',
    'binned_age',
]

### Loading reference data

In [3]:
# Read the reference AnnData object; `here(...)` (pyprojroot) builds the path
# from the project-relative location given below.
adataR = sc.read_h5ad(
    here(
        "03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad"
    )
)

# Bare expression: show the AnnData summary as the cell output.
adataR

AnnData object with n_obs × n_vars = 4279352 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
    uns: 'log1p'

### Preprocessing

In [4]:
## The loaded data are already log-transformed, so the steps below are intentionally skipped (kept commented out for reference)
#sc.pp.normalize_total(adataR, target_sum=1e4)
#sc.pp.log1p(adataR)

In [5]:
# Preprocessing follows the symphonypy usage tutorial
# (https://symphonypy.readthedocs.io/en/latest/usage.html).
# The return value is not captured: adataR itself is scaled and reused by the PCA step below.
# NOTE(review): the call relies on scanpy's default scaling options (no max_value clipping is set here) — confirm this is intended.
sc.pp.scale(adataR)

In [6]:
# Compute the PCA embedding that Harmony will correct afterwards.
sc.pp.pca(
    adataR,
    # Number of components comes from the parameters cell.
    n_comps=n_PC,
    # Do not restrict the computation to var['highly_variable'].
    use_highly_variable=False,
    # No centering inside the PCA call; the matrix was already passed
    # through sc.pp.scale in the previous step.
    zero_center=False,
)

### Integration with Harmony

In [7]:
# Run Harmony through symphonypy, correcting for every column in `batch_keys`.
# The corrected embedding is read back from adataR.obsm['X_pca_harmony'] later on.
sp.pp.harmony_integrate(
    adataR,
    key=batch_keys,
    verbose=True,
    random_seed=random_seed,
    # Extra Harmony option (harmony_kwargs): upper bound on the number of
    # iterations; the run is expected to converge before reaching it.
    max_iter_harmony=100,
)

Harmony integration with harmonypy is preforming.


2024-06-23 15:26:49,062 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


2024-06-23 15:45:19,178 - harmonypy - INFO - sklearn.KMeans initialization complete.


2024-06-23 15:46:19,997 - harmonypy - INFO - Iteration 1 of 100


2024-06-23 18:07:38,872 - harmonypy - INFO - Iteration 2 of 100


2024-06-23 20:28:54,593 - harmonypy - INFO - Iteration 3 of 100


2024-06-23 22:49:40,704 - harmonypy - INFO - Iteration 4 of 100


2024-06-24 01:14:25,082 - harmonypy - INFO - Iteration 5 of 100


2024-06-24 03:38:01,656 - harmonypy - INFO - Iteration 6 of 100


2024-06-24 05:58:44,823 - harmonypy - INFO - Iteration 7 of 100


2024-06-24 08:18:27,043 - harmonypy - INFO - Iteration 8 of 100


2024-06-24 10:03:52,014 - harmonypy - INFO - Iteration 9 of 100


2024-06-24 11:18:42,896 - harmonypy - INFO - Iteration 10 of 100


2024-06-24 12:49:36,111 - harmonypy - INFO - Iteration 11 of 100


2024-06-24 14:03:58,267 - harmonypy - INFO - Converged after 11 iterations


### Save corrected embedded space

In [8]:
# Wrap the Harmony-corrected embedding in a lightweight AnnData object.
# `adataR.obs[[]]` selects zero columns, so only the obs index is carried over.
adataPC = sc.AnnData(
    X=adataR.obsm['X_pca_harmony'],
    obs=adataR.obs[[]],
)

# Bare expression: show the AnnData summary as the cell output.
adataPC

AnnData object with n_obs × n_vars = 4279352 × 30

In [9]:
# Persist the corrected embedding as a gzip-compressed .h5ad file under the
# project-relative results folder.
adataPC.write(
    here('03_downstream_analysis/04_integration_with_annotation/results/02_Harmony_correctedPCs_noRBCnPlat.h5ad'),
    compression='gzip',
)