In [1]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from pyprojroot import here

# Seed for the stochastic steps of this notebook; it is passed to the Harmony
# integration call further down so that re-runs give the same result.
random_seed = 42

### Parameters

In [None]:
# Number of principal components requested from the PCA step.
n_PC = 30

# Names passed as `key` to the Harmony integration step.
# presumably these are columns of adataR.obs used as batch covariates — verify against the input file
batch_keys = [
    "chemistry",
    "libraryID",
    "studyID",
    "sex",
    "binned_age",
]

### Loading reference data

In [None]:
# Resolve the reference dataset relative to the project root, then read it.
reference_h5ad = here("03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad")
adataR = sc.read_h5ad(reference_h5ad)

# Display the loaded object as the cell output.
adataR

### Preprocessing

In [None]:
## The loaded matrix is already normalized and log1p-transformed (see the `.log1p.h5ad` input), so the two steps below are intentionally skipped:
#sc.pp.normalize_total(adataR, target_sum=1e4)
#sc.pp.log1p(adataR)

In [None]:
# Scale the expression matrix before PCA, following the symphonypy usage tutorial
# (https://symphonypy.readthedocs.io/en/latest/usage.html).
# NOTE(review): the result is not assigned, so adataR is presumably modified in place — confirm.
# NOTE(review): no max_value clipping is passed here — confirm this is intended for this dataset.
sc.pp.scale(adataR)

In [None]:
# PCA on the scaled reference, requesting n_PC components.
# use_highly_variable=False: do not restrict the computation to a
# highly-variable-gene subset of adataR.
# zero_center=False: skip centering inside PCA (the matrix went through
# sc.pp.scale in the previous step).
sc.pp.pca(
    adataR,
    n_comps=n_PC,
    use_highly_variable=False,
    zero_center=False,
)

### Integration with Harmony

In [None]:
# Run Harmony on the reference, using batch_keys as the integration key and
# the notebook-wide seed for reproducibility.
sp.pp.harmony_integrate(
    adataR,
    key=batch_keys,
    verbose=True,
    random_seed=random_seed,
    # Extra keyword forwarded as a Harmony option: upper bound on iterations;
    # convergence is expected to be reached before this limit.
    max_iter_harmony=100,
)

### Save corrected embedded space

In [10]:
# Build a lightweight AnnData that carries only the Harmony-corrected
# embedding as X, plus the cell index (obs with no columns selected).
corrected_pcs = adataR.obsm["X_pca_harmony"]
obs_index_only = adataR.obs[[]]
adataPC = sc.AnnData(X=corrected_pcs, obs=obs_index_only)

# NOTE(review): the saved output of this cell reports 10 variables while n_PC
# is set to 30 above — the stored output looks stale; re-run to confirm.
adataPC

AnnData object with n_obs × n_vars = 4435922 × 10

In [None]:
# Persist the Harmony-corrected embedding as a gzip-compressed .h5ad file.
corrected_pcs_path = here('03_downstream_analysis/04_integration_with_annotation/results/02_Harmony_correctedPCs_noRBCnPlat.h5ad')

# Create the results directory if it does not exist yet, so the write cannot
# fail on a missing folder after the integration has already been computed.
os.makedirs(os.path.dirname(corrected_pcs_path), exist_ok=True)

adataPC.write(corrected_pcs_path, compression='gzip')