In [1]:
# Parameters
# NOTE(review): looks like a papermill-style parameter cell whose values may be
# overridden at execution time — confirm against the runner.
# nPC: number of principal components requested from sc.pp.pca (n_comps).
nPC = 50
# batchKeys: comma-separated obs column name(s); split on ',' further down and
# passed as `key` to the Harmony integration. Also embedded in output file names.
batchKeys = "chemistry"


In [2]:
# Fail early if a required notebook parameter is missing, and echo each value
# so that it is recorded in the executed notebook's output.
for v in ('nPC', 'batchKeys'):
    if v not in globals():
        # NameError is still an Exception subclass, so broad handlers keep working.
        raise NameError(f"{v} not specified")
    # Plain namespace lookup: same value as eval(v) without evaluating a string.
    print(f"{v} = {globals()[v]}")

nPC = 50
batchKeys = chemistry


In [3]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from pyprojroot import here

# Seed forwarded to sp.pp.harmony_integrate(random_seed=...) in the integration cell.
random_seed = 42

### Functions

In [4]:
def aggregating_features(Z = None, obsDF = None, mode = 'mean', obs_names_col = None):
    """Aggregate the rows of a feature matrix by groups of observations.

    Every column of ``obsDF`` is used as a grouping key; the rows of ``Z`` that
    share the same key combination are reduced to a single row.

    Parameters
    ----------
    Z : array-like
        Feature matrix with one row per observation (anything accepted by
        ``pd.DataFrame``).
    obsDF : pandas.DataFrame
        Per-observation annotations. Rows are matched to ``Z`` by position,
        not by index label.
    mode : str, default 'mean'
        ``'mean'`` (aliases ``'average'`` and the legacy spelling
        ``'avarage'``) or ``'sum'``.
    obs_names_col : str, sequence of str or None, default None
        Column(s) of ``obsDF`` whose values are joined with ``'_'`` to build
        the observation names of the result. ``None`` or an empty sequence
        keeps the default positional index.

    Returns
    -------
    AnnData
        ``X`` holds the aggregated features (one row per group) and ``obs``
        holds the grouping keys, one column per column of ``obsDF``.

    Raises
    ------
    ValueError
        If ``Z`` or ``obsDF`` is missing, ``mode`` is not supported, or
        ``obs_names_col`` names a column absent from ``obsDF``.
    """
    if Z is None or obsDF is None:
        raise ValueError("Both Z and obsDF must be provided")

    # None stands in for the former mutable default ([]); a bare string is
    # treated as a single column name instead of being iterated char by char.
    if obs_names_col is None:
        obs_names_col = []
    elif isinstance(obs_names_col, str):
        obs_names_col = [obs_names_col]
    else:
        obs_names_col = list(obs_names_col)

    group_cols = obsDF.columns.tolist()

    # Map every accepted spelling onto the groupby reduction to call.
    reducers = {'mean': 'mean', 'average': 'mean', 'avarage': 'mean', 'sum': 'sum'}
    if mode not in reducers:
        raise ValueError(f"mode {mode} not supported. Available mode are 'mean' or 'sum'")

    # Validate before the (potentially expensive) aggregation.
    if not all(c in group_cols for c in obs_names_col):
        raise ValueError(f"Impossible to use {obs_names_col} as index. It's not present in obsDF")

    Zdf = pd.DataFrame(Z)
    for c in group_cols:
        # .tolist() drops the index so the annotation is aligned by position.
        Zdf[c] = obsDF[c].tolist()

    grouped = Zdf.groupby(group_cols, observed = True)
    Zaggr = getattr(grouped, reducers[mode])()

    # One row per group, carrying the key values as regular columns.
    grpObs = pd.DataFrame(Zaggr.index.tolist(), columns=group_cols)
    if obs_names_col:
        grpObs.index = grpObs[obs_names_col].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    return sc.AnnData(X = np.array(Zaggr), obs = grpObs)

### Parameters

In [5]:
# batchKeys is a comma-separated string; turn it into the list passed as `key`
# to the Harmony integration. Whitespace around commas and empty entries
# (e.g. a trailing comma) are dropped so they cannot become bogus column names.
batch_key_ref = [key.strip() for key in batchKeys.split(',') if key.strip()]
if not batch_key_ref:
    # Fail here rather than after the expensive preprocessing and PCA steps.
    raise ValueError("batchKeys must name at least one batch column")
batch_key_ref

['chemistry']

### Loading reference data

In [6]:
# Load the reference AnnData; `here` resolves the path from the project root.
adataR = sc.read_h5ad(
    here('03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse.h5ad')
)
adataR

AnnData object with n_obs × n_vars = 4435922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'

### Preprocessing

In [7]:
# Normalize total counts to 1e4, then apply the log1p transform.
# Neither return value is kept: both calls act on adataR itself, so re-running
# this cell on the same object would apply the transforms a second time.
sc.pp.normalize_total(adataR, target_sum=1e4)
sc.pp.log1p(adataR)

In [8]:
# Scale the log-normalized matrix before PCA, following the symphonypy usage
# tutorial (https://symphonypy.readthedocs.io/en/latest/usage.html).
# NOTE(review): no max_value is passed, so scaled values are not clipped — confirm this is intended.
sc.pp.scale(adataR)

In [9]:
# PCA on the scaled matrix: nPC components, all genes of the object
# (no highly-variable subsetting) and no additional centering.
sc.pp.pca(
    adataR,
    zero_center=False,
    use_highly_variable=False,
    n_comps=nPC,
)

### Integration with Harmony

In [10]:
# Batch-correct the PCA embedding with Harmony using the requested batch key(s).
# max_iter_harmony is only an upper bound: the run is expected to converge
# well before reaching it.
sp.pp.harmony_integrate(
    adataR,
    key=batch_key_ref,
    random_seed=random_seed,
    max_iter_harmony=100,
    verbose=True,
)

Harmony integration with harmonypy is preforming.


2024-06-17 03:36:30,457 - harmonypy - INFO - Computing initial centroids with sklearn.KMeans...


2024-06-17 03:53:54,023 - harmonypy - INFO - sklearn.KMeans initialization complete.


2024-06-17 03:54:34,766 - harmonypy - INFO - Iteration 1 of 100


2024-06-17 04:57:35,308 - harmonypy - INFO - Iteration 2 of 100


2024-06-17 06:00:38,717 - harmonypy - INFO - Iteration 3 of 100


2024-06-17 07:04:11,068 - harmonypy - INFO - Converged after 3 iterations


### Save integrated object

In [11]:
# Persist the integrated object; the file name records nPC and the batch key(s).
adataR.write(
    here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_{nPC}nPC_Batch_{batchKeys}.h5ad"),
    compression='gzip',
)

### Generating pseudobulk from latent space

In [12]:
# Pseudobulk of the Harmony-corrected embedding: mean per
# (sampleID, Level1, disease) group, with observations named "<sampleID>_<Level1>".
emb_pseudobulk_train = aggregating_features(
    Z=adataR.obsm['X_pca_harmony'],
    obsDF=adataR.obs[['sampleID', 'Level1', 'disease']],
    obs_names_col=['sampleID', 'Level1'],
    mode='mean',
)

In [13]:
# Persist the pseudobulk embedding next to the integrated object.
emb_pseudobulk_train.write(
    here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_PSEUDOBULK_{nPC}nPC_Batch_{batchKeys}.h5ad"),
    compression='gzip',
)