In [None]:
# Fail fast if the run parameters are missing: both names must already exist
# in the notebook namespace before this cell runs (they are not defined in
# this notebook itself). Each value is echoed so it is visible in the output.
for param_name in ('nPC', 'batchKeys'):
    if param_name not in globals():
        raise Exception(f"{param_name} not specified")
    print(f"{param_name} = {globals()[param_name]}")

In [1]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from pyprojroot import here

# Fixed seed, passed to the Harmony integration step further down.
random_seed = 42

### Functions

In [None]:
def aggregating_features(Z = None, obsDF = None, mode = 'mean', obs_names_col = None):
    """
    Aggregate the rows of a feature matrix per group and return the result as an AnnData.

    Parameters
    ----------
    Z : array-like
        Feature matrix with one row per observation (anything ``pd.DataFrame`` accepts).
    obsDF : pandas.DataFrame
        Grouping variables, one row per row of ``Z``. Rows are matched by position,
        not by index. Every column of ``obsDF`` is used as a grouping key.
    mode : str
        ``'mean'`` (aliases: ``'average'`` and the legacy spelling ``'avarage'``) or ``'sum'``.
    obs_names_col : list of str, str or None
        Column(s) of ``obsDF`` whose values are joined with ``'_'`` to build the obs
        index of the result. ``None`` or an empty list keeps the default index.

    Returns
    -------
    AnnData
        One observation per observed group; ``X`` holds the aggregated features and
        ``obs`` the group keys.

    Raises
    ------
    ValueError
        If ``Z`` or ``obsDF`` is missing, their row counts differ, ``mode`` is not
        supported, or ``obs_names_col`` names a column that is not in ``obsDF``.
    """
    if Z is None or obsDF is None:
        raise ValueError("Both Z and obsDF must be provided")

    if mode in ('mean', 'average', 'avarage'):
        use_mean = True
    elif mode == 'sum':
        use_mean = False
    else:
        raise ValueError(f"mode {mode} not supported. Available modes are 'mean' or 'sum'")

    group_cols = obsDF.columns.tolist()

    # Normalise obs_names_col to a fresh list (no shared mutable default) and
    # validate it before doing any aggregation work.
    if obs_names_col is None:
        name_cols = []
    elif isinstance(obs_names_col, str):
        name_cols = [obs_names_col]
    else:
        name_cols = list(obs_names_col)
    if any(c not in group_cols for c in name_cols):
        raise ValueError(f"Impossible to use {obs_names_col} as index. It's not present in obsDF")

    Zdf = pd.DataFrame(Z)
    if len(Zdf) != len(obsDF):
        raise ValueError(f"Z has {len(Zdf)} rows but obsDF has {len(obsDF)} rows")

    # Attach the grouping keys positionally (.tolist() drops the obsDF index so
    # pandas does not try to align on it).
    for c in group_cols:
        Zdf[c] = obsDF[c].tolist()

    grouped = Zdf.groupby(group_cols, observed = True)
    Zaggr = grouped.mean() if use_mean else grouped.sum()

    # Rebuild the group keys as regular columns, one row per aggregated group.
    grpObs = pd.DataFrame(Zaggr.index.tolist(), columns=group_cols)

    if name_cols:
        grpObs.index = grpObs[name_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

    return sc.AnnData(X = Zaggr.to_numpy(), obs = grpObs)

### Parameters

In [None]:
# batchKeys is a comma-separated string; the resulting list is used as the
# `key` argument of the Harmony integration below. Surrounding whitespace and
# empty entries (e.g. from "a, b" or a trailing comma) are dropped so they do
# not end up as bogus keys.
batch_key_ref = [key.strip() for key in batchKeys.split(',') if key.strip()]
if not batch_key_ref:
    raise ValueError("batchKeys must contain at least one batch key")
batch_key_ref

### Loading reference data

In [None]:
# Read the reference dataset; the path is resolved through pyprojroot's here().
reference_h5ad = here('03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse.h5ad')
adataR = sc.read_h5ad(reference_h5ad)
adataR

### Preprocessing

In [None]:
# Total-count normalisation (target_sum=1e4) followed by log1p.
# Neither return value is captured, so both calls are relied on to modify
# adataR in place; re-running this cell would therefore transform the data
# a second time.
sc.pp.normalize_total(adataR, target_sum=1e4)
sc.pp.log1p(adataR)

In [None]:
# Scaling step, following the symphonypy usage tutorial
# (https://symphonypy.readthedocs.io/en/latest/usage.html).
# The result is not assigned, so adataR is expected to be modified in place.
sc.pp.scale(adataR)

In [None]:
# PCA with nPC components (nPC is a run parameter checked in the first cell).
# zero_center=False: the data were already passed through sc.pp.scale above.
# NOTE(review): `use_highly_variable` is reported as deprecated in recent scanpy
# releases in favour of `mask_var` - confirm against the pinned scanpy version.
sc.pp.pca(adataR, 
          n_comps=nPC, 
          use_highly_variable = False,
          zero_center=False)

### Integration with Harmony

In [None]:
# Harmony batch correction over the keys in batch_key_ref. The corrected
# embedding is read back later from adataR.obsm['X_pca_harmony'].
# max_iter_harmony is forwarded as a Harmony keyword argument; 100 is only an
# upper bound, convergence is expected before it is reached.
sp.pp.harmony_integrate(
    adataR,
    key=batch_key_ref,
    verbose=True,
    random_seed=random_seed,
    max_iter_harmony=100,
)

### Save integrated object

In [None]:
# Save the Harmony-integrated reference. The results directory is created
# first so that a missing folder cannot make the write fail after the
# integration has already been computed.
integrated_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_{nPC}nPC_Batch_{batchKeys}.h5ad")
integrated_h5ad.parent.mkdir(parents=True, exist_ok=True)
adataR.write(integrated_h5ad, compression='gzip')

### Generating pseudobulk from latent space

In [None]:
# Pseudobulk in latent space: average the Harmony embedding over every
# (sampleID, Level1, disease) group. Observation names are built from
# sampleID and Level1 only.
# NOTE(review): names are unique only if disease is constant within each
# sampleID/Level1 pair - confirm for this dataset.
pseudobulk_groups = adataR.obs[['sampleID','Level1','disease']]
emb_pseudobulk_train = aggregating_features(
    Z = adataR.obsm['X_pca_harmony'],
    obsDF = pseudobulk_groups,
    mode = 'mean',
    obs_names_col = ['sampleID','Level1'],
)

In [None]:
# Save the latent-space pseudobulk object. The results directory is created
# first so that a missing folder cannot make the write fail.
pseudobulk_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_PSEUDOBULK_{nPC}nPC_Batch_{batchKeys}.h5ad")
pseudobulk_h5ad.parent.mkdir(parents=True, exist_ok=True)
emb_pseudobulk_train.write(pseudobulk_h5ad, compression='gzip')