In [1]:
# Parameters
# Path of the input .h5ad file holding the scPoli latent representation.
# It is passed through pyprojroot's here() and glob() in the loading cell,
# and the same path (with a PSEUDOBULKs/ folder and a filename prefix inserted)
# is reused to name the three output files written at the end of the notebook.
adataPATH = "03_downstream_analysis/08_PatientClassifier/scPoli/results/scPoli_ref_latents_EXTERNAL_2_100_100_run2_gubrri7h.h5ad"


In [2]:
# Fail fast when a required notebook parameter was not provided, and echo the
# provided values so the executed notebook records its own inputs.
for v in ['adataPATH']:
    # Read the value straight from the namespace that defines it (locals first,
    # then globals, the same precedence eval() would apply) instead of
    # eval()-ing a string.
    namespace = locals() if v in locals() else globals()
    if v not in namespace:
        # NameError is still an Exception, so existing handlers keep working.
        raise NameError(f"{v} not specified")
    print(f"{v} = {namespace[v]}")

adataPATH = 03_downstream_analysis/08_PatientClassifier/scPoli/results/scPoli_ref_latents_EXTERNAL_2_100_100_run2_gubrri7h.h5ad


In [3]:
import os
import sys
from glob import glob

import scanpy as sc
import pandas as pd

from pyprojroot import here

sys.path.insert(1, str(here('bin')))
# Import custom functions
from customPythonFunctions import aggregating_features

### Loading data

In [4]:
# Resolve the input path with pyprojroot's here() and expand it with glob();
# the rest of the notebook reads only the first match, so exactly one file
# must match.
adataPATHlist = glob(str(here(adataPATH)))
assert len(adataPATHlist) == 1, (
    f"Expected exactly one file matching {here(adataPATH)}, "
    f"found {len(adataPATHlist)}: {adataPATHlist}"
)

In [5]:
# Read the single matched .h5ad file into memory and display its summary.
adata = sc.read_h5ad(filename=adataPATHlist[0])
adata

AnnData object with n_obs × n_vars = 4435922 × 100
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    uns: 'chemistry', 'sampleID'

#### Generating PSEUDOBULKs

**Aggregating cell latent features**

In [6]:
# Build one pseudobulk observation per (sampleID, Level1) combination from the
# cell-level latent matrix. aggregating_features is a project helper imported
# from customPythonFunctions; judging by the arguments and the displayed result
# it presumably averages the latent features within each group and records the
# group size in 'n_observation' -- confirm against the helper.
adataPBcell = aggregating_features(
    Z=adata.X,
    obsDF=adata.obs[['sampleID', 'Level1', 'disease']],
    mode='mean',
    obs_names_col=['sampleID', 'Level1'],
    min_observation=0,
)
adataPBcell

AnnData object with n_obs × n_vars = 11372 × 100
    obs: 'sampleID', 'Level1', 'disease', 'n_observation'

**Extracting sample embedding**

In [7]:
# The sample-level embedding is stored in .uns['sampleID'] (displayed below as
# an AnnData with one row per sample); work on a copy so `adata` is untouched.
adataPBsample = adata.uns['sampleID'].copy()
adataPBsample.obs['sampleID'] = adataPBsample.obs.index

# Keep only the samples that are present in the cell-level pseudobulks.
# Subsetting an AnnData returns a view; .copy() materialises it so that the
# .obs assignments below act on a real object instead of relying on anndata's
# implicit copy-on-write (which emits ImplicitModificationWarning).
adataPBsample = adataPBsample[adataPBsample.obs['sampleID'].isin(adataPBcell.obs['sampleID'])].copy()

# Attach one disease label per sample (the first one found among that sample's
# pseudobulk rows), joining on the sampleID index of both frames.
adataPBsample.obs = adataPBsample.obs.merge(
    adataPBcell.obs.groupby('sampleID').agg({'disease':'first'}),
    left_index=True,
    right_index=True,
)
# Constant placeholder so the sample-level object has the same obs columns as
# the cell-level pseudobulks.
adataPBsample.obs['Level1'] = 'sample'
adataPBsample

AnnData object with n_obs × n_vars = 817 × 100
    obs: 'sampleID', 'disease', 'Level1'

**Merging cell and sample latent features**

In [8]:
# Cell-level pseudobulk features as a DataFrame, with the sample identifier
# appended as an extra column to serve as the merge key.
PBcellDF = adataPBcell.to_df().assign(sampleID=adataPBcell.obs['sampleID'])

In [9]:
# Sample-level embedding as a DataFrame, with the sample identifier appended
# as an extra column to serve as the merge key.
PBsampleDF = adataPBsample.to_df().assign(sampleID=adataPBsample.obs['sampleID'])

In [10]:
# Every pseudobulk row must find its sample embedding: a left merge would
# otherwise silently fill the sample-level features with NaN.
missingSamples = set(PBcellDF['sampleID']) - set(PBsampleDF['sampleID'])
assert not missingSamples, (
    f"No sample embedding found for sampleID(s): {sorted(map(str, missingSamples))}"
)

# Append the sample-level features to each cell-level pseudobulk row.
# how='left' keeps the rows of PBcellDF in their original order, and
# validate='many_to_one' raises if PBsampleDF contains a duplicated sampleID,
# which would otherwise duplicate pseudobulk rows. Feature columns whose names
# occur in both frames receive pandas' default '_x' / '_y' suffixes.
PBcell_sampleDF = PBcellDF.merge(PBsampleDF, on='sampleID', how='left', validate='many_to_one')

In [11]:
# Sanity check: the merged table must list the samples in exactly the same
# order as the cell-level pseudobulk observations, because the two are paired
# row by row when the combined AnnData is built.
assert (
    PBcell_sampleDF['sampleID'].tolist()
    == adataPBcell.obs['sampleID'].tolist()
)

In [12]:
# Remove the merge key so only latent features remain in the matrix.
# Re-assigning (instead of inplace=True) with errors='ignore' keeps the cell
# idempotent: running it a second time is a no-op rather than a KeyError.
PBcell_sampleDF = PBcell_sampleDF.drop(columns='sampleID', errors='ignore')

In [13]:
# The merge left PBcell_sampleDF with a fresh integer index, so X and obs would
# be paired purely by position. Re-attach the pseudobulk observation names to X
# so the pairing is explicit (set_axis raises if the row counts differ), and
# pass a copy of obs so later edits to one object cannot leak into the other.
adataPBcell_sample = sc.AnnData(
    X = PBcell_sampleDF.set_axis(adataPBcell.obs_names, axis=0),
    obs = adataPBcell.obs.copy(),
)
adataPBcell_sample

AnnData object with n_obs × n_vars = 11372 × 200
    obs: 'sampleID', 'Level1', 'disease', 'n_observation'

### Saving pseudobulk adata objects

In [14]:
# Save the cell-level pseudobulks next to the input file, inside a PSEUDOBULKs/
# folder, with 'PSEUDOBULKcell_' inserted after the 'scPoli_' filename prefix.
outPBcellPATH = here(adataPATHlist[0].replace('/scPoli_','/PSEUDOBULKs/scPoli_PSEUDOBULKcell_'))
# The output folder is not created anywhere else in this notebook; make sure it
# exists so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(outPBcellPATH), exist_ok=True)
adataPBcell.write(outPBcellPATH, compression='gzip')

In [15]:
# Save the sample-level embedding next to the input file, inside a PSEUDOBULKs/
# folder, with 'PSEUDOBULKsample_' inserted after the 'scPoli_' filename prefix.
outPBsamplePATH = here(adataPATHlist[0].replace('/scPoli_','/PSEUDOBULKs/scPoli_PSEUDOBULKsample_'))
# The output folder is not created anywhere else in this notebook; make sure it
# exists so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(outPBsamplePATH), exist_ok=True)
adataPBsample.write(outPBsamplePATH, compression='gzip')

In [16]:
# Save the combined cell + sample features next to the input file, inside a
# PSEUDOBULKs/ folder, with 'PSEUDOBULKcellSample_' inserted after the
# 'scPoli_' filename prefix.
outPBcellSamplePATH = here(adataPATHlist[0].replace('/scPoli_','/PSEUDOBULKs/scPoli_PSEUDOBULKcellSample_'))
# The output folder is not created anywhere else in this notebook; make sure it
# exists so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(outPBcellSamplePATH), exist_ok=True)
adataPBcell_sample.write(outPBcellSamplePATH, compression='gzip')