In [1]:
import os
import sys
from glob import glob

import scanpy as sc
import pandas as pd

import pynndescent
import numpy as np
import numba

from pyprojroot import here

# Make the project's 'bin' directory (resolved from the project root with here())
# importable. It is inserted at index 1, so the existing sys.path[0] entry stays first.
sys.path.insert(1, str(here('bin')))
# Import custom functions
# NOTE(review): customPythonFunctions presumably lives in that 'bin' directory — confirm.
from customPythonFunctions import aggregating_features

### Loading data

In [2]:
# Locations of the two scANVI latent-space AnnData files, resolved from the project root.
# NOTE(review): these reads use the prefix '09_patient_classifier/...', whereas the save
# cells at the end of the notebook write under '03_downstream_analysis/09_patient_classifier/...'.
# Confirm which prefix matches the here() root before re-running.
main_latent_path = here('09_patient_classifier/SCGT00_CentralizedDataset/results_batches/scANVI_SCGT00_MAIN_batches_latent.h5ad')
external_latent_path = here('09_patient_classifier/SCGT00_CentralizedDataset/results_batches/scANVI_SCGT00_EXTERNAL_batches_latent.h5ad')

adataR = sc.read_h5ad(main_latent_path)
adataQ = sc.read_h5ad(external_latent_path)

# Move the 'labels' column to 'Level1_pred': pop() removes 'labels' from obs and the
# assignment stores the same values under the new name (appended as the last column).
# NOTE(review): 'labels' presumably holds the scANVI-predicted cell types — confirm.
adataQ.obs['Level1_pred'] = adataQ.obs.pop('labels')

# Display both objects as the cell output
adataR, adataQ

(AnnData object with n_obs × n_vars = 756120 × 30
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'batches', 'Level1', '_scvi_batch', '_scvi_labels',
 AnnData object with n_obs × n_vars = 379359 × 30
     obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'patientID', 'disease', 'timepoint_replicate', 'treatmentStatus', 'therapyResponse', 'sex', 'age', 'BMI', 'binned_age', 'diseaseStatus', 'smokingStatus', 'ethnicity', 'institute', 'diseaseGroup', 'batches', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'total_counts_plt', 'log1p_total_counts_plt', 'pct_counts_plt', '_scvi_batch', 'Level1', '_scvi_labels', 'Level1_pred')

In [3]:
# Fraction of observations per 'Level1' label in adataR
# (normalize=True makes value_counts return proportions instead of raw counts).
level1_labels_R = adataR.obs['Level1']
level1_labels_R.value_counts(normalize=True)

Level1
T_CD4_NonNaive    0.210553
Mono              0.173771
T_CD4_Naive       0.162826
T_CD8_NonNaive    0.123335
ILC               0.075446
B                 0.073889
Platelets         0.069396
T_CD8_Naive       0.053887
UTC               0.030974
DC                0.013807
pDC               0.004133
Cycling           0.003625
Progenitors       0.002269
Plasma            0.001829
RBC               0.000259
Name: proportion, dtype: float64

In [4]:
# Fraction of observations per 'Level1_pred' label in adataQ
# (normalize=True makes value_counts return proportions instead of raw counts).
level1_pred_labels_Q = adataQ.obs['Level1_pred']
level1_pred_labels_Q.value_counts(normalize=True)

Level1_pred
Mono              0.208096
T_CD4_NonNaive    0.206783
T_CD4_Naive       0.180117
T_CD8_NonNaive    0.106791
ILC               0.083754
B                 0.078854
T_CD8_Naive       0.061807
UTC               0.028469
Platelets         0.023645
DC                0.012569
pDC               0.003962
Progenitors       0.001977
Cycling           0.001792
Plasma            0.001384
Name: proportion, dtype: float64

### Generating pseudobulks

In [5]:
# Per-observation metadata handed to the aggregation helper. 'disease' is passed along
# but is not one of the columns listed in obs_names_col.
obs_R = adataR.obs[['sampleID', 'batches', 'Level1', 'disease']]

# Build the pseudobulk object for adataR.
# NOTE(review): aggregating_features is defined in customPythonFunctions (not visible here);
# with mode='mean' it presumably averages the rows of Z within each
# (sampleID, batches, Level1) combination, and min_observation=0 presumably disables
# filtering of small groups — confirm against the helper.
adataPB_R = aggregating_features(
    Z=adataR.X,
    obsDF=obs_R,
    mode='mean',
    obs_names_col=['sampleID', 'batches', 'Level1'],
    min_observation=0,
)
adataPB_R

AnnData object with n_obs × n_vars = 2186 × 30
    obs: 'sampleID', 'batches', 'Level1', 'disease', 'n_observation'

**Query dataset: pseudobulks are built from the scANVI-predicted labels (`Level1_pred`)**

In [6]:
# Per-observation metadata for adataQ, using the 'Level1_pred' column as the cell-type label.
obs_Q = adataQ.obs[['sampleID', 'batches', 'Level1_pred', 'disease']]

# Build the pseudobulk object for adataQ with the same settings used for adataR.
# NOTE(review): aggregating_features is defined in customPythonFunctions (not visible here);
# with mode='mean' it presumably averages the rows of Z within each
# (sampleID, batches, Level1_pred) combination — confirm against the helper.
adataPB_Q_scANVI = aggregating_features(
    Z=adataQ.X,
    obsDF=obs_Q,
    mode='mean',
    obs_names_col=['sampleID', 'batches', 'Level1_pred'],
    min_observation=0,
)

# Rename the label column so that it is called 'Level1', as in adataPB_R.obs
adataPB_Q_scANVI.obs.rename(columns={'Level1_pred': 'Level1'}, inplace=True)
adataPB_Q_scANVI

AnnData object with n_obs × n_vars = 776 × 30
    obs: 'sampleID', 'batches', 'Level1', 'disease', 'n_observation'

In [7]:
# Number of pseudobulk rows per 'Level1' label in adataPB_Q_scANVI
pseudobulk_labels_Q = adataPB_Q_scANVI.obs['Level1']
pseudobulk_labels_Q.value_counts()

Level1
B                 56
DC                56
ILC               56
Mono              56
Platelets         56
Progenitors       56
T_CD4_Naive       56
T_CD4_NonNaive    56
T_CD8_Naive       56
T_CD8_NonNaive    56
UTC               56
Plasma            54
Cycling           53
pDC               53
Name: count, dtype: int64

### Saving pseudobulk adata objects

In [None]:
# Write the adataR pseudobulk object to disk as a gzip-compressed .h5ad file.
# NOTE(review): this path starts with '03_downstream_analysis/', whereas the input files
# in the loading cell are read from '09_patient_classifier/...' directly under here() —
# confirm the output directory exists relative to the project root.
main_pseudobulk_path = here('03_downstream_analysis/09_patient_classifier/SCGT00_CentralizedDataset/results_batches/SCGT00_MAIN_pseudobulk.h5ad')
adataPB_R.write(main_pseudobulk_path, compression='gzip')

In [None]:
# Write the adataQ pseudobulk object to disk as a gzip-compressed .h5ad file.
# NOTE(review): this path starts with '03_downstream_analysis/', whereas the input files
# in the loading cell are read from '09_patient_classifier/...' directly under here() —
# confirm the output directory exists relative to the project root.
external_pseudobulk_path = here('03_downstream_analysis/09_patient_classifier/SCGT00_CentralizedDataset/results_batches/SCGT00_EXTERNAL_pseudobulk.h5ad')
adataPB_Q_scANVI.write(external_pseudobulk_path, compression='gzip')