In [1]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

from glob import glob

from scipy.sparse import csr_matrix

from pyprojroot import here

### Loading data

**Selected genes from SPECTRA factors**

In [8]:
# Load the pickled collection of genes selected from the SPECTRA factors and
# display it (the recorded output shows a list of 'ENSG...' identifiers).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
selectedGenesPath = here('03_downstream_analysis/05_SPECTRA/results/SPECTRAFactor_selected_genes.pkl')
selectedGenes = pd.read_pickle(selectedGenesPath)
selectedGenes

['ENSG00000001167',
 'ENSG00000002549',
 'ENSG00000002586',
 'ENSG00000004468',
 'ENSG00000005339',
 'ENSG00000005844',
 'ENSG00000006125',
 'ENSG00000006210',
 'ENSG00000006327',
 'ENSG00000007264',
 'ENSG00000008394',
 'ENSG00000008517',
 'ENSG00000009790',
 'ENSG00000010327',
 'ENSG00000011422',
 'ENSG00000011590',
 'ENSG00000011600',
 'ENSG00000013297',
 'ENSG00000015475',
 'ENSG00000018280',
 'ENSG00000019169',
 'ENSG00000019582',
 'ENSG00000025708',
 'ENSG00000026025',
 'ENSG00000026103',
 'ENSG00000026297',
 'ENSG00000027697',
 'ENSG00000028137',
 'ENSG00000030110',
 'ENSG00000030582',
 'ENSG00000033800',
 'ENSG00000035115',
 'ENSG00000036448',
 'ENSG00000038427',
 'ENSG00000039068',
 'ENSG00000042753',
 'ENSG00000042980',
 'ENSG00000048462',
 'ENSG00000049130',
 'ENSG00000049249',
 'ENSG00000051108',
 'ENSG00000051382',
 'ENSG00000051523',
 'ENSG00000056736',
 'ENSG00000057657',
 'ENSG00000059728',
 'ENSG00000060982',
 'ENSG00000065665',
 'ENSG00000065978',
 'ENSG00000066136',


**ADATA object for each cell type**

In [13]:
# Collect every per-cell-type file matching '*_adataMerged.log1p.h5ad' in the
# merged-adata results folder.
# glob() returns matches in arbitrary (filesystem-dependent) order, so sort the
# paths to make the processing order reproducible between runs and machines.
adataPATHpattern = here('03_downstream_analysis/04_integration_with_annotation/results/normalized_adatas_nextflow/cellType_adata_merged/*_adataMerged.log1p.h5ad')
adataPATHlist = sorted(glob(str(adataPATHpattern)))
len(adataPATHlist)

13

In [14]:
# Keep only the paths that contain neither 'Progenitors' nor 'Cycling_cell'
# (plain substring match on the whole path).
adataPATHlist = [
    path_i
    for path_i in adataPATHlist
    if all(tag not in path_i for tag in ['Progenitors', 'Cycling_cell'])
]
len(adataPATHlist)

11

### Processing each object

For each cell-type object, we will:

1. load the merged object generated by `get_normalized_expression` with the scANVI model;
2. subset the object to keep only the genes selected from SPECTRA factors (935 in the run shown below);
3. convert the resulting matrix into a sparse format;
4. write the object to disk so it can be used as input for the Gradient Boosting Machine and SHAP analysis.

In [15]:
# Output folder for the gene-subset objects; create it up front so the first
# write does not fail on a fresh checkout.
outDIR = here('03_downstream_analysis/08_gene_importance/data')
os.makedirs(outDIR, exist_ok=True)

for FP_i in adataPATHlist:
    # Cell-type label = file name up to the '_adataMerged' suffix.
    ct_i = os.path.basename(FP_i).split('_adataMerged')[0]
    print(f"Reading {ct_i} ... ", end='')
    # Backed mode keeps X on disk, so only the selected columns are pulled
    # into memory when the subset's X is accessed below.
    adata_backed_i = sc.read_h5ad(FP_i, backed = 'r')
    try:
        print("done", end='\n')
        # Keep only the genes selected from the SPECTRA factors.
        adata_i = adata_backed_i[:,selectedGenes]
        print("Convert to sparse ... ", end='')
        # Build a new in-memory object whose X is a CSR matrix, carrying over
        # obs, var and uns from the subset.
        adata_sparse_i = sc.AnnData(X = csr_matrix(adata_i.X),
                                    obs = adata_i.obs,
                                    var = adata_i.var,
                                    uns = adata_i.uns)
        print("done", end='\n')
        print(adata_sparse_i, end ='\n')
        adata_sparse_i.write(os.path.join(outDIR, f'{ct_i}_adataMerged_SPECTRAgenes.log1p.h5ad'))
    finally:
        # Release the HDF5 handle of the backed file; otherwise one handle
        # stays open per processed cell type.
        adata_backed_i.file.close()
    print(f"{ct_i} adata wrote", end ='\n\n')

Reading T_CD4_NonNaive ... done
Convert to sparse ... done
AnnData object with n_obs × n_vars = 712248 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    uns: 'log1p'
T_CD4_NonNaive adata wrote

Reading T_CD8_NonNaive ... done
Convert to sparse ... done
AnnData object with n_obs × n_vars = 523322 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    uns: 'log1p'
T_CD8_NonNaive adata wrote

Reading T_CD4_Naive ... done
Convert to sparse ... done
AnnData object with n_obs × n_vars = 792955 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    uns: 'log1p'
T_CD4_Naive adata wrote

Reading T_CD8_Naive ... done
Convert to sparse ... done
AnnData object with n_obs × n_vars = 213776 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
  