In [None]:
import sys
import os

import scanpy as sc
import pandas as pd

import decoupler

# Random seed value.
# NOTE(review): the value is only stored here; none of the visible cells passes it to
# an RNG — confirm whether it is still needed or should be applied explicitly.
random_seed = 5

from pyprojroot import here

from tqdm import tqdm

import warnings
# Install a global "ignore" filter for all warning categories (no category/module restriction).
# NOTE(review): this also hides warnings that may be relevant when debugging the cells below.
warnings.filterwarnings("ignore")

## Loading data

In [None]:
# Read the MAIN object from the gene-universe results and show its summary.
main_h5ad_path = here("03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad")
MAINdata = sc.read_h5ad(main_h5ad_path)
MAINdata

In [None]:
# Read the EXTERNAL object from the gene-universe results and show its summary.
external_h5ad_path = here("03_downstream_analysis/02_gene_universe_definition/results/05_EXTERNAL_geneUniverse.log1p.h5ad")
EXTdata = sc.read_h5ad(external_h5ad_path)
EXTdata

In [None]:
# Read the VALIDATION object from the gene-universe results and show its summary.
validation_h5ad_path = here("03_downstream_analysis/02_gene_universe_definition/results/05_VALIDATION_geneUniverse.log1p.h5ad")
VALdata = sc.read_h5ad(validation_h5ad_path)
VALdata

**Loading main object and projected datasets for retrieving predicted annotations**

In [None]:
# Read the scANVI-projected EXTERNAL object; its obs table is used below to derive 'Level1pred'.
# The path is a plain string literal (it has no placeholders, so no f-string is needed).
EXTdata_projected = sc.read_h5ad(here("03_downstream_analysis/08_gene_importance/xgboost_external_validation/raw_data/scANVI_EXTERNAL_256_30_Level2_run1_finetuning.h5ad"))
EXTdata_projected

In [None]:
# Read the scANVI-projected VALIDATION object; its obs table is used below to derive 'Level1pred'.
# The path is a plain string literal (it has no placeholders, so no f-string is needed).
# NOTE(review): unlike every other input in this notebook, this file is read from the
# "inflammabucket_bkp/" tree — confirm this is the intended location.
VALdata_projected = sc.read_h5ad(here("inflammabucket_bkp/03_downstream_analysis/PatientClassifier/scANVI/results/02_query/output/scANVI_VALIDATION_256_30_Level2_run1_finetuning.h5ad"))
VALdata_projected

In [None]:
# Open the full MAIN object in read-only backed mode; only its obs table is used in the cells below.
main_full_h5ad_path = here("03_downstream_analysis/02_gene_universe_definition/results/04_MAIN_geneUniverse.h5ad")
MAINdataFULL = sc.read_h5ad(main_full_h5ad_path, backed='r')
MAINdataFULL

**Loading gene list**

In [None]:
# Load the pickled gene selection and report how many entries it holds.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted location.
selected_genes_pkl = here('03_downstream_analysis/05_SPECTRA/results/SPECTRAFactor_selected_genes.pkl')
selectedGenes = pd.read_pickle(selected_genes_pkl)
len(selectedGenes)

### Transferring annotation

In [None]:
# Unique (Level1, Level2) pairs found in the full MAIN obs table.
level_columns = ['Level1','Level2']
annDF = MAINdataFULL.obs[level_columns].drop_duplicates()

In [None]:
# Lookup table Level2 -> Level1; if a Level2 value appears on several rows, the last row wins.
Lev1dict = {
    level2: level1
    for level2, level1 in zip(annDF['Level2'], annDF['Level1'])
}

In [None]:
# Translate each entry of obs['labels'] into its Level1 class via Lev1dict
# (presumably 'labels' holds the predicted Level2 annotation — TODO confirm).
# A label that is absent from Lev1dict raises KeyError.
ext_projected_labels = EXTdata_projected.obs['labels']
EXTdata_projected.obs['Level1pred'] = [Lev1dict[level2_label] for level2_label in ext_projected_labels]
EXTdata_projected.obs

In [None]:
# Translate each entry of obs['labels'] into its Level1 class via Lev1dict
# (presumably 'labels' holds the predicted Level2 annotation — TODO confirm).
# A label that is absent from Lev1dict raises KeyError.
val_projected_labels = VALdata_projected.obs['labels']
VALdata_projected.obs['Level1pred'] = [Lev1dict[level2_label] for level2_label in val_projected_labels]
VALdata_projected.obs

In [None]:
# Copy 'Level1pred' from the projected object onto EXTdata, aligned on the cell index.
# A direct column assignment (instead of an obs merge) keeps the cell idempotent: re-running
# it overwrites 'Level1pred' rather than producing 'Level1pred_x' / 'Level1pred_y' columns,
# and the number and order of rows in EXTdata.obs can never change.
ext_cells_without_prediction = EXTdata.obs_names.difference(EXTdata_projected.obs_names)
if len(ext_cells_without_prediction) > 0:
    # Fail loudly instead of silently leaving cells without a predicted annotation.
    raise ValueError(
        f"{len(ext_cells_without_prediction)} cells of EXTdata are missing from EXTdata_projected"
    )
EXTdata.obs['Level1pred'] = EXTdata_projected.obs['Level1pred'].reindex(EXTdata.obs_names)

In [None]:
# Copy 'Level1pred' from the projected object onto VALdata, aligned on the cell index.
# A direct column assignment (instead of an obs merge) keeps the cell idempotent: re-running
# it overwrites 'Level1pred' rather than producing 'Level1pred_x' / 'Level1pred_y' columns,
# and the number and order of rows in VALdata.obs can never change.
val_cells_without_prediction = VALdata.obs_names.difference(VALdata_projected.obs_names)
if len(val_cells_without_prediction) > 0:
    # Fail loudly instead of silently leaving cells without a predicted annotation.
    raise ValueError(
        f"{len(val_cells_without_prediction)} cells of VALdata are missing from VALdata_projected"
    )
VALdata.obs['Level1pred'] = VALdata_projected.obs['Level1pred'].reindex(VALdata.obs_names)

In [None]:
# Distinct disease labels present in the EXTERNAL object.
EXTdata.obs['disease'].unique().tolist()

## Filtering
**Filtering cell-types**

Here, we are removing RBC, Platelets, Cycling_cells, and Progenitors

In [None]:
# Keep every MAIN cell whose Level1 annotation is not one of the excluded classes.
main_excluded_level1 = ['Platelets','Cycling_cells','RBC','Progenitors']
main_cells_to_keep = ~MAINdata.obs.Level1.isin(main_excluded_level1)
MAINdataCellFilt = MAINdata[main_cells_to_keep]
MAINdataCellFilt

In [None]:
# Keep every EXTERNAL cell whose predicted Level1 class is not one of the excluded classes.
ext_excluded_level1 = ['Platelets','Cycling_cells','RBC','Progenitors']
ext_cells_to_keep = ~EXTdata.obs.Level1pred.isin(ext_excluded_level1)
EXTdataCellFilt = EXTdata[ext_cells_to_keep]
EXTdataCellFilt

In [None]:
# Keep every VALIDATION cell whose predicted Level1 class is not one of the excluded classes.
val_excluded_level1 = ['Platelets','Cycling_cells','RBC','Progenitors']
val_cells_to_keep = ~VALdata.obs.Level1pred.isin(val_excluded_level1)
VALdataCellFilt = VALdata[val_cells_to_keep]
VALdataCellFilt

**Filtering genes**

Here, we keep only the genes selected with SPECTRA

In [None]:
# Restrict the MAIN object to the variables listed in selectedGenes.
main_gene_mask = MAINdataCellFilt.var_names.isin(selectedGenes)
MAINdataCellFiltGeneSelected = MAINdataCellFilt[:, main_gene_mask]
MAINdataCellFiltGeneSelected

In [20]:
# Restrict the EXTERNAL object to the variables listed in selectedGenes.
ext_gene_mask = EXTdataCellFilt.var_names.isin(selectedGenes)
EXTdataCellFiltGeneSelected = EXTdataCellFilt[:, ext_gene_mask]
EXTdataCellFiltGeneSelected

View of AnnData object with n_obs × n_vars = 562456 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age', 'Level1pred'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

In [21]:
# Restrict the VALIDATION object to the variables listed in selectedGenes.
val_gene_mask = VALdataCellFilt.var_names.isin(selectedGenes)
VALdataCellFiltGeneSelected = VALdataCellFilt[:, val_gene_mask]
VALdataCellFiltGeneSelected

View of AnnData object with n_obs × n_vars = 836230 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age', 'Level1pred'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

**Removing from Reference the diseases not present in External**

Here, we focus on validating the genes selected with SHAP that are important for classifying disease. The query dataset (e.g., EXTERNAL) contains only a subset of the diseases, so we remove the remaining diseases from MAIN (the same filter is applied to VALIDATION below).

In [22]:
# Keep only the MAIN cells whose disease label also occurs in the filtered EXTERNAL object.
external_diseases_for_main = EXTdataCellFiltGeneSelected.obs.disease.unique().tolist()
main_disease_mask = MAINdataCellFiltGeneSelected.obs.disease.isin(external_diseases_for_main)
MAINdataFinal = MAINdataCellFiltGeneSelected[main_disease_mask]
MAINdataFinal

View of AnnData object with n_obs × n_vars = 3400023 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
    uns: 'log1p'

In [23]:
# Keep only the VALIDATION cells whose disease label also occurs in the filtered EXTERNAL object.
external_diseases_for_val = EXTdataCellFiltGeneSelected.obs.disease.unique().tolist()
val_disease_mask = VALdataCellFiltGeneSelected.obs.disease.isin(external_diseases_for_val)
VALdataFinal = VALdataCellFiltGeneSelected[val_disease_mask]
VALdataFinal

View of AnnData object with n_obs × n_vars = 756857 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age', 'Level1pred'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

In [24]:
# EXTERNAL needs no disease filtering (it defines the disease set used for MAIN and VALIDATION).
# This is a plain alias: both names refer to the same object, no copy is made.
EXTdataFinal = EXTdataCellFiltGeneSelected
EXTdataFinal

View of AnnData object with n_obs × n_vars = 562456 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age', 'Level1pred'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'

## Saving objects

**Split by cell type**

For xgboost training and validation, we need the datasets split by cell type

In [26]:
# Write one gzip-compressed h5ad file per Level1 cell type of MAINdataFinal.
main_celltype_dir = here("03_downstream_analysis/08_gene_importance/xgboost_external_validation/xgboost_TopN_genes/data_cellTypes")
# Create the output folder up front so the first write cannot fail on a missing directory.
main_celltype_dir.mkdir(parents=True, exist_ok=True)

for ct_i in tqdm(MAINdataFinal.obs.Level1.unique()):
    # MAINdataFinal is a view; materialise the per-cell-type subset before writing so that
    # anndata does not have to modify a view implicitly while saving.
    ct_adata = MAINdataFinal[MAINdataFinal.obs.Level1 == ct_i].copy()
    ct_adata.write(main_celltype_dir / f"MAIN_{ct_i}.filtered.log1p.h5ad", compression = 'gzip')

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [01:51<00:00, 10.16s/it]


In [27]:
# Write one gzip-compressed h5ad file per predicted Level1 cell type of EXTdataFinal.
external_celltype_dir = here("03_downstream_analysis/08_gene_importance/xgboost_external_validation/xgboost_TopN_genes/data_cellTypes")
# Create the output folder up front so the first write cannot fail on a missing directory.
external_celltype_dir.mkdir(parents=True, exist_ok=True)

for ct_i in tqdm(EXTdataFinal.obs.Level1pred.unique()):
    # EXTdataFinal is a view; materialise the per-cell-type subset before writing so that
    # anndata does not have to modify a view implicitly while saving (the source of the
    # repeated "df[key] = c" warning lines this cell used to print).
    ct_adata = EXTdataFinal[EXTdataFinal.obs.Level1pred == ct_i].copy()
    ct_adata.write(external_celltype_dir / f"EXTERNAL_{ct_i}.filtered.log1p.h5ad", compression = 'gzip')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:19<00:00,  1.79s/it]


In [28]:
# Write one gzip-compressed h5ad file per predicted Level1 cell type of VALdataFinal.
validation_celltype_dir = here("03_downstream_analysis/08_gene_importance/xgboost_external_validation/xgboost_TopN_genes/data_cellTypes")
# Create the output folder up front so the first write cannot fail on a missing directory.
validation_celltype_dir.mkdir(parents=True, exist_ok=True)

for ct_i in tqdm(VALdataFinal.obs.Level1pred.unique()):
    # VALdataFinal is a view; materialise the per-cell-type subset before writing so that
    # anndata does not have to modify a view implicitly while saving (the source of the
    # repeated "df[key] = c" warning lines this cell used to print).
    ct_adata = VALdataFinal[VALdataFinal.obs.Level1pred == ct_i].copy()
    ct_adata.write(validation_celltype_dir / f"VALIDATION_{ct_i}.filtered.log1p.h5ad", compression = 'gzip')

  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
  df[key] = c
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:24<00:00,  2.27s/it]
