In [1]:

import numpy as np
import scanpy as sc
import pandas as pd

from pyprojroot.here import here


# Load data

### Selected gene list

In [4]:
# Load the selected gene table (indexed by Ensembl gene ID, as shown in the output below).
# NOTE: pickle files can execute arbitrary code on load; only read files produced by this project.
selectedGeneListPath = here('03_downstream_analysis/02_gene_universe_definition/results/04_selected_gene_list.pkl')
SelectedGeneList = pd.read_pickle(selectedGeneListPath)
SelectedGeneList

Unnamed: 0_level_0,hgnc_id,symbol,locus_group,HUGO_status
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSG00000000003,HGNC:11858,TSPAN6,protein_coding,official
ENSG00000000457,HGNC:19285,SCYL3,protein_coding,official
ENSG00000000938,HGNC:3697,FGR,protein_coding,official
ENSG00000000971,HGNC:4883,CFH,protein_coding,official
ENSG00000001036,HGNC:4008,FUCA2,protein_coding,official
...,...,...,...,...
ENSG00000278817,,ENSG00000278817,protein_coding,non-official
ENSG00000278828,HGNC:4775,H3C10,protein_coding,official
ENSG00000280670,HGNC:27003,CCDC163,protein_coding,official
ENSG00000280789,HGNC:28707,PAGR1,protein_coding,official


## Datasets

Here we will prepare the datasets to run the integration steps by:

- Keeping only genes in our defined universe
- Removing unnecessary observation (`obs`) metadata columns

**Excluded**

In [3]:
# Read the "excluded" dataset (the *_afterQC.h5ad file) from its project-root-relative path.
excludedInputPath = here('00_data_processing/results/02_INFLAMMATION_excluded_afterQC.h5ad')
adataExcluded = sc.read_h5ad(excludedInputPath)

In [4]:
# Restrict the dataset to the genes present in the selected gene universe.
# The explicit .copy() turns the sliced view into an independent AnnData, so the
# .obs/.var assignments below no longer rely on anndata's implicit
# "initializing view as actual" copy (which emits an ImplicitModificationWarning).
geneInUniverse = adataExcluded.var.index.isin(SelectedGeneList.index)
adataExcludedFilt = adataExcluded[:, geneInUniverse].copy()

# Keep only the listed cell-level (obs) and gene-level (var) annotation columns.
obsColumnsToKeep = ['studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age']
varColumnsToKeep = ['hgnc_id', 'symbol', 'locus_group', 'HUGO_status']
adataExcludedFilt.obs = adataExcludedFilt.obs[obsColumnsToKeep]
adataExcludedFilt.var = adataExcludedFilt.var[varColumnsToKeep]
adataExcludedFilt

AnnData object with n_obs × n_vars = 1357922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'chemistry_colors'

In [5]:
# Drop the unstructured annotations (the summary above lists only 'chemistry_colors')
# so they are not written to the output files.
del adataExcludedFilt.uns

In [6]:
# Display the object summary to confirm that `uns` is no longer listed.
adataExcludedFilt

AnnData object with n_obs × n_vars = 1357922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [7]:
# Save the filtered object (before normalization) as a gzip-compressed h5ad file.
excludedOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_EXCLUDED_geneUniverse.h5ad')
adataExcludedFilt.write(excludedOutputPath, compression='gzip')

*log1p-normalization*

In [7]:
# Normalize each cell to a total of 1e4, then apply the log1p transform.
# Both calls modify adataExcludedFilt in place, so this cell is not idempotent:
# re-running it would transform the already-transformed values again.
targetSum = 1e4
sc.pp.normalize_total(adataExcludedFilt, target_sum=targetSum)
sc.pp.log1p(adataExcludedFilt)

In [8]:
# Save the normalized, log1p-transformed object under the `.log1p.h5ad` name.
excludedLog1pOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_EXCLUDED_geneUniverse.log1p.h5ad')
adataExcludedFilt.write(excludedLog1pOutputPath, compression='gzip')

In [9]:
# Remove the filtered object from the namespace now that both files are written.
# NOTE(review): the unfiltered `adataExcluded` is still referenced after this point.
del adataExcludedFilt

**External**

In [5]:
# Read the "external" dataset (the *_afterQC.h5ad file) from its project-root-relative path.
externalInputPath = here('00_data_processing/results/02_INFLAMMATION_external_afterQC.h5ad')
adataExternal = sc.read_h5ad(externalInputPath)

In [11]:
# List the distinct disease labels present in the external dataset.
externalDiseases = adataExternal.obs['disease'].unique()
externalDiseases.tolist()

['RA', 'healthy', 'COVID', 'HIV', 'cirrhosis', 'CD', 'SLE', 'sepsis']

In [None]:
# Restrict the dataset to the genes present in the selected gene universe.
# The explicit .copy() turns the sliced view into an independent AnnData, so the
# .obs/.var assignments below no longer rely on anndata's implicit
# "initializing view as actual" copy (which emits an ImplicitModificationWarning).
geneInUniverse = adataExternal.var.index.isin(SelectedGeneList.index)
adataExternalFilt = adataExternal[:, geneInUniverse].copy()

# Keep only the listed cell-level (obs) and gene-level (var) annotation columns.
obsColumnsToKeep = ['studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age']
varColumnsToKeep = ['hgnc_id', 'symbol', 'locus_group', 'HUGO_status']
adataExternalFilt.obs = adataExternalFilt.obs[obsColumnsToKeep]
adataExternalFilt.var = adataExternalFilt.var[varColumnsToKeep]
adataExternalFilt

In [12]:
# Drop the unstructured annotations so they are not written to the output files.
del adataExternalFilt.uns

In [13]:
# Display the object summary (dimensions plus the remaining obs/var columns).
adataExternalFilt

AnnData object with n_obs × n_vars = 572872 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [None]:
# Save the filtered object (before normalization) as a gzip-compressed h5ad file.
externalOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_EXTERNAL_geneUniverse.h5ad')
adataExternalFilt.write(externalOutputPath, compression='gzip')

*log1p-normalization*

In [14]:
# Normalize each cell to a total of 1e4, then apply the log1p transform.
# Both calls modify adataExternalFilt in place, so this cell is not idempotent:
# re-running it would transform the already-transformed values again.
targetSum = 1e4
sc.pp.normalize_total(adataExternalFilt, target_sum=targetSum)
sc.pp.log1p(adataExternalFilt)

In [15]:
# Save the normalized, log1p-transformed object under the `.log1p.h5ad` name.
externalLog1pOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_EXTERNAL_geneUniverse.log1p.h5ad')
adataExternalFilt.write(externalLog1pOutputPath, compression='gzip')

In [16]:
# Remove the filtered object from the namespace now that both files are written.
# NOTE(review): the unfiltered `adataExternal` is still referenced after this point.
del adataExternalFilt

**Validation**

In [8]:
# Read the "validation" dataset (the *_afterQC.h5ad file) from its project-root-relative path.
validationInputPath = here('00_data_processing/results/02_INFLAMMATION_validation_afterQC.h5ad')
adataValidation = sc.read_h5ad(validationInputPath)

In [9]:
# List the distinct disease labels present in the validation dataset.
validationDiseases = adataValidation.obs['disease'].unique()
validationDiseases.tolist()

['RA', 'PSA', 'CD', 'PS', 'HNSCC', 'healthy', 'SLE', 'COVID', 'sepsis']

In [18]:
# Restrict the dataset to the genes present in the selected gene universe.
# The explicit .copy() turns the sliced view into an independent AnnData, so the
# .obs/.var assignments below no longer rely on anndata's implicit
# "initializing view as actual" copy (which emits an ImplicitModificationWarning).
geneInUniverse = adataValidation.var.index.isin(SelectedGeneList.index)
adataValidationFilt = adataValidation[:, geneInUniverse].copy()

# Keep only the listed cell-level (obs) and gene-level (var) annotation columns.
obsColumnsToKeep = ['studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age']
varColumnsToKeep = ['hgnc_id', 'symbol', 'locus_group', 'HUGO_status']
adataValidationFilt.obs = adataValidationFilt.obs[obsColumnsToKeep]
adataValidationFilt.var = adataValidationFilt.var[varColumnsToKeep]

In [19]:
# Drop the unstructured annotations so they are not written to the output files.
del adataValidationFilt.uns

In [20]:
# Display the object summary (dimensions plus the remaining obs/var columns).
adataValidationFilt

AnnData object with n_obs × n_vars = 849922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

In [19]:
# Save the filtered object (before normalization) as a gzip-compressed h5ad file.
validationOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_VALIDATION_geneUniverse.h5ad')
adataValidationFilt.write(validationOutputPath, compression='gzip')

*log1p-normalization*

In [21]:
# Normalize each cell to a total of 1e4, then apply the log1p transform.
# Both calls modify adataValidationFilt in place, so this cell is not idempotent:
# re-running it would transform the already-transformed values again.
targetSum = 1e4
sc.pp.normalize_total(adataValidationFilt, target_sum=targetSum)
sc.pp.log1p(adataValidationFilt)

In [23]:
# Save the normalized, log1p-transformed object under the `.log1p.h5ad` name.
validationLog1pOutputPath = here('03_downstream_analysis/02_gene_universe_definition/results/05_VALIDATION_geneUniverse.log1p.h5ad')
adataValidationFilt.write(validationLog1pOutputPath, compression='gzip')