In [4]:
import os
import sys
from pyprojroot.here import here

import pandas as pd
import anndata as ad
import numpy as np
import math
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

import optuna

import joblib
import pickle
import datetime

import collections

import xgboost
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as ssp
import joblib

from tqdm.auto import tqdm

from dotenv import load_dotenv

In [5]:
# Load variables from a .env file into the environment and stop if nothing was loaded.
# The call is made outside an `assert` statement so that it still runs (and is
# still checked) when Python is started with -O, which strips assert statements
# together with the expressions they contain.
if not load_dotenv():
    raise AssertionError("load_dotenv() did not load any environment variable")

In [6]:
# Show the working directory of the kernel (shell command run through IPython's `!` escape).
!pwd

/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis


# LOAD DATASET

In [7]:
# Both files are opened in backed read mode (backed='r').
#   - adata_int_ct: the pDC object, opened only so its SPECTRA gene names can be read.
#   - adata_unint : the main gene-universe object that the following cells subset.
adata_int_ct = ad.read_h5ad(
    here(f'03_downstream_analysis/08_gene_importance/data/pDC_adataMerged_SPECTRAgenes.log1p.h5ad'),
    chunk_size=25000,
    backed='r',
)

adata_unint = ad.read_h5ad(
    here('03_downstream_analysis/08_gene_importance/data/04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad'),
    chunk_size=25000,
    backed='r',
)

In [8]:
# Integer positions of the genes in `adata_unint` whose names also occur in `adata_int_ct`.
var_names_mask = np.flatnonzero(
    adata_unint.var_names.isin(adata_int_ct.var_names)
)

In [9]:
# Rows: cells whose studyID is COMBAT2022. Columns: the gene positions selected above.
# The selection is then loaded into memory and replaces the backed object.
adata_unint = adata_unint[
    adata_unint.obs['studyID'] == 'COMBAT2022',
    var_names_mask,
].to_memory()
adata_unint.shape

(583184, 935)

#### Loading further metadata

In [10]:
# Read the pickled COMBAT2022 per-cell metadata table.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# this is only safe as long as the .pkl is produced by this project.
with open(here("01_data_processing/results/extended_obs/COMBAT2022_obs.pkl"), mode='rb') as file:
    COMBATmetadata = pickle.load(file)

# Keep only the two columns used in this analysis
COMBATmetadata = COMBATmetadata.loc[:, ["batch", "COVID_severity"]]
COMBATmetadata

Unnamed: 0_level_0,batch,COVID_severity
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1
COMBAT2022_L000_S00109_T0_AAACCTGAGAAAGTGG,gPlexA,COVID_SEV
COMBAT2022_L000_S00112_T0_AAACCTGAGCGGATCA,gPlexA,COVID_MILD
COMBAT2022_L000_G05153_T0_AAACCTGAGGACATTA,gPlexA,COVID_HCW_MILD
COMBAT2022_L000_S00005_T1_AAACCTGAGGCGACAT,gPlexA,COVID_CRIT
COMBAT2022_L000_S00061_T0_AAACCTGAGGGAACGG,gPlexA,COVID_SEV
...,...,...
COMBAT2022_L069_H00064_T0_TTTGTCAGTGGCAAAC,gPlexK,HV
COMBAT2022_L069_U00501_T0_TTTGTCAGTTACCGAT,gPlexK,Flu
COMBAT2022_L069_G05112_T0_TTTGTCATCCTCTAGC,gPlexK,COVID_HCW_MILD
COMBAT2022_L069_S00045_T0_TTTGTCATCGAGGTAG,gPlexK,COVID_SEV


In [11]:
# Attach the COMBATmetadata columns to every cell, matching on the cell ID index.

# Fail early when some cells have no metadata row: the left merge below would
# otherwise fill their metadata columns with NaN without any warning.
n_cells_without_metadata = int((~adata_unint.obs_names.isin(COMBATmetadata.index)).sum())
assert n_cells_without_metadata == 0, (
    f"{n_cells_without_metadata} cells have no entry in COMBATmetadata"
)

adata_unint.obs = (
    adata_unint.obs
    # Remove metadata columns left over from a previous run of this cell, so that
    # re-running it does not produce suffixed duplicates (batch_x / batch_y, ...).
    .drop(columns=list(COMBATmetadata.columns), errors='ignore')
    # validate='many_to_one' raises a clear MergeError if a cell ID is duplicated in
    # COMBATmetadata, instead of silently multiplying rows.
    .merge(
        COMBATmetadata,
        left_index=True,
        right_index=True,
        how='left',
        validate='many_to_one',
    )
)
adata_unint.obs.head()

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,disease,sex,binned_age,Level1,Level2,batch,COVID_severity
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
COMBAT2022_L000_S00112_T0_AAACCTGAGCGGATCA,COMBAT2022,COMBAT2022_L000,COMBAT2022_S00112_T0,5_GEX_V1,COVID,female,51-60,T_CD4_NonNaive,T_CD4_EMRA,gPlexA,COVID_MILD
COMBAT2022_L000_G05153_T0_AAACCTGAGGACATTA,COMBAT2022,COMBAT2022_L000,COMBAT2022_G05153_T0,5_GEX_V1,COVID,male,41-50,B,B_Naive,gPlexA,COVID_HCW_MILD
COMBAT2022_L000_S00061_T0_AAACCTGAGGGAACGG,COMBAT2022,COMBAT2022_L000,COMBAT2022_S00061_T0,5_GEX_V1,COVID,female,51-60,Mono,Mono_IFNresponse,gPlexA,COVID_SEV
COMBAT2022_L000_S00056_T0_AAACCTGCACATGTGT,COMBAT2022,COMBAT2022_L000,COMBAT2022_S00056_T0,5_GEX_V1,COVID,male,71-80,Mono,Mono_classical,gPlexA,COVID_SEV
COMBAT2022_L000_H00067_T0_AAACCTGCATCTATGG,COMBAT2022,COMBAT2022_L000,COMBAT2022_H00067_T0,5_GEX_V1,healthy,male,71-80,T_CD8_NonNaive,T_CD8_CM,gPlexA,HV


In [12]:
# Overwrite `disease` with the COVID severity labels, harmonising three of them:
#   HV -> healthy, COVID_HCW_MILD -> COVID_MILD, Sepsis -> sepsis.
# The column is cast to `object` rather than `str` so that missing values stay
# missing; `astype(str)` would turn them into the literal string 'nan', which
# would then become a bogus disease category.
adata_unint.obs['disease'] = (
    adata_unint.obs['COVID_severity']
    .astype(object)
    .replace({'HV': 'healthy', 'COVID_HCW_MILD': 'COVID_MILD', 'Sepsis': 'sepsis'})
    .astype('category')
)

In [13]:
# Count the distinct (batch, disease, sampleID) combinations per batch and disease.
(
    adata_unint.obs
    .loc[:, ['batch', 'disease', 'sampleID']]
    .drop_duplicates()
    .groupby(['batch', 'disease'], observed=True)
    .size()
    .reset_index(name='count')
)

Unnamed: 0,batch,disease,count
0,gPlexA,COVID_CRIT,1
1,gPlexA,COVID_MILD,3
2,gPlexA,COVID_SEV,3
3,gPlexA,Flu,1
4,gPlexA,healthy,1
5,gPlexA,sepsis,1
6,gPlexB,COVID_CRIT,1
7,gPlexB,COVID_MILD,2
8,gPlexB,COVID_SEV,4
9,gPlexB,Flu,1


### Removing sepsis

In [14]:
# Keep every cell whose disease label is not 'sepsis', then list the labels that remain.
adata_unint = adata_unint[adata_unint.obs['disease'] != 'sepsis']
adata_unint.obs['disease'].unique().tolist()

['COVID_MILD', 'COVID_SEV', 'healthy', 'Flu', 'COVID_CRIT']

### Saving one object for each cell type

In [15]:
# Write one .h5ad file per Level1 cell type, containing the cells of `adata_unint`
# that carry that label.
out_dir = here('03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/data')
# Create the output directory up front so the first write does not fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

for ct in tqdm(['T_CD4_NonNaive','B','Mono','T_CD8_NonNaive','T_CD4_Naive','UTC','ILC','T_CD8_Naive','Plasma','DC','pDC']):
    adata_ct = adata_unint[adata_unint.obs.Level1 == ct]
    adata_ct.write(out_dir / f'{ct}_COMBAT2022_spectraGenes_log1p.h5ad')

  0%|          | 0/11 [00:00<?, ?it/s]