In [1]:
# Parameters
nPC = 200
batchKeys = "chemistry"
queryDataset = "EXTERNAL"


In [2]:
for v in ['nPC','batchKeys', 'queryDataset']:
    if v in locals() or v in globals():
        print(f"{v} = {eval(v)}")
    else:
        raise Exception(f"{v} not specified")

nPC = 200
batchKeys = chemistry
queryDataset = EXTERNAL


In [3]:
import os

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from sklearn.metrics import balanced_accuracy_score

from pyprojroot import here

random_seed = 42

### Defining functions

In [4]:
def train_classifier(adataTrain = None, cell_type_col = None, y_true_col = None, max_iter=10000,random_state = 25, model = 'LinearSVC'):
    
    import pandas as pd
    from tqdm import tqdm
    from sklearn.svm import LinearSVC, SVC

    from sklearn.metrics import balanced_accuracy_score
    
    clfList = dict()
    trainAccuracy = []
    for ct_ in tqdm(adataTrain.obs[cell_type_col].unique()):
        clfList[ct_] = dict()
        X_i = adataTrain.X[adataTrain.obs[cell_type_col] == ct_]
        y_true_i = adataTrain.obs[y_true_col][adataTrain.obs[cell_type_col] == ct_]

        if mode == 'LinearSVC':
            clfList[ct_]['clf'] = LinearSVC(max_iter=max_iter, dual = True, random_state = random_state).fit(X_i, y_true_i)
        elif mode == 'SVC':
            clfList[ct_]['clf'] = SVC(max_iter=max_iter, random_state = random_state).fit(X_i, y_true_i)
            
        clfList[ct_]['bAcc'] = balanced_accuracy_score(y_true = y_true_i, y_pred = clfList[ct_]['clf'].predict(X_i))
        clfList[ct_]['nObs'] = len(y_true_i)

    return clfList
    
def vote_disease(adataTest = None, clfList = None, cell_type_col = None, sample_id_col = None):

    from tqdm import tqdm
    import pandas as pd

    
    
    classificationDF = pd.DataFrame()
    for ct_ in tqdm(adataTest.obs[cell_type_col].unique()):
        X_i = adataTest.X[adataTest.obs[cell_type_col] == ct_]
        PID_i = adataTest.obs[sample_id_col][adataTest.obs[cell_type_col] == ct_]
        if ct_ not in clfList:
            print(f"{ct_} is missing in training set")
            continue
        DF_i = pd.DataFrame.from_dict({
            sample_id_col: PID_i,
            f"{ct_}_prediction": clfList[ct_]['clf'].predict(X_i)
        })
        if classificationDF.shape[0] == 0:
            classificationDF = DF_i
        else:
            classificationDF = classificationDF.merge(DF_i, how='outer', on = sample_id_col)
    classificationDF['firstChoice'] = ''
    classificationDF['firstChoice_perc'] = np.nan
    classificationDF['secondChoice'] = ''
    classificationDF['secondChoice_perc'] = np.nan
    
    for i in tqdm(range(classificationDF.shape[0])):
        vote_i = classificationDF.loc[i,classificationDF.columns !=sample_id_col].value_counts()
        vote_i /= (vote_i.sum() / 100)
        res_i = vote_i.sort_values(ascending=False)
        classificationDF.loc[i,'firstChoice'] = res_i.index[0]
        classificationDF.loc[i,'firstChoice_perc'] = res_i.iloc[0]
        if res_i.shape[0] > 1:
            classificationDF.loc[i,'secondChoice'] = res_i.index[1]
            classificationDF.loc[i,'secondChoice_perc'] = res_i.iloc[1]

    return classificationDF
    
def aggregating_features(Z = None, obsDF = None, mode = 'mean', obs_names_col = [], min_observation = 0):
    Zdf = pd.DataFrame(Z)
    for c in obsDF.columns:
        Zdf[c] = obsDF[c].tolist()

    grpDF = Zdf.groupby(obsDF.columns.tolist(), observed = True)

    nCount = grpDF.size().to_frame('n_observation')
    
    if mode in ['mean','avarage']:
        Zaggr = grpDF.mean()
    elif mode == 'sum':
        Zaggr = grpDF.sum()
    else:
        raise ValueError(f"mode {mode} not supported. Available mode are 'mean' or 'sum'")

    grpObs = pd.DataFrame(Zaggr.index.tolist(), columns=obsDF.columns.tolist()).merge(pd.DataFrame(nCount).reset_index(), on = obsDF.columns.tolist())

    if len(obs_names_col) == 0:
        grpAdata  = sc.AnnData(X = np.array(Zaggr), obs = grpObs)
    elif all([c in obsDF.columns.tolist() for c in obs_names_col]):
        grpObs.index = grpObs[obs_names_col].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
        grpAdata  = sc.AnnData(X = np.array(Zaggr), obs = grpObs)
    else:
        raise ValueError(f"Impossible to use {obs_names_col} as index. It's not present in obsDF")

    if min_observation > 0:
        grpAdata = grpAdata[grpAdata.obs.n_observation >= min_observation]
    return grpAdata

### Parameters

In [5]:
batch_key_query = batchKeys.split(',')
batch_key_query

['chemistry']

### Loading dataset

**Integrated object**

In [6]:
adataR = sc.read_h5ad(here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_{nPC}nPC_Batch_{batchKeys}.h5ad"), backed='r')
adataR

AnnData object with n_obs × n_vars = 4435922 × 8253 backed at '/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_200nPC_Batch_chemistry.h5ad'
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable', 'mean', 'std'
    uns: 'harmony', 'log1p', 'pca'
    obsm: 'X_pca', 'X_pca_harmony'
    varm: 'PCs'

**Query dataset**

In [7]:
adataQ = sc.read_h5ad(here(f'03_downstream_analysis/02_gene_universe_definition/results/05_{queryDataset}_geneUniverse.h5ad'))
adataQ

AnnData object with n_obs × n_vars = 572872 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

### Preprocessing

In [8]:
sc.pp.normalize_total(adataQ, target_sum=1e4)
sc.pp.log1p(adataQ)

### Projection with Symphony

In [9]:
sp.tl.map_embedding(adataQ, adataR, key=batch_key_query)

In [10]:
adataQ

AnnData object with n_obs × n_vars = 572872 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'
    obsm: 'X_pca_reference', 'X_pca_harmony', 'X_pca_harmony_symphony_R'

### Label transfer

In [11]:
# common embedded space -> adataQ.obsm['X_pca_harmony']

**Level1 Annotation**

In [12]:
sp.tl.transfer_labels_kNN(adataQ, adataR, 'Level1', ref_basis = 'X_pca_harmony', query_basis = 'X_pca_harmony')

**Disease**

In [13]:
adataQ.obs.rename({'disease':'disease_true'}, axis=1, inplace=True)

In [14]:
sp.tl.transfer_labels_kNN(adataQ, adataR, 'disease', ref_basis = 'X_pca_harmony', query_basis = 'X_pca_harmony')

In [15]:
adataQ.obs.rename({'disease':'disease_pred'}, axis=1, inplace=True)

In [16]:
balanced_accuracy_score(y_true = adataQ.obs.disease_true, y_pred=adataQ.obs.disease_pred)



0.16848594069403083

In [17]:
adataQ.obs

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,technology,disease_true,sex,binned_age,Level1,disease_pred
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SCGT00val_L006_I036016_T0_AAACCCAAGACAACTA,SCGT00val,SCGT00val_L006,SCGT00val_I036016_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,31-40,T_CD4_Naive,HIV
SCGT00val_L006_I036023_T0_AAACCCAAGACGGAAA,SCGT00val,SCGT00val_L006,SCGT00val_I036023_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,51-60,T_CD4_NonNaive,SLE
SCGT00val_L006_I036016_T0_AAACCCAAGAGCATTA,SCGT00val,SCGT00val_L006,SCGT00val_I036016_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,31-40,T_CD4_Naive,HIV
SCGT00val_L006_I036021_T0_AAACCCAAGATGGTCG,SCGT00val,SCGT00val_L006,SCGT00val_I036021_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,61-70,T_CD8_NonNaive,SLE
SCGT00val_L006_I036021_T0_AAACCCAAGCAACAAT,SCGT00val,SCGT00val_L006,SCGT00val_I036021_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,61-70,DC,UC
...,...,...,...,...,...,...,...,...,...,...
10XGenomics_L008_10XHC8_T0_TTTGTCATCCACGTTC,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,B,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCCCGACTT,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCGTGGGAA,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCTCAAACG,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy


### Generating pseudobulk from latent space for Q dataset

In [18]:
emb_pseudobulk_Q = aggregating_features(Z = adataQ.obsm['X_pca_harmony'], obsDF = adataQ.obs[['sampleID','Level1','disease_true']], mode = 'mean', obs_names_col=['sampleID','Level1'])

In [19]:
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
10XGenomics_10XHC1_T0_B,10XGenomics_10XHC1_T0,B,healthy,977
10XGenomics_10XHC1_T0_Cycling_cells,10XGenomics_10XHC1_T0,Cycling_cells,healthy,20
10XGenomics_10XHC1_T0_DC,10XGenomics_10XHC1_T0,DC,healthy,130
10XGenomics_10XHC1_T0_ILC,10XGenomics_10XHC1_T0,ILC,healthy,317
10XGenomics_10XHC1_T0_Mono,10XGenomics_10XHC1_T0,Mono,healthy,1462
...,...,...,...,...
Savage2021_PIDB_T0_T_CD4_NonNaive,Savage2021_PIDB_T0,T_CD4_NonNaive,healthy,1799
Savage2021_PIDB_T0_T_CD8_Naive,Savage2021_PIDB_T0,T_CD8_Naive,healthy,373
Savage2021_PIDB_T0_T_CD8_NonNaive,Savage2021_PIDB_T0,T_CD8_NonNaive,healthy,2397
Savage2021_PIDB_T0_UTC,Savage2021_PIDB_T0,UTC,healthy,168


**Loading training pseudobulk**

In [20]:
emb_pseudobulk_train = sc.read_h5ad(here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_PSEUDOBULK_{nPC}nPC_Batch_{batchKeys}.h5ad"))

#### Remove unwanted cell types

In [21]:
emb_pseudobulk_train = emb_pseudobulk_train[~emb_pseudobulk_train.obs.Level1.isin(['Cycling_cells','RBC','Progenitors','Platelets']),]

In [22]:
emb_pseudobulk_Q = emb_pseudobulk_Q[~emb_pseudobulk_Q.obs.Level1.isin(['Cycling_cells','RBC','Progenitors','Platelets']),]

### Saving query dataset and pseudobulks

In [23]:
adataQ.write(here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/02_{queryDataset}_SymphonyProjected_{nPC}nPC_Batch_{batchKeys}.h5ad"), 
             compression='gzip')

In [24]:
emb_pseudobulk_Q.write(here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/02_{queryDataset}_SymphonyProjected_PSEUDOBULK{nPC}nPC_Batch_{batchKeys}.h5ad"), 
             compression='gzip')

  df[key] = c
  df[key] = c
  df[key] = c


### Training linear classifiers

In [25]:
def train_classifier(adataTrain = None, cell_type_col = None, y_true_col = None, max_iter=10000,random_state = 25, model = 'LinearSVC', kargs_model = None):
    
    import pandas as pd
    from tqdm import tqdm
    from sklearn.svm import LinearSVC, SVC
    from sklearn.neighbors import KNeighborsClassifier

    from sklearn.metrics import balanced_accuracy_score
    
    clfList = dict()
    trainAccuracy = []
    for ct_ in tqdm(adataTrain.obs[cell_type_col].unique()):
        clfList[ct_] = dict()
        X_i = adataTrain.X[adataTrain.obs[cell_type_col] == ct_]
        y_true_i = adataTrain.obs[y_true_col][adataTrain.obs[cell_type_col] == ct_]

        if model == 'LinearSVC':
            clfList[ct_]['clf'] = LinearSVC(**kargs_model).fit(X_i, y_true_i) # max_iter=max_iter, dual = True, random_state = random_state
        elif model == 'SVC':
            clfList[ct_]['clf'] = SVC(**kargs_model).fit(X_i, y_true_i) # max_iter=max_iter, random_state = random_state
        elif model == 'KNeighborsClassifier':
            clfList[ct_]['clf'] = KNeighborsClassifier(**kargs_model).fit(X_i, y_true_i) # n_neighbors = 5, weights='distance', n_jobs = -1
            
            
        clfList[ct_]['bAcc'] = balanced_accuracy_score(y_true = y_true_i, y_pred = clfList[ct_]['clf'].predict(X_i))
        clfList[ct_]['nObs'] = len(y_true_i)

    return clfList
    
def vote_disease(adataTest = None, clfList = None, cell_type_col = None, sample_id_col = None):

    from tqdm import tqdm
    import pandas as pd
    
    classificationDF = pd.DataFrame()
    for ct_ in tqdm(adataTest.obs[cell_type_col].unique()):
        X_i = adataTest.X[adataTest.obs[cell_type_col] == ct_]
        PID_i = adataTest.obs[sample_id_col][adataTest.obs[cell_type_col] == ct_]
        if ct_ not in clfList:
            print(f"{ct_} is missing in training set")
            continue
        DF_i = pd.DataFrame.from_dict({
            sample_id_col: PID_i,
            f"{ct_}_prediction": clfList[ct_]['clf'].predict(X_i)
        })
        if classificationDF.shape[0] == 0:
            classificationDF = DF_i
        else:
            classificationDF = classificationDF.merge(DF_i, how='outer', on = sample_id_col)
    classificationDF['firstChoice'] = ''
    classificationDF['firstChoice_perc'] = np.nan
    classificationDF['secondChoice'] = ''
    classificationDF['secondChoice_perc'] = np.nan
    
    for i in tqdm(range(classificationDF.shape[0])):
        vote_i = classificationDF.loc[i,classificationDF.columns !=sample_id_col].value_counts()
        vote_i /= (vote_i.sum() / 100)
        res_i = vote_i.sort_values(ascending=False)
        classificationDF.loc[i,'firstChoice'] = res_i.index[0]
        classificationDF.loc[i,'firstChoice_perc'] = res_i.iloc[0]
        if res_i.shape[0] > 1:
            classificationDF.loc[i,'secondChoice'] = res_i.index[1]
            classificationDF.loc[i,'secondChoice_perc'] = res_i.iloc[1]

    return classificationDF
    
def aggregating_features(Z = None, obsDF = None, mode = 'mean', obs_names_col = []):
    Zdf = pd.DataFrame(Z)
    for c in obsDF.columns:
        Zdf[c] = obsDF[c].tolist()
    if mode in ['mean','avarage']:
        Zaggr = Zdf.groupby(obsDF.columns.tolist(), observed = True).mean()
    elif mode == 'sum':
        Zaggr = Zdf.groupby(obsDF.columns.tolist(), observed = True).sum()
    else:
        raise ValueError(f"mode {mode} not supported. Available mode are 'mean' or 'sum'")

    grpObs = pd.DataFrame(Zaggr.index.tolist(), columns=obsDF.columns.tolist())

    if len(obs_names_col) == 0:
        grpAdata  = sc.AnnData(X = np.array(Zaggr), obs = grpObs, )
    elif all([c in obsDF.columns.tolist() for c in obs_names_col]):
        grpObs.index = grpObs[obs_names_col].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
        grpAdata  = sc.AnnData(X = np.array(Zaggr), obs = grpObs)
    else:
        raise ValueError(f"Impossible to use {obs_names_col} as index. It's not present in obsDF")
    return grpAdata

In [26]:
clfList = train_classifier(adataTrain = emb_pseudobulk_train, cell_type_col = 'Level1', y_true_col = 'disease', max_iter=-1, random_state = 25, model = 'KNeighborsClassifier', 
                           kargs_model={'n_neighbors':2, 'weights':'uniform', 'metric':'cosine','n_jobs':-1})

  0%|          | 0/11 [00:00<?, ?it/s]

 18%|█▊        | 2/11 [00:00<00:00, 11.60it/s]

 36%|███▋      | 4/11 [00:00<00:00, 12.47it/s]

 55%|█████▍    | 6/11 [00:00<00:00, 11.82it/s]

 73%|███████▎  | 8/11 [00:00<00:00, 11.98it/s]

 91%|█████████ | 10/11 [00:00<00:00, 12.30it/s]

100%|██████████| 11/11 [00:00<00:00, 12.28it/s]




In [27]:
clfList

{'B': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.9122029870714081,
  'nObs': 814},
 'DC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8973075162049755,
  'nObs': 801},
 'ILC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8920130294917173,
  'nObs': 817},
 'Mono': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.885445668497003,
  'nObs': 817},
 'Plasma': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8165221332468878,
  'nObs': 727},
 'T_CD4_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.9304880618928062,
  'nObs': 814},
 'T_CD4_NonNaive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.9376140213968345,
  'nObs': 816},
 'T_CD8_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  '

### Predicting diseases

In [28]:
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
10XGenomics_10XHC1_T0_B,10XGenomics_10XHC1_T0,B,healthy,977
10XGenomics_10XHC1_T0_DC,10XGenomics_10XHC1_T0,DC,healthy,130
10XGenomics_10XHC1_T0_ILC,10XGenomics_10XHC1_T0,ILC,healthy,317
10XGenomics_10XHC1_T0_Mono,10XGenomics_10XHC1_T0,Mono,healthy,1462
10XGenomics_10XHC1_T0_Plasma,10XGenomics_10XHC1_T0,Plasma,healthy,4
...,...,...,...,...
Savage2021_PIDB_T0_T_CD4_NonNaive,Savage2021_PIDB_T0,T_CD4_NonNaive,healthy,1799
Savage2021_PIDB_T0_T_CD8_Naive,Savage2021_PIDB_T0,T_CD8_Naive,healthy,373
Savage2021_PIDB_T0_T_CD8_NonNaive,Savage2021_PIDB_T0,T_CD8_NonNaive,healthy,2397
Savage2021_PIDB_T0_UTC,Savage2021_PIDB_T0,UTC,healthy,168


In [29]:
classificationDF = vote_disease(adataTest = emb_pseudobulk_Q, 
                                clfList = clfList, 
                                sample_id_col = 'sampleID',
                                cell_type_col = 'Level1')


  0%|          | 0/11 [00:00<?, ?it/s]

 36%|███▋      | 4/11 [00:00<00:00, 35.39it/s]

 82%|████████▏ | 9/11 [00:00<00:00, 41.09it/s]

100%|██████████| 11/11 [00:00<00:00, 38.51it/s]




  0%|          | 0/86 [00:00<?, ?it/s]

 90%|████████▉ | 77/86 [00:00<00:00, 766.69it/s]

100%|██████████| 86/86 [00:00<00:00, 762.18it/s]




In [30]:
res = classificationDF.merge(emb_pseudobulk_Q.obs[['sampleID', 'disease_true']].drop_duplicates(), how = 'left', on = 'sampleID').set_index('sampleID')


In [31]:
balanced_accuracy_score(y_true = res.disease_true, y_pred=res.firstChoice)



0.23314950980392157

In [32]:
res

Unnamed: 0_level_0,B_prediction,DC_prediction,ILC_prediction,Mono_prediction,Plasma_prediction,T_CD4_Naive_prediction,T_CD4_NonNaive_prediction,T_CD8_Naive_prediction,T_CD8_NonNaive_prediction,UTC_prediction,pDC_prediction,firstChoice,firstChoice_perc,secondChoice,secondChoice_perc,disease_true
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10XGenomics_10XHC1_T0,NPC,healthy,healthy,SLE,NPC,healthy,healthy,NPC,healthy,healthy,healthy,healthy,53.846154,NPC,23.076923,healthy
10XGenomics_10XHC2_T0,NPC,healthy,NPC,healthy,COPD,healthy,healthy,NPC,healthy,healthy,healthy,healthy,53.846154,NPC,23.076923,healthy
10XGenomics_10XHC3_T0,healthy,healthy,healthy,HIV,SLE,healthy,healthy,NPC,NPC,healthy,healthy,healthy,53.846154,NPC,15.384615,healthy
10XGenomics_10XHC4_T0,SLE,healthy,healthy,SLE,SLE,healthy,healthy,healthy,healthy,healthy,healthy,healthy,61.538462,SLE,23.076923,healthy
10XGenomics_10XHC5_T0,SLE,healthy,healthy,SLE,SLE,healthy,healthy,healthy,healthy,healthy,healthy,healthy,61.538462,SLE,23.076923,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Savage2021_BRISL5_T0,SLE,SLE,HNSCC,SLE,COVID,SLE,COVID,SLE,COVID,SLE,healthy,SLE,46.153846,COVID,23.076923,SLE
Savage2021_BRISL6_T0,CD,healthy,HNSCC,healthy,COVID,healthy,healthy,healthy,healthy,healthy,healthy,healthy,61.538462,,15.384615,SLE
Savage2021_BRISL7_T0,SLE,healthy,NPC,HNSCC,SLE,healthy,healthy,healthy,COVID,COVID,healthy,healthy,38.461538,SLE,15.384615,SLE
Savage2021_PIDA_T0,healthy,healthy,healthy,healthy,COVID,healthy,healthy,HIV,healthy,healthy,healthy,healthy,69.230769,,15.384615,healthy
