In [1]:
# Parameters
# NOTE(review): presumably this cell is injected/overridden by a parameterised
# notebook runner (e.g. papermill) — confirm before editing the values by hand.
# Interpolated into the reference, training-pseudobulk and output file names
# (presumably the number of principal components — TODO confirm).
nPC = 20
# Comma-separated column name(s); split on ',' and passed as the Symphony `key`,
# and also interpolated verbatim into the file names.
batchKeys = "chemistry"
# Tag of the query dataset; selects the query input file and names the outputs.
queryDataset = "EXTERNAL"


In [2]:
# Fail fast when a required notebook parameter is missing, and echo the values
# in use so they are recorded in the executed notebook.
# The value is read directly from the notebook namespace rather than through
# eval(), which is unnecessary for a plain name lookup.
for v in ['nPC', 'batchKeys', 'queryDataset']:
    if v not in globals():
        # NameError is a subclass of Exception, so anything catching the
        # previous generic Exception still works; the message is unchanged.
        raise NameError(f"{v} not specified")
    print(f"{v} = {globals()[v]}")

nPC = 20
batchKeys = chemistry
queryDataset = EXTERNAL


In [3]:
import os
import sys

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from sklearn.metrics import balanced_accuracy_score

from pyprojroot import here

# Add the project's `bin` directory (resolved from the project root by `here`)
# to the module search path at position 1, so that the helper module imported
# on the next statement can be found.
sys.path.insert(1, str(here('bin')))
# Import custom functions
from customPythonFunctions import aggregating_features, train_patient_classifier, vote_patient_disease

# NOTE(review): random_seed is not referenced by any cell visible in this
# notebook (the classifier training call passes random_state = 25) — confirm
# whether it is still needed or should replace that hard-coded value.
random_seed = 42

### Parameters

In [4]:
# Turn the comma-separated `batchKeys` parameter into a list of column names.
batch_key_query = batchKeys.split(sep=',')
batch_key_query

['chemistry']

### Loading dataset

**Integrated object**

In [5]:
# Open the Harmony-integrated reference in read-only backed mode (backed='r'),
# so the object stays attached to the file on disk.
reference_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_{nPC}nPC_Batch_{batchKeys}.h5ad")
adataR = sc.read_h5ad(reference_h5ad, backed='r')
adataR

AnnData object with n_obs × n_vars = 4435922 × 8253 backed at '/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_20nPC_Batch_chemistry.h5ad'
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable', 'mean', 'std'
    uns: 'harmony', 'log1p', 'pca'
    obsm: 'X_pca', 'X_pca_harmony'
    varm: 'PCs'

**Query dataset**

In [6]:
# Load the query dataset selected by the `queryDataset` parameter fully into memory.
query_h5ad = here(f'03_downstream_analysis/02_gene_universe_definition/results/05_{queryDataset}_geneUniverse.h5ad')
adataQ = sc.read_h5ad(query_h5ad)
adataQ

AnnData object with n_obs × n_vars = 572872 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

### Preprocessing

In [7]:
# Normalise every query cell to a total of 10,000 and then log1p-transform;
# both calls modify adataQ in place (their return values are not used).
# NOTE(review): assumes adataQ.X holds raw counts at this point — confirm.
sc.pp.normalize_total(adataQ, target_sum=10_000.0)
sc.pp.log1p(adataQ)

### Projection with Symphony

In [8]:
# Project the query cells onto the reference with Symphony, using the columns
# in batch_key_query as the query batch key(s). The embeddings are written to
# adataQ.obsm (see the object summary printed in the next cell).
sp.tl.map_embedding(
    adataQ,
    adataR,
    key=batch_key_query,
)

In [9]:
adataQ

AnnData object with n_obs × n_vars = 572872 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'
    obsm: 'X_pca_reference', 'X_pca_harmony', 'X_pca_harmony_symphony_R'

### Label transfer

In [10]:
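# Query and reference now share a common embedded space, stored under obsm['X_pca_harmony'] in both objects; it is used as both ref_basis and query_basis for the label transfer below.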

**Level1 Annotation**

In [11]:
# kNN label transfer of the reference 'Level1' annotation to the query cells,
# with neighbours searched in the shared 'X_pca_harmony' embedding.
sp.tl.transfer_labels_kNN(
    adataQ,
    adataR,
    'Level1',
    ref_basis='X_pca_harmony',
    query_basis='X_pca_harmony',
)

**Disease**

In [12]:
# Keep the query's own disease annotation under 'disease_true', so that the
# 'disease' column name is free for the label transferred from the reference.
adataQ.obs = adataQ.obs.rename(columns={'disease': 'disease_true'})

In [13]:
# kNN label transfer of the reference 'disease' annotation to the query cells,
# with neighbours searched in the shared 'X_pca_harmony' embedding.
sp.tl.transfer_labels_kNN(
    adataQ,
    adataR,
    'disease',
    ref_basis='X_pca_harmony',
    query_basis='X_pca_harmony',
)

In [14]:
# The transferred label was written to 'disease'; store it as 'disease_pred'
# next to the original annotation ('disease_true').
adataQ.obs = adataQ.obs.rename(columns={'disease': 'disease_pred'})

In [15]:
# Cell-level balanced accuracy of the kNN-transferred disease label against
# the query's own annotation.
balanced_accuracy_score(
    y_true=adataQ.obs['disease_true'],
    y_pred=adataQ.obs['disease_pred'],
)



0.154314427796604

In [16]:
# Display the query cell metadata, which now carries the transferred 'Level1'
# label plus the 'disease_true' / 'disease_pred' pair.
adataQ.obs

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,technology,disease_true,sex,binned_age,Level1,disease_pred
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SCGT00val_L006_I036016_T0_AAACCCAAGACAACTA,SCGT00val,SCGT00val_L006,SCGT00val_I036016_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,31-40,T_CD4_Naive,HIV
SCGT00val_L006_I036023_T0_AAACCCAAGACGGAAA,SCGT00val,SCGT00val_L006,SCGT00val_I036023_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,51-60,T_CD4_NonNaive,RA
SCGT00val_L006_I036016_T0_AAACCCAAGAGCATTA,SCGT00val,SCGT00val_L006,SCGT00val_I036016_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,31-40,T_CD4_Naive,CD
SCGT00val_L006_I036021_T0_AAACCCAAGATGGTCG,SCGT00val,SCGT00val_L006,SCGT00val_I036021_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,61-70,T_CD8_NonNaive,healthy
SCGT00val_L006_I036021_T0_AAACCCAAGCAACAAT,SCGT00val,SCGT00val_L006,SCGT00val_I036021_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,61-70,DC,RA
...,...,...,...,...,...,...,...,...,...,...
10XGenomics_L008_10XHC8_T0_TTTGTCATCCACGTTC,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,B,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCCCGACTT,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCGTGGGAA,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy
10XGenomics_L008_10XHC8_T0_TTTGTCATCTCAAACG,10XGenomics,10XGenomics_L008,10XGenomics_10XHC8_T0,5_GEX_V2,5_GEX_V2,healthy,na,,T_CD4_Naive,healthy


### Generating pseudobulks from the latent space for the query (Q) dataset

In [17]:
# Aggregate (mode='mean') the query cells' 'X_pca_harmony' embedding into
# pseudobulk observations named from the 'sampleID' and 'Level1' columns;
# 'disease_true' is passed along in the observation metadata.
emb_pseudobulk_Q = aggregating_features(
    Z=adataQ.obsm['X_pca_harmony'],
    obsDF=adataQ.obs[['sampleID', 'Level1', 'disease_true']],
    mode='mean',
    obs_names_col=['sampleID', 'Level1'],
)

In [18]:
# Display the metadata of the query pseudobulk observations.
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
10XGenomics_10XHC1_T0_B,10XGenomics_10XHC1_T0,B,healthy,979
10XGenomics_10XHC1_T0_Cycling_cells,10XGenomics_10XHC1_T0,Cycling_cells,healthy,21
10XGenomics_10XHC1_T0_DC,10XGenomics_10XHC1_T0,DC,healthy,137
10XGenomics_10XHC1_T0_ILC,10XGenomics_10XHC1_T0,ILC,healthy,307
10XGenomics_10XHC1_T0_Mono,10XGenomics_10XHC1_T0,Mono,healthy,1453
...,...,...,...,...
Savage2021_PIDB_T0_T_CD4_NonNaive,Savage2021_PIDB_T0,T_CD4_NonNaive,healthy,1573
Savage2021_PIDB_T0_T_CD8_Naive,Savage2021_PIDB_T0,T_CD8_Naive,healthy,383
Savage2021_PIDB_T0_T_CD8_NonNaive,Savage2021_PIDB_T0,T_CD8_NonNaive,healthy,2570
Savage2021_PIDB_T0_UTC,Savage2021_PIDB_T0,UTC,healthy,191


**Loading training pseudobulk**

In [19]:
# Load the training pseudobulk object that matches the current nPC / batchKeys parameters.
train_pseudobulk_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_PSEUDOBULK_{nPC}nPC_Batch_{batchKeys}.h5ad")
emb_pseudobulk_train = sc.read_h5ad(train_pseudobulk_h5ad)

#### Remove unwanted cell types

In [20]:
# Drop the training pseudobulks whose Level1 cell type is in the exclusion list.
excluded_train = emb_pseudobulk_train.obs['Level1'].isin(['Cycling_cells', 'RBC', 'Progenitors', 'Platelets'])
emb_pseudobulk_train = emb_pseudobulk_train[~excluded_train, :]

In [21]:
# Drop the query pseudobulks whose Level1 cell type is in the exclusion list
# (the same list is applied to the training pseudobulks).
excluded_Q = emb_pseudobulk_Q.obs['Level1'].isin(['Cycling_cells', 'RBC', 'Progenitors', 'Platelets'])
# Subsetting an AnnData returns a view. This object is written to disk later
# in the notebook, and writing a view makes anndata silently convert it into
# an actual object while it modifies .obs. Materialise the subset with .copy()
# so that conversion is explicit and happens exactly once, here.
emb_pseudobulk_Q = emb_pseudobulk_Q[~excluded_Q, :].copy()

### Saving query dataset and pseudobulks

In [22]:
# Save the Symphony-projected query (with transferred labels) as a gzip-compressed h5ad file.
query_out_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/02_{queryDataset}_SymphonyProjected_{nPC}nPC_Batch_{batchKeys}.h5ad")
adataQ.write(query_out_h5ad, compression='gzip')

In [23]:
# Save the filtered query pseudobulks as a gzip-compressed h5ad file.
# NOTE(review): this file name has no underscore between 'PSEUDOBULK' and the
# nPC value, unlike the training pseudobulk file read earlier
# ('..._PSEUDOBULK_{nPC}nPC_...'). Left unchanged here — confirm which name
# downstream steps expect before harmonising it.
query_pseudobulk_out_h5ad = here(f"03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/02_{queryDataset}_SymphonyProjected_PSEUDOBULK{nPC}nPC_Batch_{batchKeys}.h5ad")
emb_pseudobulk_Q.write(query_pseudobulk_out_h5ad, compression='gzip')

  df[key] = c
  df[key] = c
  df[key] = c


### Training per-cell-type classifiers (k-nearest neighbours)

In [24]:
# Settings forwarded to the KNeighborsClassifier models: 2 neighbours,
# uniform weights, cosine distance, all available cores.
knn_settings = {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'cosine', 'n_jobs': -1}

# Train the disease classifiers on the training pseudobulks; the result is a
# dict keyed by Level1 cell type (see the output of the next cell).
# NOTE(review): random_state is hard-coded to 25 while the notebook defines
# random_seed = 42 in the import cell — confirm which seed is intended.
# NOTE(review): the meaning of max_iter=-1 for this model is defined inside
# train_patient_classifier and is not visible here — TODO confirm.
clfList = train_patient_classifier(
    adataTrain=emb_pseudobulk_train,
    cell_type_col='Level1',
    y_true_col='disease',
    max_iter=-1,
    random_state=25,
    model='KNeighborsClassifier',
    kargs_model=knn_settings,
)

  0%|          | 0/11 [00:00<?, ?it/s]

 18%|█▊        | 2/11 [00:00<00:00,  9.67it/s]

 36%|███▋      | 4/11 [00:00<00:01,  4.11it/s]

 55%|█████▍    | 6/11 [00:00<00:00,  6.44it/s]

 73%|███████▎  | 8/11 [00:01<00:00,  8.74it/s]

 91%|█████████ | 10/11 [00:01<00:00, 10.94it/s]

100%|██████████| 11/11 [00:01<00:00,  8.84it/s]




In [25]:
# Display the trained classifiers: one entry per Level1 cell type, each holding
# the fitted model ('clf'), its 'bAcc' score and the observation count ('nObs').
clfList

{'B': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.893909913186229,
  'nObs': 814},
 'DC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8484375597011713,
  'nObs': 801},
 'ILC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8243337202566261,
  'nObs': 817},
 'Mono': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8433445704146223,
  'nObs': 817},
 'Plasma': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.811216097575144,
  'nObs': 727},
 'T_CD4_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.880433455562023,
  'nObs': 814},
 'T_CD4_NonNaive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8790256347104011,
  'nObs': 816},
 'T_CD8_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bA

### Predicting diseases

In [26]:
# Display the query pseudobulk metadata after the unwanted cell types were removed.
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
10XGenomics_10XHC1_T0_B,10XGenomics_10XHC1_T0,B,healthy,979
10XGenomics_10XHC1_T0_DC,10XGenomics_10XHC1_T0,DC,healthy,137
10XGenomics_10XHC1_T0_ILC,10XGenomics_10XHC1_T0,ILC,healthy,307
10XGenomics_10XHC1_T0_Mono,10XGenomics_10XHC1_T0,Mono,healthy,1453
10XGenomics_10XHC1_T0_Plasma,10XGenomics_10XHC1_T0,Plasma,healthy,4
...,...,...,...,...
Savage2021_PIDB_T0_T_CD4_NonNaive,Savage2021_PIDB_T0,T_CD4_NonNaive,healthy,1573
Savage2021_PIDB_T0_T_CD8_Naive,Savage2021_PIDB_T0,T_CD8_Naive,healthy,383
Savage2021_PIDB_T0_T_CD8_NonNaive,Savage2021_PIDB_T0,T_CD8_NonNaive,healthy,2570
Savage2021_PIDB_T0_UTC,Savage2021_PIDB_T0,UTC,healthy,191


In [27]:
# Apply the per-cell-type classifiers to the query pseudobulks and combine the
# predictions into one row per sample (per-cell-type predictions plus
# first/second choice, as shown in the result table further down).
classificationDF = vote_patient_disease(
    adataTest=emb_pseudobulk_Q, clfList=clfList,
    sample_id_col='sampleID', cell_type_col='Level1',
)


  0%|          | 0/11 [00:00<?, ?it/s]

 55%|█████▍    | 6/11 [00:00<00:00, 52.99it/s]

100%|██████████| 11/11 [00:00<00:00, 52.96it/s]




  0%|          | 0/86 [00:00<?, ?it/s]

100%|██████████| 86/86 [00:00<00:00, 942.93it/s]




In [28]:
# Unique (sampleID, disease_true) pairs taken from the pseudobulk metadata.
sample_truth = emb_pseudobulk_Q.obs[['sampleID', 'disease_true']].drop_duplicates()
# Left-join the true label onto the per-sample predictions, then index by sample.
res = pd.merge(classificationDF, sample_truth, how='left', on='sampleID').set_index('sampleID')


In [29]:
# Sample-level balanced accuracy of the top-voted disease against the true label.
balanced_accuracy_score(res['disease_true'], res['firstChoice'])



0.18903186274509803

In [30]:
# Display the per-sample results: per-cell-type predictions, first/second
# choice with their percentages, and the true disease label.
res

Unnamed: 0_level_0,B_prediction,DC_prediction,ILC_prediction,Mono_prediction,Plasma_prediction,T_CD4_Naive_prediction,T_CD4_NonNaive_prediction,T_CD8_Naive_prediction,T_CD8_NonNaive_prediction,UTC_prediction,pDC_prediction,firstChoice,firstChoice_perc,secondChoice,secondChoice_perc,disease_true
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
10XGenomics_10XHC1_T0,NPC,healthy,HNSCC,HNSCC,NPC,NPC,NPC,NPC,healthy,NPC,healthy,NPC,46.153846,healthy,23.076923,healthy
10XGenomics_10XHC2_T0,NPC,healthy,NPC,HNSCC,COVID,NPC,NPC,NPC,healthy,HBV,healthy,NPC,38.461538,healthy,23.076923,healthy
10XGenomics_10XHC3_T0,NPC,healthy,healthy,SLE,COVID,NPC,HBV,NPC,healthy,NPC,healthy,NPC,30.769231,healthy,30.769231,healthy
10XGenomics_10XHC4_T0,NPC,SLE,healthy,SLE,SLE,healthy,healthy,healthy,NPC,NPC,healthy,healthy,38.461538,NPC,23.076923,healthy
10XGenomics_10XHC5_T0,HNSCC,SLE,COPD,SLE,SLE,healthy,healthy,healthy,healthy,healthy,healthy,healthy,46.153846,SLE,23.076923,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Savage2021_BRISL5_T0,HNSCC,SLE,HNSCC,SLE,HNSCC,healthy,NPC,NPC,COVID,HNSCC,COVID,HNSCC,30.769231,SLE,15.384615,SLE
Savage2021_BRISL6_T0,HNSCC,flu,HNSCC,healthy,HNSCC,HNSCC,HNSCC,healthy,healthy,HNSCC,healthy,HNSCC,46.153846,healthy,30.769231,SLE
Savage2021_BRISL7_T0,HNSCC,healthy,COVID,HNSCC,healthy,NPC,NPC,NPC,COVID,NPC,HNSCC,NPC,30.769231,HNSCC,23.076923,SLE
Savage2021_PIDA_T0,HIV,healthy,HIV,healthy,healthy,healthy,healthy,HIV,healthy,NPC,healthy,healthy,53.846154,HIV,23.076923,healthy
