In [1]:
# Parameters
# Number of principal components; interpolated into the input/output file names used below.
nPC = 20
# Comma-separated obs column name(s); split into a list and passed as the batch key(s) for the query mapping.
batchKeys = "chemistry"
# Name of the query dataset; selects which query .h5ad file is loaded and names the result files.
queryDataset = "VALIDATION"


In [2]:
# Fail fast when a required notebook parameter is missing, and echo every
# value so that it is recorded in the executed notebook.
# The value is read directly from the notebook namespace rather than through
# eval(): a plain name lookup does not need to evaluate a string as code.
for param_name in ('nPC', 'batchKeys', 'queryDataset'):
    if param_name not in globals():
        raise Exception(f"{param_name} not specified")
    print(f"{param_name} = {globals()[param_name]}")

nPC = 20
batchKeys = chemistry
queryDataset = VALIDATION


In [3]:
import os
import sys

import scanpy as sc
import pandas as pd
import numpy as np

import symphonypy as sp

from sklearn.metrics import balanced_accuracy_score

from pyprojroot import here

sys.path.insert(1, str(here('bin')))
# Import custom functions
from customPythonFunctions import aggregating_features, train_patient_classifier, vote_patient_disease

# Seed value for stochastic steps.
# NOTE(review): this name is not referenced anywhere in the visible notebook; the
# classifier-training call passes a hard-coded random_state = 25 instead — confirm which is intended.
random_seed = 42

### Parameters

In [4]:
# `batchKeys` is a comma-separated string of obs column names. Whitespace
# around each entry is stripped so that a value such as "chemistry, studyID"
# still yields valid column names for the mapping step.
batch_key_query = [key.strip() for key in batchKeys.split(',')]
batch_key_query

['chemistry']

### Loading dataset

**Integrated object**

In [5]:
# Harmony-integrated reference object, opened in backed read-only mode (backed='r').
reference_h5ad = here(
    "03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/"
    f"01_MAIN_HarmonyIntegrated_{nPC}nPC_Batch_{batchKeys}.h5ad"
)
adataR = sc.read_h5ad(reference_h5ad, backed='r')
adataR

AnnData object with n_obs × n_vars = 4435922 × 8253 backed at '/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/01_MAIN_HarmonyIntegrated_20nPC_Batch_chemistry.h5ad'
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable', 'mean', 'std'
    uns: 'harmony', 'log1p', 'pca'
    obsm: 'X_pca', 'X_pca_harmony'
    varm: 'PCs'

**Query dataset**

In [6]:
# Query dataset restricted to the gene universe; the file is selected by `queryDataset`.
query_h5ad = here(
    "03_downstream_analysis/02_gene_universe_definition/results/"
    f"05_{queryDataset}_geneUniverse.h5ad"
)
adataQ = sc.read_h5ad(query_h5ad)
adataQ

AnnData object with n_obs × n_vars = 849922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'

### Preprocessing

In [7]:
# Normalise every cell to a total of 1e4 counts, then apply log1p.
# NOTE(review): assumes the reference was preprocessed the same way — TODO confirm.
COUNTS_PER_CELL = 1e4
sc.pp.normalize_total(adataQ, target_sum=COUNTS_PER_CELL)
sc.pp.log1p(adataQ)

### Projection with Symphony

In [8]:
# Map the query cells onto the reference with Symphony, using the parsed batch key(s).
# After this call adataQ.obsm contains 'X_pca_reference', 'X_pca_harmony' and
# 'X_pca_harmony_symphony_R' (see the output of the following cell).
sp.tl.map_embedding(adataQ, adataR, key=batch_key_query)

In [9]:
# Inspect the query object after the mapping step.
adataQ

AnnData object with n_obs × n_vars = 849922 × 8253
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'technology', 'disease', 'sex', 'binned_age'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status'
    uns: 'log1p'
    obsm: 'X_pca_reference', 'X_pca_harmony', 'X_pca_harmony_symphony_R'

### Label transfer

In [10]:
# The common embedded space used for label transfer is stored in obsm['X_pca_harmony'] of both adataQ (query) and adataR (reference).

**Level1 Annotation**

In [11]:
# Transfer the 'Level1' annotation from the reference to the query with kNN,
# using the same embedding key on both sides.
shared_basis = 'X_pca_harmony'
sp.tl.transfer_labels_kNN(
    adataQ,
    adataR,
    'Level1',
    ref_basis=shared_basis,
    query_basis=shared_basis,
)

**Disease**

In [12]:
# Keep the ground-truth label under a separate name before a 'disease' label
# is transferred from the reference into the query.
adataQ.obs.rename(columns={'disease': 'disease_true'}, inplace=True)

In [13]:
# Transfer the 'disease' label from the reference to the query with kNN,
# using the same embedding key on both sides.
disease_basis = 'X_pca_harmony'
sp.tl.transfer_labels_kNN(
    adataQ,
    adataR,
    'disease',
    ref_basis=disease_basis,
    query_basis=disease_basis,
)

In [14]:
# The transferred label becomes the cell-level prediction column.
adataQ.obs.rename(columns={'disease': 'disease_pred'}, inplace=True)

In [15]:
# Balanced accuracy of the transferred disease label, computed per cell.
cell_level_bacc = balanced_accuracy_score(
    y_true=adataQ.obs['disease_true'],
    y_pred=adataQ.obs['disease_pred'],
)
cell_level_bacc



0.38627400896008024

In [16]:
# Inspect the per-cell metadata, now including 'Level1', 'disease_true' and 'disease_pred'.
adataQ.obs

Unnamed: 0_level_0,studyID,libraryID,sampleID,chemistry,technology,disease_true,sex,binned_age,Level1,disease_pred
cellID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SCGT00_L051_I51.3P_T0_AAACCCACAATCAGCT,SCGT00,SCGT00_L051,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,41-50,ILC,RA
SCGT00_L051_I51.3P_T0_AAACCCAGTACAATAG,SCGT00,SCGT00_L051,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,41-50,Mono,SLE
SCGT00_L051_I51.3P_T0_AAACGAACAACAGATA,SCGT00,SCGT00_L051,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,41-50,ILC,RA
SCGT00_L051_I51.3P_T0_AAACGAAGTGATACCT,SCGT00,SCGT00_L051,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,41-50,ILC,RA
SCGT00_L051_I51.3P_T0_AAACGAAGTTCTAACG,SCGT00,SCGT00_L051,SCGT00_I51.3P_T0,3_GEX_V3,3_GEX_V3_GenoHashed,RA,female,41-50,T_CD4_NonNaive,UC
...,...,...,...,...,...,...,...,...,...,...
Ren2021_L065_PS052_T0_TTTGTCAAGTGTACCT,Ren2021,Ren2021_L065,Ren2021_PS052_T0,5_GEX_V2,5_GEX_V2,COVID,female,>80,T_CD4_Naive,COVID
Ren2021_L065_PS052_T0_TTTGTCACAGGACCCT,Ren2021,Ren2021_L065,Ren2021_PS052_T0,5_GEX_V2,5_GEX_V2,COVID,female,>80,T_CD4_Naive,COVID
Ren2021_L065_PS052_T0_TTTGTCAGTCCGTCAG,Ren2021,Ren2021_L065,Ren2021_PS052_T0,5_GEX_V2,5_GEX_V2,COVID,female,>80,T_CD4_Naive,COVID
Ren2021_L065_PS052_T0_TTTGTCAGTCGCGTGT,Ren2021,Ren2021_L065,Ren2021_PS052_T0,5_GEX_V2,5_GEX_V2,COVID,female,>80,Mono,COVID


### Generating pseudobulk from latent space for Q dataset

In [17]:
# Aggregate the query embedding per (sampleID, Level1) pair with the project
# helper; mode='mean' presumably averages the cells of each group — confirm
# against customPythonFunctions.aggregating_features.
pseudobulk_obs = adataQ.obs[['sampleID', 'Level1', 'disease_true']]
emb_pseudobulk_Q = aggregating_features(
    Z=adataQ.obsm['X_pca_harmony'],
    obsDF=pseudobulk_obs,
    mode='mean',
    obs_names_col=['sampleID', 'Level1'],
)

In [18]:
# Inspect the pseudobulk metadata (one row per sampleID / Level1 combination).
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
COMBAT2022_G05073_T0_B,COMBAT2022_G05073_T0,B,COVID,519
COMBAT2022_G05073_T0_Cycling_cells,COMBAT2022_G05073_T0,Cycling_cells,COVID,58
COMBAT2022_G05073_T0_DC,COMBAT2022_G05073_T0,DC,COVID,108
COMBAT2022_G05073_T0_ILC,COMBAT2022_G05073_T0,ILC,COVID,337
COMBAT2022_G05073_T0_Mono,COMBAT2022_G05073_T0,Mono,COVID,1673
...,...,...,...,...
Terekhova2023_FE05_T0_T_CD4_NonNaive,Terekhova2023_FE05_T0,T_CD4_NonNaive,healthy,958
Terekhova2023_FE05_T0_T_CD8_Naive,Terekhova2023_FE05_T0,T_CD8_Naive,healthy,266
Terekhova2023_FE05_T0_T_CD8_NonNaive,Terekhova2023_FE05_T0,T_CD8_NonNaive,healthy,1216
Terekhova2023_FE05_T0_UTC,Terekhova2023_FE05_T0,UTC,healthy,134


**Loading training pseudobulk**

In [19]:
# Training pseudobulk that matches the nPC / batch-key configuration of this run.
train_pseudobulk_h5ad = here(
    "03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/"
    f"01_MAIN_HarmonyIntegrated_PSEUDOBULK_{nPC}nPC_Batch_{batchKeys}.h5ad"
)
emb_pseudobulk_train = sc.read_h5ad(train_pseudobulk_h5ad)

#### Remove unwanted cell types

In [20]:
# Drop the cell types that are excluded from patient classification.
# Subsetting an AnnData object returns a view; .copy() materialises it so that
# later steps work on an independent object instead of a view of the unfiltered data.
excluded_train_cell_types = ['Cycling_cells', 'RBC', 'Progenitors', 'Platelets']
keep_train_rows = ~emb_pseudobulk_train.obs.Level1.isin(excluded_train_cell_types)
emb_pseudobulk_train = emb_pseudobulk_train[keep_train_rows, :].copy()

In [21]:
# Drop the cell types that are excluded from patient classification.
# Subsetting returns a view; .copy() materialises it so that the object written
# to disk and used for prediction afterwards is an independent object, not a view
# (presumably the source of the implicit-modification warnings emitted when saving — confirm).
excluded_query_cell_types = ['Cycling_cells', 'RBC', 'Progenitors', 'Platelets']
keep_query_rows = ~emb_pseudobulk_Q.obs.Level1.isin(excluded_query_cell_types)
emb_pseudobulk_Q = emb_pseudobulk_Q[keep_query_rows, :].copy()

### Saving query dataset and pseudobulks

In [22]:
# Save the projected query object (with mapped embeddings and transferred labels)
# as a gzip-compressed .h5ad file.
projected_query_h5ad = here(
    "03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/"
    f"02_{queryDataset}_SymphonyProjected_{nPC}nPC_Batch_{batchKeys}.h5ad"
)
adataQ.write(projected_query_h5ad, compression='gzip')

In [23]:
# Save the filtered query pseudobulk as a gzip-compressed .h5ad file.
# NOTE(review): this file name has no underscore between "PSEUDOBULK" and the nPC
# value, unlike the training pseudobulk file ("PSEUDOBULK_{nPC}nPC") loaded in this
# notebook. The name is kept unchanged because other steps may read this path.
projected_pseudobulk_h5ad = here(
    "03_downstream_analysis/08_PatientClassifier/Harmony_Symphony/results/"
    f"02_{queryDataset}_SymphonyProjected_PSEUDOBULK{nPC}nPC_Batch_{batchKeys}.h5ad"
)
emb_pseudobulk_Q.write(projected_pseudobulk_h5ad, compression='gzip')

  df[key] = c
  df[key] = c
  df[key] = c


### Training per-cell-type classifiers (k-nearest neighbours)

In [24]:
# Train one classifier per 'Level1' cell type on the training pseudobulk,
# using 2-nearest-neighbour models with cosine distance.
# NOTE(review): random_state is hard-coded to 25 although `random_seed = 42` is
# defined in the import cell — confirm which value is intended.
knn_settings = {'n_neighbors': 2, 'weights': 'uniform', 'metric': 'cosine', 'n_jobs': -1}
clfList = train_patient_classifier(
    adataTrain=emb_pseudobulk_train,
    cell_type_col='Level1',
    y_true_col='disease',
    max_iter=-1,
    random_state=25,
    model='KNeighborsClassifier',
    kargs_model=knn_settings,
)

  0%|          | 0/11 [00:00<?, ?it/s]

  9%|▉         | 1/11 [00:00<00:01,  8.59it/s]

 27%|██▋       | 3/11 [00:00<00:01,  7.25it/s]

 45%|████▌     | 5/11 [00:00<00:00,  9.95it/s]

 64%|██████▎   | 7/11 [00:00<00:00, 11.62it/s]

 82%|████████▏ | 9/11 [00:00<00:00, 12.61it/s]

100%|██████████| 11/11 [00:00<00:00, 13.43it/s]

100%|██████████| 11/11 [00:00<00:00, 11.72it/s]




In [25]:
# Inspect the trained classifiers: one entry per cell type with the fitted model ('clf'),
# its balanced accuracy ('bAcc') and the number of observations ('nObs').
clfList

{'B': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.893909913186229,
  'nObs': 814},
 'DC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8484375597011713,
  'nObs': 801},
 'ILC': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8243337202566261,
  'nObs': 817},
 'Mono': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8433445704146223,
  'nObs': 817},
 'Plasma': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.811216097575144,
  'nObs': 727},
 'T_CD4_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.880433455562023,
  'nObs': 814},
 'T_CD4_NonNaive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bAcc': 0.8790256347104011,
  'nObs': 816},
 'T_CD8_Naive': {'clf': KNeighborsClassifier(metric='cosine', n_jobs=-1, n_neighbors=2),
  'bA

### Predicting diseases

In [26]:
# Inspect the query pseudobulk metadata after the unwanted cell types were removed.
emb_pseudobulk_Q.obs

Unnamed: 0,sampleID,Level1,disease_true,n_observation
COMBAT2022_G05073_T0_B,COMBAT2022_G05073_T0,B,COVID,519
COMBAT2022_G05073_T0_DC,COMBAT2022_G05073_T0,DC,COVID,108
COMBAT2022_G05073_T0_ILC,COMBAT2022_G05073_T0,ILC,COVID,337
COMBAT2022_G05073_T0_Mono,COMBAT2022_G05073_T0,Mono,COVID,1673
COMBAT2022_G05073_T0_Plasma,COMBAT2022_G05073_T0,Plasma,COVID,19
...,...,...,...,...
Terekhova2023_FE05_T0_T_CD4_NonNaive,Terekhova2023_FE05_T0,T_CD4_NonNaive,healthy,958
Terekhova2023_FE05_T0_T_CD8_Naive,Terekhova2023_FE05_T0,T_CD8_Naive,healthy,266
Terekhova2023_FE05_T0_T_CD8_NonNaive,Terekhova2023_FE05_T0,T_CD8_NonNaive,healthy,1216
Terekhova2023_FE05_T0_UTC,Terekhova2023_FE05_T0,UTC,healthy,134


In [27]:
# Predict a disease per cell type for every query sample and combine the
# per-cell-type predictions into a vote per sample (project helper).
voting_inputs = dict(
    adataTest=emb_pseudobulk_Q,
    clfList=clfList,
    sample_id_col='sampleID',
    cell_type_col='Level1',
)
classificationDF = vote_patient_disease(**voting_inputs)


  0%|          | 0/11 [00:00<?, ?it/s]

 36%|███▋      | 4/11 [00:00<00:00, 37.35it/s]

 73%|███████▎  | 8/11 [00:00<00:00, 37.51it/s]

100%|██████████| 11/11 [00:00<00:00, 37.46it/s]




  0%|          | 0/144 [00:00<?, ?it/s]

 72%|███████▏  | 104/144 [00:00<00:00, 1039.28it/s]

100%|██████████| 144/144 [00:00<00:00, 1040.13it/s]




In [28]:
# Attach the true disease label to every sample-level prediction.
# validate='many_to_one' makes the merge raise if a sampleID maps to more than
# one true label, instead of silently duplicating that sample's row.
sample_truth = emb_pseudobulk_Q.obs[['sampleID', 'disease_true']].drop_duplicates()
res = (
    classificationDF
    .merge(sample_truth, how='left', on='sampleID', validate='many_to_one')
    .set_index('sampleID')
)


In [29]:
# Balanced accuracy of the winning vote ('firstChoice'), computed per sample.
patient_level_bacc = balanced_accuracy_score(
    y_true=res['disease_true'],
    y_pred=res['firstChoice'],
)
patient_level_bacc



0.6176742919389979

In [30]:
# Inspect the per-sample table: per-cell-type predictions, first/second choice with
# their percentages, and the true disease label.
res

Unnamed: 0_level_0,B_prediction,DC_prediction,ILC_prediction,Mono_prediction,Plasma_prediction,T_CD4_Naive_prediction,T_CD4_NonNaive_prediction,T_CD8_Naive_prediction,T_CD8_NonNaive_prediction,UTC_prediction,pDC_prediction,firstChoice,firstChoice_perc,secondChoice,secondChoice_perc,disease_true
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
COMBAT2022_G05073_T0,PS,SLE,COVID,COVID,healthy,COVID,PSA,COVID,SLE,PSA,healthy,COVID,30.769231,SLE,15.384615,COVID
COMBAT2022_G05097_T0,CD,COVID,healthy,COVID,healthy,healthy,healthy,SLE,SLE,healthy,COVID,healthy,38.461538,COVID,23.076923,COVID
COMBAT2022_G05145_T0,healthy,healthy,COVID,healthy,SLE,COVID,COVID,COVID,COVID,COVID,healthy,COVID,46.153846,healthy,30.769231,COVID
COMBAT2022_N00021_T0,healthy,healthy,SLE,sepsis,SLE,flu,healthy,RA,healthy,SLE,COVID,healthy,30.769231,SLE,23.076923,sepsis
COMBAT2022_N00025_T0,SLE,,COVID,SLE,COVID,healthy,CRC,healthy,sepsis,sepsis,,SLE,18.181818,COVID,18.181818,sepsis
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Terekhova2023_E19_T0,healthy,healthy,COPD,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,76.923077,,15.384615,healthy
Terekhova2023_FA07_T0,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,CRC,healthy,76.923077,,15.384615,healthy
Terekhova2023_FC04_T0,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,84.615385,,15.384615,healthy
Terekhova2023_FD01_T0,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,healthy,84.615385,,15.384615,healthy
