In [1]:
# Parameters
# Run configuration. Every name below is checked for existence by the next cell and is
# reused later to build output file names and the rows of the metrics table.
integration_method = "scGen"  # label of the integration method (file names, METRICdf 'integration_method')
n_embedded = 20  # recorded in METRICdf 'n_embedded' and in file names
n_Patient_embedded = 0  # recorded in METRICdf 'n_Patient_embedded' and in file names
batch = "chemistry"  # recorded in METRICdf 'batchKeys' and in file names
annotation = "Level2"  # recorded in METRICdf 'annotation' and in file names
embSpace = "cell"  # recorded in METRICdf 'embedded_space' and in file names
# Paths of the four pseudobulk AnnData files; they are resolved with here() and glob() in the loading cell
MAINadata_path = "03_downstream_analysis/08_PatientClassifier/scGen/results/02_MAIN_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad"
VALIDATIONadata_path = "03_downstream_analysis/08_PatientClassifier/scGen/results/02_VALIDATION_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad"
EXCLUDEDadata_path = "03_downstream_analysis/08_PatientClassifier/scGen/results/02_EXCLUDED_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad"
EXTERNALadata_path = "03_downstream_analysis/08_PatientClassifier/scGen/results/02_EXTERNAL_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad"
optimization_metric = "F1"  # 'BAS' selects balanced accuracy in the objectives; any other value selects weighted F1
n_job = 32  # passed as n_job to the optimisation functions, which forward it to Optuna's n_jobs


In [2]:
# Fail fast if a required run parameter was not defined, and echo every value so the
# configuration of this run is recorded in the notebook output.
required_parameters = ['MAINadata_path','VALIDATIONadata_path','EXCLUDEDadata_path','EXTERNALadata_path','integration_method',
                       'embSpace','n_embedded','n_Patient_embedded','batch','annotation','optimization_metric', 'n_job']
for v in required_parameters:
    if v not in globals():
        raise ValueError(f"{v} is not defined")
    # Direct namespace lookup: no need to eval() a string to read a variable
    print(f"{v} = {globals()[v]}")

MAINadata_path = 03_downstream_analysis/08_PatientClassifier/scGen/results/02_MAIN_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
VALIDATIONadata_path = 03_downstream_analysis/08_PatientClassifier/scGen/results/02_VALIDATION_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
EXCLUDEDadata_path = 03_downstream_analysis/08_PatientClassifier/scGen/results/02_EXCLUDED_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
EXTERNALadata_path = 03_downstream_analysis/08_PatientClassifier/scGen/results/02_EXTERNAL_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
integration_method = scGen
embSpace = cell
n_embedded = 20
n_Patient_embedded = 0
batch = chemistry
annotation = Level2
optimization_metric = F1
n_job = 32


In [3]:
import os

from glob import glob

import optuna as op

from optuna.samplers import TPESampler, BruteForceSampler

import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

import joblib


from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, balanced_accuracy_score, matthews_corrcoef

from pyprojroot import here

import session_info

from tqdm import tqdm

In [4]:
# Set Optuna's logging verbosity to WARNING
op.logging.set_verbosity(op.logging.WARNING)

In [5]:
# Seed for the notebook; the optimisation calls below pass 42 to their random_seed argument (used for TPESampler)
random_seed = 42

In [6]:
# Root output folder; studies, models, predictions and metrics are written into sub-folders of it.
# NOTE(review): here() presumably resolves the path relative to the project root — confirm.
mainDir=here('03_downstream_analysis/08_PatientClassifier/Classifiers/results_CT_studies/')

### Defining functions

In [7]:
def _most_frequent_or_na(row):
    mode = row.mode()
    if len(mode) > 1:
        return 'nan'
    else:
        return mode.iloc[0]
            
class CellTypeDiseasePrediction():
    """Disease classifier fitted on the observations of a single cell type.

    Parameters
    ----------
    clf_params : dict
        Keyword arguments for ``classifier``. The optional key ``'scaler'``
        is consumed here (not forwarded): when truthy, features are
        standardised with ``StandardScaler`` before fitting and predicting.
        The dict passed by the caller is not modified.
    classifier : callable
        Estimator class (or factory); it is called as ``classifier(**params)``
        and the result must provide ``fit(X, y)`` and ``predict(X)``.
    cell_type : str
        Name of the cell type; used to name the prediction column
        ``disease_pred_<cell_type>``.
    y_true_col : str
        Column of ``adata.obs`` holding the true label.
    sample_id_col : str
        Column of ``adata.obs`` holding the sample identifier.
    """

    def __init__(self, clf_params, classifier, cell_type, y_true_col='disease', sample_id_col='sampleID'):
        # Private copy: popping 'scaler' below must not alter the caller's dict
        self.clf_params = dict(clf_params)
        # A missing 'scaler' key means "no scaling" rather than a KeyError
        self.use_scaler = self.clf_params.pop('scaler', False)
        self.classifier = classifier
        self.ct = cell_type
        self.y_true_col = y_true_col
        self.sample_id_col = sample_id_col
        self.scaler = None  # fitted StandardScaler, set by fit() when use_scaler is truthy
        self.clfs = None    # fitted estimator, set by fit()

    def fit(self, adataTrain):
        """Fit the (optional) scaler and the estimator on ``adataTrain``.

        Reads ``adataTrain.X`` as features and ``adataTrain.obs[y_true_col]``
        as labels. Returns ``self`` so calls can be chained.
        """
        X_train = adataTrain.X
        y_train = adataTrain.obs[self.y_true_col]

        if self.use_scaler:
            self.scaler = StandardScaler().fit(X_train)
            X_train = self.scaler.transform(X_train)

        self.clfs = self.classifier(**self.clf_params).fit(X_train, y_train)

        return self

    def predict(self, adataVal, y_true_col='disease'):
        """Predict the label of every observation in ``adataVal``.

        ``y_true_col`` is accepted for call compatibility but is not used:
        the column configured at construction time (``self.y_true_col``) is
        the one read from ``adataVal.obs``.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``sample_id_col``, with the true label column and a
            ``disease_pred_<cell_type>`` column holding the predictions.
        """
        X_val = adataVal.X

        if self.use_scaler:
            # Apply the scaling learnt on the training data
            X_val = self.scaler.transform(X_val)

        y_pred = self.clfs.predict(X_val)

        cell_type_pred = pd.DataFrame({self.sample_id_col: adataVal.obs[self.sample_id_col],
                                       self.y_true_col: adataVal.obs[self.y_true_col],
                                       f'disease_pred_{self.ct}': y_pred}).set_index(self.sample_id_col)

        return cell_type_pred

class VotingDisease():
    """Per-sample disease prediction by majority vote over cell-type-specific classifiers.

    One ``CellTypeDiseasePrediction`` is fitted per cell type listed in
    ``cell_type_studies``, each with the parameters of the best trial of its
    study merged over ``fixed_params``.

    Parameters
    ----------
    classifier : callable
        Estimator class handed to every ``CellTypeDiseasePrediction``.
    cell_type_studies : dict
        Maps cell-type name to an object exposing ``best_trial.params``
        (here: the Optuna studies produced by the optimisation functions).
    fixed_params : dict or None
        Estimator arguments shared by all cell types; ``None`` means none.
    cell_type_col, sample_id_col, y_true_col : str
        Columns of ``adata.obs`` with the cell type, the sample identifier
        and the true label.
    """

    def __init__(self, classifier = None, cell_type_studies = None, fixed_params = None, 
                 cell_type_col = 'Level1', sample_id_col = 'sampleID', y_true_col = 'disease'):
        
        self.fixed_params = fixed_params
        self.classifier = classifier
        self.ct_studies = cell_type_studies
        self.cell_type_col = cell_type_col 
        self.y_true_col = y_true_col
        self.sample_id_col = sample_id_col
        self.clfs = {}  # cell-type name -> fitted CellTypeDiseasePrediction

    def fit(self, adataR = None):
        """Fit one classifier per cell type on the matching rows of ``adataR``; returns ``self``."""
        for ct_i, study in self.ct_studies.items():
            # Use the column configured on this instance, not a notebook-level global
            adataR_ct = adataR[adataR.obs[self.cell_type_col] == ct_i]

            # Tuned parameters take precedence over the fixed ones on key clashes
            clf_params = (self.fixed_params or {}) | study.best_trial.params

            self.clfs[ct_i] = (CellTypeDiseasePrediction(clf_params=clf_params, 
                                                         classifier=self.classifier, 
                                                         cell_type = ct_i, 
                                                         sample_id_col=self.sample_id_col, 
                                                         y_true_col = self.y_true_col)
                                .fit(adataR_ct))
        return self

    def predict(self, adataQ = None):
        """Predict every sample of ``adataQ`` with each cell-type classifier and take the majority.

        Returns
        -------
        pandas.DataFrame
            Indexed by ``sample_id_col`` with one ``disease_pred_<cell type>``
            column per cell type (missing where a sample has no row for that
            cell type), ``majority_pred`` (the string 'nan' when the vote has
            no unique winner) and the true label column.
        """
        y_pred_query_list = []

        for ct_i in self.ct_studies.keys():

            adataQ_ct = adataQ[adataQ.obs[self.cell_type_col] == ct_i]
            ct_pred_df = self.clfs[ct_i].predict(adataQ_ct, y_true_col = self.y_true_col)

            # Select the prediction column by name rather than by position
            y_pred_query_list.append(ct_pred_df[f'disease_pred_{ct_i}'])

        # Outer join on the sample index: one column per cell type
        cellType_prediction_df = pd.concat(y_pred_query_list, axis=1, ignore_index=False, join='outer')
        # The vote is taken before the true label is merged in, so it only sees predictions
        cellType_prediction_df['majority_pred'] = cellType_prediction_df.apply(_most_frequent_or_na, axis=1)

        sample_labels = adataQ.obs[[self.sample_id_col, self.y_true_col]].drop_duplicates()
        cellType_prediction_df = (cellType_prediction_df.merge(sample_labels,
                                                               left_index = True, right_on = self.sample_id_col)
                                  .set_index(self.sample_id_col))

        return cellType_prediction_df

### Generating output directories

In [8]:
# Sub-folders of mainDir that receive the artefacts written later in the notebook
dir_name_list = [
    'best_models',
    'label_predicted_dataframes',
    'metrics_dataframes',
    'study_objects',
]
for dn in dir_name_list:
    output_subdir = os.path.join(mainDir, dn)
    # exist_ok: re-running the notebook must not fail on folders that are already there
    os.makedirs(output_subdir, exist_ok=True)

### Loading data

In [9]:
def _read_single_h5ad(relative_path):
    """Read the one .h5ad file matching ``relative_path`` (resolved with ``here``).

    The path is expanded with ``glob``; exactly one match is required.
    Prints the base name of the file that is read.

    Raises
    ------
    ValueError
        If zero or several files match (the matches are printed first).
    """
    matches = glob(str(here(relative_path)))
    if len(matches) != 1:
        print(matches)
        raise ValueError(f"Expected exactly one adata file for '{relative_path}', found {len(matches)}")
    print(os.path.basename(matches[0]))
    return sc.read_h5ad(matches[0])


MAINpseudobulk = _read_single_h5ad(MAINadata_path)
VALIDATIONpseudobulk = _read_single_h5ad(VALIDATIONadata_path)
EXCLUDEDpseudobulk = _read_single_h5ad(EXCLUDEDadata_path)
EXTERNALpseudobulk = _read_single_h5ad(EXTERNALadata_path)

02_MAIN_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
02_VALIDATION_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad
02_EXCLUDED_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad


02_EXTERNAL_scGenIntegrated_PSEUDOBULK_20nLT_chemistry_Level2.h5ad


In [10]:
# Renaming obs columns for consistency: 'Level1_pred' -> 'Level1' and 'disease_true' -> 'disease',
# then casting the disease labels to plain strings. The three AnnData objects are modified in place.
for ad_i in [VALIDATIONpseudobulk,EXCLUDEDpseudobulk,EXTERNALpseudobulk]:
    ad_i.obs.rename({'Level1_pred':'Level1', 'disease_true':'disease'},axis=1,inplace=True)
    ad_i.obs['disease'] = ad_i.obs['disease'].astype('str')

In [11]:
MAINpseudobulk

AnnData object with n_obs × n_vars = 11372 × 20
    obs: 'sampleID', 'disease', 'Level1', 'batch', 'studyID', 'n_observation'

In [12]:
VALIDATIONpseudobulk

AnnData object with n_obs × n_vars = 1981 × 20
    obs: 'sampleID', 'disease', 'Level1', 'batch', 'studyID', 'n_observation'

### Removing unwanted cell-types

In [13]:
# Cell types removed from every dataset before classification (single source of truth)
UNWANTED_CELL_TYPES = ['Progenitors','Cycling_cells','Platelets','RBC']


def _drop_unwanted_cell_types(adata):
    """Return a copy of ``adata`` without the rows whose 'Level1' is in UNWANTED_CELL_TYPES."""
    keep = ~adata.obs['Level1'].isin(UNWANTED_CELL_TYPES)
    return adata[keep].copy()


MAINpseudobulkFilt = _drop_unwanted_cell_types(MAINpseudobulk)
VALIDATIONpseudobulkFilt = _drop_unwanted_cell_types(VALIDATIONpseudobulk)
EXCLUDEDpseudobulkFilt = _drop_unwanted_cell_types(EXCLUDEDpseudobulk)
EXTERNALpseudobulkFilt = _drop_unwanted_cell_types(EXTERNALpseudobulk)

In [14]:
MAINpseudobulkFilt

AnnData object with n_obs × n_vars = 8816 × 20
    obs: 'sampleID', 'disease', 'Level1', 'batch', 'studyID', 'n_observation'

In [15]:
VALIDATIONpseudobulkFilt.obs.Level1.unique().tolist()

['B',
 'DC',
 'ILC',
 'Mono',
 'Plasma',
 'T_CD4_Naive',
 'T_CD4_NonNaive',
 'T_CD8_Naive',
 'T_CD8_NonNaive',
 'UTC',
 'pDC']

In [16]:
# Cast the MAIN disease labels to plain strings, as was done for the other datasets when their columns were renamed
MAINpseudobulkFilt.obs['disease'] = MAINpseudobulkFilt.obs['disease'].astype(str)

## Input data and main parameters

In [17]:
# Names of the obs columns used throughout the rest of the notebook
cell_type_col='Level1'      # cell-type annotation
y_true_col='disease'        # true disease label
sample_id_col='sampleID'    # sample identifier

**Initialize pandas dataframe to collect results**

In [18]:
# Empty results table; one row per classification method is appended further down.
# Columns: run configuration, then F1 / balanced accuracy (BAS) / Matthews correlation (MCC)
# on VALIDATION and EXTERNAL, and the average score obtained on the training data.
METRICdf = pd.DataFrame(columns=['integration_method','embedded_space','n_embedded','n_Patient_embedded','batchKeys','annotation',
                      'classification_method','VALIDATION_F1','EXTERNAL_F1','VALIDATION_BAS','EXTERNAL_BAS','VALIDATION_MCC','EXTERNAL_MCC','avg_TRAIN_F1'])
METRICdf

Unnamed: 0,integration_method,embedded_space,n_embedded,n_Patient_embedded,batchKeys,annotation,classification_method,VALIDATION_F1,EXTERNAL_F1,VALIDATION_BAS,EXTERNAL_BAS,VALIDATION_MCC,EXTERNAL_MCC,avg_TRAIN_F1


### Hyper parameter tuning and classification

The aim is to define the best classifier and its corresponding hyper-parameters using **MAIN adata** as train dataset and **VALIDATION adata** as validation.

Then, with the best model/setting for each model family (i.e., SVC, kNN, NN, GBM) we will classify samples in VALIDATION, EXCLUDED and EXTERNAL datasets

In [19]:
from sklearn.model_selection import StratifiedKFold

In [20]:
# 5-fold stratified splitter (all other settings left at their defaults)
sKf = StratifiedKFold(n_splits = 5)

In [21]:
# One row per sample of the MAIN data with its disease label (first value found within the sample).
# The optimisation functions below read this table as a global to turn fold indices into sample IDs.
Pdata = MAINpseudobulkFilt.obs.groupby(sample_id_col, observed=True).agg({y_true_col:'first'}).reset_index()

In [22]:
# Sample-level folds stratified by disease: a list of (train_idx, test_idx) pairs that are
# later used as positional indices into Pdata (via Pdata.iloc)
KfoldPatientSplit = list(sKf.split(X = Pdata[sample_id_col], y=Pdata[y_true_col]))

#### LinearSVC

In [23]:
# LinearSVC arguments that are held constant (not tuned by Optuna)
fixed_parameter_LinearSVC = ({
    'max_iter': 1000000,
    'dual': 'auto',
})

def LinearSVC_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         random_seed = 42,
                         optimization = 'F1', 
                         n_trials = 50,
                         n_job = 50):
    """Tune a LinearSVC per cell type with Optuna, scored by sample-level cross-validation.

    For every cell type found in ``adataTrain.obs[cell_type_col]`` a study is
    run whose objective is the mean, over the folds in ``KfoldPatientSplit``,
    of the chosen metric obtained by a ``CellTypeDiseasePrediction`` fitted on
    the training samples of the fold and evaluated on its test samples.

    Parameters
    ----------
    adataTrain : AnnData
        Training data (all cell types).
    KfoldPatientSplit : list of (train_idx, test_idx)
        Positional indices into the notebook-level table ``Pdata`` (one row
        per sample); ``Pdata`` must therefore be defined before calling.
    y_true_col, sample_id_col, cell_type_col : str
        Columns of ``adataTrain.obs`` with label, sample id and cell type.
    random_seed : int
        Seed of the TPE sampler.
    optimization : str
        ``'BAS'`` maximises balanced accuracy; any other value maximises the
        weighted F1 score.
    n_trials : int
        Number of Optuna trials per cell type.
    n_job : int
        Forwarded to ``study.optimize(n_jobs=...)``.

    Returns
    -------
    dict
        Cell-type name -> Optuna study.
    """
    # Fold indices -> sample IDs; independent of trial and cell type, so resolved once
    fold_samples = [(Pdata.iloc[train_idx][sample_id_col].tolist(),
                     Pdata.iloc[test_idx][sample_id_col].tolist())
                    for train_idx, test_idx in KfoldPatientSplit]

    LinearSVC_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")
        
        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        
        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
                'C': trial.suggest_float('C', 1e-3, 1e5, log=True)
            }
        
            class_params |= fixed_parameter_LinearSVC

            # Column in which CellTypeDiseasePrediction.predict stores its predictions
            pred_col = f"disease_pred_{ct_i}"
            metric_list = []
            
            for pListTrain, pListTest in fold_samples:

                in_train = adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)
                in_test = adataTrain_ct_i.obs[sample_id_col].isin(pListTest)

                # A copy of the params is passed because the wrapper consumes the 'scaler' key
                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),
                                                classifier=LinearSVC,
                                                cell_type=ct_i,
                                                y_true_col=y_true_col,
                                                sample_id_col=sample_id_col).fit(adataTrain_ct_i[in_train])
            
                y_pred_df = clf.predict(adataTrain_ct_i[in_test], y_true_col = y_true_col)
        
                if optimization == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df[pred_col])
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[pred_col], average='weighted')
        
                metric_list.append(M)
            
            # Mean over however many folds were supplied
            return sum(metric_list)/len(metric_list)
                
        # Optimize hyperparameters with Optuna
        # NOTE: with n_job != 1 the trials run in parallel, so the search may not be exactly
        # reproducible even though the sampler is seeded.
        LinearSVC_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i,sampler=TPESampler(seed=random_seed))
        LinearSVC_cell_type_studies[ct_i].optimize(objective_CT, n_trials=n_trials, show_progress_bar=True, n_jobs=n_job)

    return LinearSVC_cell_type_studies

### EXECUTE THE FUNCTION ###
# The metric is taken from the run parameter so it always matches the 'opt<metric>' tag of the saved files
LinearSVC_cell_type_studies = LinearSVC_cellTypeOptimization(adataTrain = MAINpseudobulkFilt, 
                                                               KfoldPatientSplit = KfoldPatientSplit,
                                                               y_true_col = 'disease',
                                                               sample_id_col = 'sampleID',
                                                               cell_type_col = 'Level1',
                                                               optimization = optimization_metric,
                                                               n_trials = 100, n_job = n_job, random_seed = 42)


B


  0%|          | 0/100 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[k] = df_sub[k].cat.remove_unused_categories()



DC


  0%|          | 0/100 [00:00<?, ?it/s]


ILC


  0%|          | 0/100 [00:00<?, ?it/s]


Mono


  0%|          | 0/100 [00:00<?, ?it/s]


Plasma


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD4_Naive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD4_NonNaive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD8_Naive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD8_NonNaive


  0%|          | 0/100 [00:00<?, ?it/s]


UTC


  0%|          | 0/100 [00:00<?, ?it/s]


pDC


  0%|          | 0/100 [00:00<?, ?it/s]

In [24]:
# Fit one LinearSVC per cell type on the whole MAIN data, each using the
# parameters of the best trial of its study (on top of the fixed parameters)
clf_LinearSVC_TRAINED = VotingDisease(
    classifier=LinearSVC,
    cell_type_studies=LinearSVC_cell_type_studies,
    fixed_params=fixed_parameter_LinearSVC,
    cell_type_col='Level1',
    sample_id_col='sampleID',
    y_true_col='disease',
)
# fit() returns the same object, so re-binding keeps the original name and value
clf_LinearSVC_TRAINED = clf_LinearSVC_TRAINED.fit(MAINpseudobulkFilt)

In [25]:
### Average, over cell types, of the best cross-validated score reached on the training (MAIN) data
optMetric = 0
for ct_study in LinearSVC_cell_type_studies.values():
    # best_value is the highest objective value found by the study of this cell type
    optMetric += ct_study.best_value
n_cell_types = len(LinearSVC_cell_type_studies)
f1_LinearSVC_REFERENCE = optMetric / n_cell_types
print(f"F1 on reference = {f1_LinearSVC_REFERENCE}")

F1 on reference = 0.5933463084266356


**predicting disease in VALIDATION and EXTERNAL datasets**

In [26]:
# VALIDATION — score the per-sample majority vote against the true label
# (this is the first "QUERY" line printed below)
LinearSVC_cellType_prediction_VALIDATION_df = clf_LinearSVC_TRAINED.predict(VALIDATIONpseudobulkFilt)
LinearSVC_val_true = LinearSVC_cellType_prediction_VALIDATION_df[y_true_col]
LinearSVC_val_vote = LinearSVC_cellType_prediction_VALIDATION_df['majority_pred']

f1_LinearSVC_VAL = f1_score(LinearSVC_val_true, LinearSVC_val_vote, average='weighted')
BAS_LinearSVC_VAL = balanced_accuracy_score(LinearSVC_val_true, LinearSVC_val_vote)
MCC_LinearSVC_VAL = matthews_corrcoef(LinearSVC_val_true, LinearSVC_val_vote)
print(f"QUERY: F1 = {f1_LinearSVC_VAL}, BAS = {BAS_LinearSVC_VAL}, MCC: {MCC_LinearSVC_VAL}")


# EXTERNAL — same three metrics (this is the second "QUERY" line printed below)
LinearSVC_cellType_prediction_EXTERNAL_df = clf_LinearSVC_TRAINED.predict(EXTERNALpseudobulkFilt)
LinearSVC_ext_true = LinearSVC_cellType_prediction_EXTERNAL_df[y_true_col]
LinearSVC_ext_vote = LinearSVC_cellType_prediction_EXTERNAL_df['majority_pred']

f1_LinearSVC_EXT = f1_score(LinearSVC_ext_true, LinearSVC_ext_vote, average='weighted')
BAS_LinearSVC_EXT = balanced_accuracy_score(LinearSVC_ext_true, LinearSVC_ext_vote)
MCC_LinearSVC_EXT = matthews_corrcoef(LinearSVC_ext_true, LinearSVC_ext_vote)
print(f"QUERY: F1 = {f1_LinearSVC_EXT}, BAS = {BAS_LinearSVC_EXT}, MCC: {MCC_LinearSVC_EXT}")


QUERY: F1 = 0.892821265913672, BAS = 0.8494417211328976, MCC: 0.852060303214856
QUERY: F1 = 0.36101882613510516, BAS = 0.16544117647058823, MCC: 0.2529850198158424




In [27]:
#### SAVE STUDY and FINAL MODEL AS PICKLE
# Run configuration + classifier name, built once and shared by every file written below
LinearSVC_run_tag = f"{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_LinearSVC"

# Optuna studies (one per cell type) and the fitted voting classifier
joblib.dump(LinearSVC_cell_type_studies, os.path.join(mainDir, f"study_objects/BestParams_{LinearSVC_run_tag}.pkl"))
joblib.dump(clf_LinearSVC_TRAINED, os.path.join(mainDir, f"best_models/{LinearSVC_run_tag}.pkl"))

# Per-sample prediction tables
LinearSVC_cellType_prediction_VALIDATION_df.to_pickle(os.path.join(mainDir, f"label_predicted_dataframes/{LinearSVC_run_tag}_VALIDATION.pkl"))
LinearSVC_cellType_prediction_EXTERNAL_df.to_pickle(os.path.join(mainDir, f"label_predicted_dataframes/{LinearSVC_run_tag}_EXTERNAL.pkl"))

In [28]:
### Append the LinearSVC results as a new row of METRICdf
# Values follow the column order of METRICdf. Re-running this cell appends another row.
LinearSVC_metric_row = [
    integration_method, embSpace, n_embedded, n_Patient_embedded, batch, annotation,
    'LinearSVC',
    f1_LinearSVC_VAL, f1_LinearSVC_EXT,
    BAS_LinearSVC_VAL, BAS_LinearSVC_EXT,
    MCC_LinearSVC_VAL, MCC_LinearSVC_EXT,
    f1_LinearSVC_REFERENCE,
]
METRICdf.loc[METRICdf.shape[0],:] = LinearSVC_metric_row

#### SVC

In [29]:
# SVC arguments that are held constant (not tuned by Optuna)
fixed_parameter_SVC = ({
    'max_iter': -1,
})


def SVC_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         random_seed = 42,
                         optimization = 'F1', 
                         n_trials = 300,
                         n_job = -1):
    """Tune an SVC per cell type with Optuna, scored by sample-level cross-validation.

    For every cell type found in ``adataTrain.obs[cell_type_col]`` a study is
    run whose objective is the mean, over the folds in ``KfoldPatientSplit``,
    of the chosen metric obtained by a ``CellTypeDiseasePrediction`` fitted on
    the training samples of the fold and evaluated on its test samples.

    Parameters
    ----------
    adataTrain : AnnData
        Training data (all cell types).
    KfoldPatientSplit : list of (train_idx, test_idx)
        Positional indices into the notebook-level table ``Pdata`` (one row
        per sample); ``Pdata`` must therefore be defined before calling.
    y_true_col, sample_id_col, cell_type_col : str
        Columns of ``adataTrain.obs`` with label, sample id and cell type.
    random_seed : int
        Seed of the TPE sampler.
    optimization : str
        ``'BAS'`` maximises balanced accuracy; any other value maximises the
        weighted F1 score.
    n_trials : int
        Number of Optuna trials per cell type.
    n_job : int
        Forwarded to ``study.optimize(n_jobs=...)``.

    Returns
    -------
    dict
        Cell-type name -> Optuna study.
    """
    # Fold indices -> sample IDs; independent of trial and cell type, so resolved once
    fold_samples = [(Pdata.iloc[train_idx][sample_id_col].tolist(),
                     Pdata.iloc[test_idx][sample_id_col].tolist())
                    for train_idx, test_idx in KfoldPatientSplit]

    SVC_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")
        
        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        
        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'kernel': trial.suggest_categorical("kernel", ['sigmoid','rbf','poly']),
                'class_weight': trial.suggest_categorical("class_weight", ['balanced',None]),
                'C': trial.suggest_float('C', 1e-3, 1e5, log=True),
            }

            class_params |= fixed_parameter_SVC
            # The polynomial degree is only sampled when the polynomial kernel is chosen
            if class_params['kernel'] == 'poly':
                class_params.update({'degree': trial.suggest_int('degree', 2,6)})    

            # Column in which CellTypeDiseasePrediction.predict stores its predictions
            pred_col = f"disease_pred_{ct_i}"
            metric_list = []
            
            for pListTrain, pListTest in fold_samples:

                in_train = adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)
                in_test = adataTrain_ct_i.obs[sample_id_col].isin(pListTest)

                # A copy of the params is passed because the wrapper consumes the 'scaler' key
                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),
                                                classifier=SVC,
                                                cell_type=ct_i,
                                                y_true_col=y_true_col,
                                                sample_id_col=sample_id_col).fit(adataTrain_ct_i[in_train])
            
                y_pred_df = clf.predict(adataTrain_ct_i[in_test], y_true_col = y_true_col)
        
                if optimization == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df[pred_col])
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[pred_col], average='weighted')
        
                metric_list.append(M)
            
            # Mean over however many folds were supplied
            return sum(metric_list)/len(metric_list)
                
        # Optimize hyperparameters with Optuna
        # NOTE: with n_job != 1 the trials run in parallel, so the search may not be exactly
        # reproducible even though the sampler is seeded.
        SVC_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i,sampler=TPESampler(seed=random_seed))
        SVC_cell_type_studies[ct_i].optimize(objective_CT, n_trials=n_trials, show_progress_bar=True, n_jobs=n_job)

    return SVC_cell_type_studies

### EXECUTE THE FUNCTION ###
# The metric is taken from the run parameter so it always matches the 'opt<metric>' tag of the saved files
SVC_cell_type_studies = SVC_cellTypeOptimization(adataTrain = MAINpseudobulkFilt, 
                                                   KfoldPatientSplit = KfoldPatientSplit,
                                                   y_true_col = 'disease',
                                                   sample_id_col = 'sampleID',
                                                   cell_type_col = 'Level1',
                                                   optimization = optimization_metric,
                                                   n_trials = 300, n_job = n_job, random_seed = 42)


B


  0%|          | 0/300 [00:00<?, ?it/s]


DC


  0%|          | 0/300 [00:00<?, ?it/s]


ILC


  0%|          | 0/300 [00:00<?, ?it/s]


Mono


  0%|          | 0/300 [00:00<?, ?it/s]


Plasma


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD4_Naive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD4_NonNaive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD8_Naive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD8_NonNaive


  0%|          | 0/300 [00:00<?, ?it/s]


UTC


  0%|          | 0/300 [00:00<?, ?it/s]


pDC


  0%|          | 0/300 [00:00<?, ?it/s]

In [30]:
# Fit one SVC per cell type on the whole MAIN data, each using the
# parameters of the best trial of its study (on top of the fixed parameters)
clf_SVC_TRAINED = VotingDisease(
    classifier=SVC,
    cell_type_studies=SVC_cell_type_studies,
    fixed_params=fixed_parameter_SVC,
    cell_type_col='Level1',
    sample_id_col='sampleID',
    y_true_col='disease',
)
# fit() returns the same object, so re-binding keeps the original name and value
clf_SVC_TRAINED = clf_SVC_TRAINED.fit(MAINpseudobulkFilt)

In [31]:
### Average, over cell types, of the best cross-validated score reached on the training (MAIN) data
optMetric = 0
for ct_study in SVC_cell_type_studies.values():
    # best_value is the highest objective value found by the study of this cell type
    optMetric += ct_study.best_value
n_cell_types = len(SVC_cell_type_studies)
f1_SVC_REFERENCE = optMetric / n_cell_types
print(f"F1 on reference = {f1_SVC_REFERENCE}")

F1 on reference = 0.698779577587197


**predicting disease in VALIDATION and EXTERNAL datasets**

In [32]:
# VALIDATION — score the per-sample majority vote against the true label
# (this is the first "QUERY" line printed below)
SVC_cellType_prediction_VALIDATION_df = clf_SVC_TRAINED.predict(VALIDATIONpseudobulkFilt)
SVC_val_true = SVC_cellType_prediction_VALIDATION_df[y_true_col]
SVC_val_vote = SVC_cellType_prediction_VALIDATION_df['majority_pred']

f1_SVC_VAL = f1_score(SVC_val_true, SVC_val_vote, average='weighted')
BAS_SVC_VAL = balanced_accuracy_score(SVC_val_true, SVC_val_vote)
MCC_SVC_VAL = matthews_corrcoef(SVC_val_true, SVC_val_vote)
print(f"QUERY: F1 = {f1_SVC_VAL}, BAS = {BAS_SVC_VAL}, MCC: {MCC_SVC_VAL}")


# EXTERNAL — same three metrics (this is the second "QUERY" line printed below)
SVC_cellType_prediction_EXTERNAL_df = clf_SVC_TRAINED.predict(EXTERNALpseudobulkFilt)
SVC_ext_true = SVC_cellType_prediction_EXTERNAL_df[y_true_col]
SVC_ext_vote = SVC_cellType_prediction_EXTERNAL_df['majority_pred']

f1_SVC_EXT = f1_score(SVC_ext_true, SVC_ext_vote, average='weighted')
BAS_SVC_EXT = balanced_accuracy_score(SVC_ext_true, SVC_ext_vote)
MCC_SVC_EXT = matthews_corrcoef(SVC_ext_true, SVC_ext_vote)
print(f"QUERY: F1 = {f1_SVC_EXT}, BAS = {BAS_SVC_EXT}, MCC: {MCC_SVC_EXT}")


QUERY: F1 = 0.9606126200762726, BAS = 0.914351851851852, MCC: 0.9384062227663746
QUERY: F1 = 0.33828137885535214, BAS = 0.15418198529411764, MCC: 0.2273056152120926




In [33]:
#### SAVE STUDY and FINAL MODEL AS PICKLE
# Run configuration + classifier name, built once and shared by every file written below
SVC_run_tag = f"{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_SVC"

# Optuna studies (one per cell type) and the fitted voting classifier
joblib.dump(SVC_cell_type_studies, os.path.join(mainDir, f"study_objects/BestParams_{SVC_run_tag}.pkl"))
joblib.dump(clf_SVC_TRAINED, os.path.join(mainDir, f"best_models/{SVC_run_tag}.pkl"))

# Per-sample prediction tables
SVC_cellType_prediction_VALIDATION_df.to_pickle(os.path.join(mainDir, f"label_predicted_dataframes/{SVC_run_tag}_VALIDATION.pkl"))
SVC_cellType_prediction_EXTERNAL_df.to_pickle(os.path.join(mainDir, f"label_predicted_dataframes/{SVC_run_tag}_EXTERNAL.pkl"))

In [34]:
### Append the SVC results as a new row of METRICdf
# Values follow the column order of METRICdf. Re-running this cell appends another row.
SVC_metric_row = [
    integration_method, embSpace, n_embedded, n_Patient_embedded, batch, annotation,
    'SVC',
    f1_SVC_VAL, f1_SVC_EXT,
    BAS_SVC_VAL, BAS_SVC_EXT,
    MCC_SVC_VAL, MCC_SVC_EXT,
    f1_SVC_REFERENCE,
]
METRICdf.loc[METRICdf.shape[0],:] = SVC_metric_row

#### kNN

In [35]:
# KNeighborsClassifier arguments that are held constant (merged into every trial's parameters).
# NOTE(review): n_jobs is presumably kept at 1 because the Optuna trials already run in parallel — confirm.
fixed_parameter_kNN = ({
    'n_jobs': 1
})

def kNN_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         optimization = 'F1', 
                         n_job = 40):

    kNN_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")
        
        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        
        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'metric': trial.suggest_categorical("metric", ['cosine','euclidean']),
                'weights': trial.suggest_categorical("weights", ['uniform','distance']),
                'n_neighbors': trial.suggest_int('n_neighbors', 1, 5, log=False),
            }

            class_params |= fixed_parameter_kNN

            metric_list = []
            
            for foldK in KfoldPatientSplit:
                
                pListTrain = Pdata.iloc[foldK[0]][sample_id_col].tolist()
                pListTest = Pdata.iloc[foldK[1]][sample_id_col].tolist()
                
                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),classifier=KNeighborsClassifier, cell_type = ct_i).fit(adataTrain_ct_i[adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)])
            
                y_pred_df = clf.predict(adataTrain_ct_i[adataTrain_ct_i.obs[sample_id_col].isin(pListTest)], y_true_col = y_true_col)
        
                if optimization_metric == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df.disease_pred)
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[f"disease_pred_{ct_i}"], average='weighted')
        
                metric_list.append(M)
            
            return sum(metric_list)/5
                
        # Optimize hyperparameters with Optuna
        kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())
        kNN_cell_type_studies[ct_i].optimize(objective_CT, show_progress_bar=False, n_jobs=40)#catch = ValueError)

    return kNN_cell_type_studies

### EXECUTE THE FUNCTION ###
kNN_cell_type_studies = kNN_cellTypeOptimization(adataTrain = MAINpseudobulkFilt, 
                                                 KfoldPatientSplit = KfoldPatientSplit,
                                                 y_true_col = 'disease',
                                                 sample_id_col = 'sampleID',
                                                 cell_type_col = 'Level1',
                                                 optimization = 'F1',
                                                 n_job = min(40,n_job))


B


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



DC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



ILC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



Mono


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



Plasma


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD4_Naive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD4_NonNaive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD8_Naive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD8_NonNaive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



UTC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



pDC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())


In [36]:
# Fit the kNN voting classifier on the full MAIN set, handing it the per-cell-type
# Optuna studies together with the fixed (non-tuned) kNN parameters.
clf_kNN_TRAINED = VotingDisease(
    classifier=KNeighborsClassifier,
    cell_type_studies=kNN_cell_type_studies,
    fixed_params=fixed_parameter_kNN,
    cell_type_col='Level1',
    sample_id_col='sampleID',
    y_true_col='disease',
).fit(MAINpseudobulkFilt)

In [37]:
### Average of the best cross-validated score of every per-cell-type study (training data)
optMetric = sum(study.best_value for study in kNN_cell_type_studies.values())
f1_kNN_REFERENCE = optMetric / len(kNN_cell_type_studies)
print(f"F1 on reference = {f1_kNN_REFERENCE}")

F1 on reference = 0.6427197517447438


**Predicting disease in the VALIDATION and EXTERNAL datasets**

In [38]:
# VALIDATION: predict, then score the `majority_pred` column against the true labels
kNN_cellType_prediction_VALIDATION_df = clf_kNN_TRAINED.predict(VALIDATIONpseudobulkFilt)
kNN_val_true = kNN_cellType_prediction_VALIDATION_df[y_true_col]
kNN_val_pred = kNN_cellType_prediction_VALIDATION_df["majority_pred"]

f1_kNN_VAL = f1_score(kNN_val_true, kNN_val_pred, average='weighted')
BAS_kNN_VAL = balanced_accuracy_score(kNN_val_true, kNN_val_pred)
MCC_kNN_VAL = matthews_corrcoef(kNN_val_true, kNN_val_pred)
print(f"QUERY: F1 = {f1_kNN_VAL}, BAS = {BAS_kNN_VAL}, MCC: {MCC_kNN_VAL}")


# EXTERNAL: same three scores on the external cohort
kNN_cellType_prediction_EXTERNAL_df = clf_kNN_TRAINED.predict(EXTERNALpseudobulkFilt)
kNN_ext_true = kNN_cellType_prediction_EXTERNAL_df[y_true_col]
kNN_ext_pred = kNN_cellType_prediction_EXTERNAL_df["majority_pred"]

f1_kNN_EXT = f1_score(kNN_ext_true, kNN_ext_pred, average='weighted')
BAS_kNN_EXT = balanced_accuracy_score(kNN_ext_true, kNN_ext_pred)
MCC_kNN_EXT = matthews_corrcoef(kNN_ext_true, kNN_ext_pred)
print(f"QUERY: F1 = {f1_kNN_EXT}, BAS = {BAS_kNN_EXT}, MCC: {MCC_kNN_EXT}")




QUERY: F1 = 0.8884191776835115, BAS = 0.7666666666666667, MCC: 0.8504080189878717


QUERY: F1 = 0.2641621943947525, BAS = 0.1256127450980392, MCC: 0.15702953795171695




In [39]:
#### Persist the kNN Optuna studies, the fitted model and the prediction tables as pickles
# Every output path is built first so the file-name pattern of each artifact is easy to compare.
kNN_study_path = os.path.join(mainDir, f"study_objects/BestParams_{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_kNN.pkl")
kNN_model_path = os.path.join(mainDir, f"best_models/{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_kNN.pkl")
kNN_validation_path = os.path.join(mainDir, f"label_predicted_dataframes/{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_kNN_VALIDATION.pkl")
kNN_external_path = os.path.join(mainDir, f"label_predicted_dataframes/{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_kNN_EXTERNAL.pkl")

# Study objects and the trained classifier go through joblib; prediction DataFrames use pandas pickling.
joblib.dump(kNN_cell_type_studies, kNN_study_path)
joblib.dump(clf_kNN_TRAINED, kNN_model_path)
kNN_cellType_prediction_VALIDATION_df.to_pickle(kNN_validation_path)
kNN_cellType_prediction_EXTERNAL_df.to_pickle(kNN_external_path)

In [40]:
### Append the kNN scores as a new row at the end of the results table
# Values are listed in the column order of METRICdf: run identifiers, classifier name,
# then VALIDATION/EXTERNAL pairs for F1, BAS and MCC, and finally the training-set average.
kNN_metric_row = [
    integration_method, embSpace, n_embedded, n_Patient_embedded, batch, annotation, 'kNN',
    f1_kNN_VAL, f1_kNN_EXT,
    BAS_kNN_VAL, BAS_kNN_EXT,
    MCC_kNN_VAL, MCC_kNN_EXT,
    f1_kNN_REFERENCE,
]
METRICdf.loc[len(METRICdf), :] = kNN_metric_row

### Saving dataframe with results

In [41]:
# Write the metrics table to a pickle whose name encodes the run parameters.
METRICdf_path = os.path.join(mainDir, f"metrics_dataframes/METRICdf_{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}.pkl")
METRICdf.to_pickle(METRICdf_path)

In [42]:
# Display the collected metrics table (one row per classifier appended above).
METRICdf

Unnamed: 0,integration_method,embedded_space,n_embedded,n_Patient_embedded,batchKeys,annotation,classification_method,VALIDATION_F1,EXTERNAL_F1,VALIDATION_BAS,EXTERNAL_BAS,VALIDATION_MCC,EXTERNAL_MCC,avg_TRAIN_F1
0,scGen,cell,20,0,chemistry,Level2,LinearSVC,0.892821,0.361019,0.849442,0.165441,0.85206,0.252985,0.593346
1,scGen,cell,20,0,chemistry,Level2,SVC,0.960613,0.338281,0.914352,0.154182,0.938406,0.227306,0.69878
2,scGen,cell,20,0,chemistry,Level2,kNN,0.888419,0.264162,0.766667,0.125613,0.850408,0.15703,0.64272


In [43]:
# Record the session's environment details at the end of the notebook for reproducibility.
session_info.show()