In [1]:
# Parameters
# Run configuration for this notebook; the next cell checks that every name below is defined.
query_dataset = "VALIDATION"  # label of the query split; used in metric column names and output file names
n_embedded = 50  # used below only to tag the metrics rows and output file names
n_Patient_embedded = 100  # used below only to tag the metrics rows and output file names
embSpace = "cell"  # stored as 'embedded_space' in the metrics table
integration_method = "scPoli"  # tag for the integration method
batch = "chemistry"  # stored as 'batchKeys' in the metrics table
annotation = "Level1n2"  # annotation tag
# Glob patterns resolved against the project root via `here`; each must match exactly one .h5ad file.
REFERENCEadata_path = "03_downstream_analysis/08_PatientClassifier/scPoli/results/PSEUDOBULKs/scPoli_PSEUDOBULKcell_ref_latents_VALIDATION_2_50_100_*.h5ad"
QUERYNadata_path = "03_downstream_analysis/08_PatientClassifier/scPoli/results/PSEUDOBULKs/scPoli_PSEUDOBULKcell_VALIDATION_2_50_100_*.h5ad"
optimization_metric = "F1"  # 'BAS' selects balanced accuracy in the tuning objectives; any other value selects weighted F1
n_job = 32  # number of parallel Optuna jobs requested for the tuning runs


In [2]:
# Echo every expected notebook parameter and fail fast if one is missing.
# The value is looked up in the module namespace instead of being passed to
# eval(): eval() executes whatever text it is given, which is unnecessary for a
# plain name lookup and unsafe if the list ever contains an expression.
for v in ['REFERENCEadata_path','QUERYNadata_path','query_dataset','integration_method',
          'embSpace','n_embedded','n_Patient_embedded','batch','annotation','optimization_metric','n_job']:
    if v not in globals():
        raise ValueError(f"{v} is not defined")
    print(f"{v} = {globals()[v]}")

REFERENCEadata_path = 03_downstream_analysis/08_PatientClassifier/scPoli/results/PSEUDOBULKs/scPoli_PSEUDOBULKcell_ref_latents_VALIDATION_2_50_100_*.h5ad
QUERYNadata_path = 03_downstream_analysis/08_PatientClassifier/scPoli/results/PSEUDOBULKs/scPoli_PSEUDOBULKcell_VALIDATION_2_50_100_*.h5ad
query_dataset = VALIDATION
integration_method = scPoli
embSpace = cell
n_embedded = 50
n_Patient_embedded = 100
batch = chemistry
annotation = Level1n2
optimization_metric = F1
n_job = 32


In [3]:
import os

from glob import glob

import optuna as op

from optuna.samplers import TPESampler, BruteForceSampler

import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt

import joblib


from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, balanced_accuracy_score, matthews_corrcoef

from pyprojroot import here

from tqdm import tqdm

In [4]:
# Restrict Optuna's logger to WARNING and above.
op.logging.set_verbosity(op.logging.WARNING)

In [5]:
# Notebook-level random seed (the tuning functions below also default to 42).
random_seed = 42

In [6]:
# Root folder (resolved from the project root) under which studies, models, predictions and metrics are saved.
mainDir=here('03_downstream_analysis/08_PatientClassifier/Classifiers/results_CT_studies/')

### Defining functions

In [7]:
def _most_frequent_or_na(row):
    mode = row.mode()
    if len(mode) > 1:
        return 'nan'
    else:
        return mode.iloc[0]
            
class CellTypeDiseasePrediction():
    """Disease classifier for the profiles of a single cell type.

    Wraps any estimator class exposing ``fit(X, y)`` and ``predict(X)`` and
    optionally standardises the features with ``StandardScaler`` first.

    Parameters
    ----------
    clf_params : dict
        Keyword arguments for ``classifier``. The key ``'scaler'`` is consumed
        here (truthy -> standardise features) and is not forwarded to the
        estimator; it defaults to ``False`` when absent. The dict passed in is
        copied, never modified.
    classifier : type
        Estimator class, instantiated as ``classifier(**clf_params)``.
    cell_type : str
        Cell-type label; only used to name the prediction column
        ``disease_pred_<cell_type>``.
    y_true_col : str
        ``obs`` column holding the true label.
    sample_id_col : str
        ``obs`` column holding the sample identifier.
    """

    def __init__(self, clf_params, classifier, cell_type, y_true_col='disease', sample_id_col='sampleID'):
        # Private copy: popping 'scaler' must not alter the caller's dict,
        # which may be reused for another fold or another model.
        self.clf_params = dict(clf_params)
        self.classifier = classifier
        self.ct = cell_type
        self.y_true_col = y_true_col
        self.sample_id_col = sample_id_col
        self.clfs = None    # fitted estimator, set by fit()
        self.scaler = None  # fitted StandardScaler, set by fit() when use_scaler is true
        self.use_scaler = self.clf_params.pop('scaler', False)

    def fit(self, adataTrain):
        """Fit the (optional) scaler and the estimator on ``adataTrain``.

        ``adataTrain`` must expose the feature matrix as ``.X`` and the labels
        as ``.obs[y_true_col]``. Returns ``self`` so calls can be chained.
        """
        X_train = adataTrain.X
        y_train = adataTrain.obs[self.y_true_col]

        if self.use_scaler:
            self.scaler = StandardScaler().fit(X_train)
            X_train = self.scaler.transform(X_train)

        self.clfs = self.classifier(**self.clf_params).fit(X_train, y_train)

        return self

    def predict(self, adataVal, y_true_col='disease'):
        """Predict the label of every observation in ``adataVal``.

        Parameters
        ----------
        adataVal : object exposing ``.X`` and ``.obs``
            Data to classify; ``obs`` must contain the sample-id and
            true-label columns given at construction time.
        y_true_col : str
            Ignored (the column given to ``__init__`` is used); kept so that
            existing callers passing it keep working.

        Returns
        -------
        pandas.DataFrame
            Indexed by sample id, with the true label column followed by
            ``disease_pred_<cell_type>``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.clfs is None:
            raise RuntimeError("CellTypeDiseasePrediction.predict called before fit")

        X_val = adataVal.X

        if self.use_scaler:
            # Re-use the scaling learnt on the training data.
            X_val = self.scaler.transform(X_val)

        y_pred = self.clfs.predict(X_val)

        cell_type_pred = pd.DataFrame({self.sample_id_col: adataVal.obs[self.sample_id_col],
                                       self.y_true_col: adataVal.obs[self.y_true_col],
                                       f'disease_pred_{self.ct}': y_pred}).set_index(self.sample_id_col)

        return cell_type_pred

class VotingDisease():
    """Majority-vote ensemble of one disease classifier per cell type.

    Parameters
    ----------
    classifier : type
        Estimator class used for every cell type.
    cell_type_studies : dict
        Maps a cell-type label to an object exposing ``best_trial.params``
        (the tuned hyper-parameters for that cell type).
    fixed_params : dict or None
        Parameters applied to every cell type; tuned parameters take
        precedence on key clashes. ``None`` means no fixed parameters.
    cell_type_col, sample_id_col, y_true_col : str
        ``obs`` columns holding the cell type, sample id and true label.
    """

    def __init__(self, classifier = None, cell_type_studies = None, fixed_params = None, 
                 cell_type_col = 'Level1', sample_id_col = 'sampleID', y_true_col = 'disease'):

        self.fixed_params = fixed_params
        self.classifier = classifier
        self.ct_studies = cell_type_studies
        self.cell_type_col = cell_type_col
        self.y_true_col = y_true_col
        self.sample_id_col = sample_id_col
        self.clfs = {}  # cell type -> fitted CellTypeDiseasePrediction

    def fit(self, adataR = None):
        """Fit one classifier per cell type on the matching rows of ``adataR``.

        Returns ``self`` so calls can be chained.
        """
        fixed = self.fixed_params or {}

        for ct_i, study in self.ct_studies.items():

            # Use the column configured on this instance, not a notebook-level global.
            adataR_ct = adataR[adataR.obs[self.cell_type_col] == ct_i]

            clf_params = fixed | study.best_trial.params

            self.clfs[ct_i] = (CellTypeDiseasePrediction(clf_params=clf_params,
                                                         classifier=self.classifier,
                                                         cell_type = ct_i,
                                                         sample_id_col=self.sample_id_col,
                                                         y_true_col = self.y_true_col)
                                .fit(adataR_ct))
        return self

    def predict(self, adataQ = None):
        """Predict every sample of ``adataQ`` by majority vote across cell types.

        Returns
        -------
        pandas.DataFrame
            Indexed by sample id, with one ``disease_pred_<cell type>`` column
            per cell type present in ``adataQ`` (missing where a sample lacks
            that cell type), ``majority_pred`` (the string ``'nan'`` when the
            vote is tied) and the true label column.

        Raises
        ------
        ValueError
            If ``adataQ`` contains none of the fitted cell types.
        """
        y_pred_query_list = []

        for ct_i in self.ct_studies.keys():

            is_ct = adataQ.obs[self.cell_type_col] == ct_i
            if not is_ct.any():
                # Nothing to classify for this cell type; it simply casts no vote.
                continue

            ct_pred = self.clfs[ct_i].predict(adataQ[is_ct], y_true_col = self.y_true_col)
            # Select the prediction column by name rather than by position.
            y_pred_query_list.append(ct_pred[f'disease_pred_{ct_i}'])

        if not y_pred_query_list:
            raise ValueError("adataQ contains none of the cell types this model was fitted on")

        # Outer join on sample id: samples lacking a cell type get a missing vote for it.
        cellType_prediction_df = pd.concat(y_pred_query_list, axis=1, ignore_index=False, join='outer')
        cellType_prediction_df['majority_pred'] = cellType_prediction_df.apply(_most_frequent_or_na, axis=1)

        # Attach the true label of each sample.
        cellType_prediction_df = (cellType_prediction_df.merge(adataQ.obs[[self.sample_id_col, self.y_true_col]].drop_duplicates(),
                                                               left_index = True, right_on = self.sample_id_col)
                                  .set_index(self.sample_id_col))

        return cellType_prediction_df

### Loading data

In [8]:
def _read_single_h5ad(path_pattern):
    """Read the one .h5ad file matching ``path_pattern``.

    ``path_pattern`` is a glob pattern relative to the project root. The base
    name of the file that is read gets printed. Raises ``ValueError`` when the
    pattern matches no file or more than one file.
    """
    matches = glob(str(here(path_pattern)))
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one file matching {path_pattern!r}, found {len(matches)}: {matches}"
        )
    print(os.path.basename(matches[0]))
    return sc.read_h5ad(matches[0])


REFERENCEpseudobulk = _read_single_h5ad(REFERENCEadata_path)
QUERYpseudobulk = _read_single_h5ad(QUERYNadata_path)

scPoli_PSEUDOBULKcell_ref_latents_VALIDATION_2_50_100_run3_tdbh923o.h5ad
scPoli_PSEUDOBULKcell_VALIDATION_2_50_100_run3_tdbh923o.h5ad


In [9]:
# Renaming 'Level1_pred' in scGen query datasets for consistency
# and assert that 'disease' is in obs
# The rename leaves obs unchanged when there is no 'Level1_pred' column.
for ad_i in [REFERENCEpseudobulk,QUERYpseudobulk]:
    ad_i.obs.rename({'Level1_pred':'Level1'},axis=1,inplace=True)
    assert('disease' in ad_i.obs.columns)

In [10]:
# Quick look at the reference and query objects as loaded.
REFERENCEpseudobulk, QUERYpseudobulk

(AnnData object with n_obs × n_vars = 11372 × 50
     obs: 'sampleID', 'Level1', 'disease', 'n_observation',
 AnnData object with n_obs × n_vars = 2112 × 50
     obs: 'sampleID', 'Level1', 'disease', 'n_observation')

In [11]:
# Disease labels present in the query set.
QUERYpseudobulk.obs.disease.unique().tolist()

['COVID', 'sepsis', 'HNSCC', 'SLE', 'healthy', 'CD', 'PS', 'RA', 'PSA']

In [12]:
# Number of distinct samples in the query set.
QUERYpseudobulk.obs.sampleID.unique().shape

(144,)

### Removing unwanted cell-types

In [13]:
# Cell types left out of the classifier; the same exclusion list is applied to reference and query.
_excluded_cell_types = ['Progenitors','Cycling_cells','Platelets','RBC']
_keep_reference = ~REFERENCEpseudobulk.obs['Level1'].isin(_excluded_cell_types)
_keep_query = ~QUERYpseudobulk.obs['Level1'].isin(_excluded_cell_types)
REFERENCEpseudobulkFilt = REFERENCEpseudobulk[_keep_reference].copy()
QUERYpseudobulkFilt = QUERYpseudobulk[_keep_query].copy()

In [14]:
# NOTE(review): the second object displayed is the unfiltered QUERYpseudobulk, not QUERYpseudobulkFilt — confirm which was intended.
REFERENCEpseudobulkFilt, QUERYpseudobulk

(AnnData object with n_obs × n_vars = 8816 × 50
     obs: 'sampleID', 'Level1', 'disease', 'n_observation',
 AnnData object with n_obs × n_vars = 2112 × 50
     obs: 'sampleID', 'Level1', 'disease', 'n_observation')

In [15]:
# Cast the reference disease labels to plain strings.
# NOTE(review): the query labels are not cast here — confirm that is intended.
REFERENCEpseudobulkFilt.obs['disease'] = REFERENCEpseudobulkFilt.obs['disease'].astype(str)

## Input data and main parameters

In [16]:
# Names of the obs columns used throughout the notebook.
cell_type_col='Level1'  # cell-type annotation
y_true_col='disease'  # true disease label
sample_id_col='sampleID'  # sample identifier

**Initialize pandas dataframe to collect results**

In [17]:
# Empty results table: run descriptors, the classifier name, three query-set scores
# and one score computed on the reference; one row is appended per classifier below.
METRICdf = pd.DataFrame(columns=['integration_method','embedded_space','n_embedded','n_Patient_embedded','batchKeys','annotation',
                      'classification_method',f'{query_dataset}_F1',f'{query_dataset}_BAS',f'{query_dataset}_MCC', f'{query_dataset}_TRAIN_F1'])
METRICdf

Unnamed: 0,integration_method,embedded_space,n_embedded,n_Patient_embedded,batchKeys,annotation,classification_method,VALIDATION_F1,VALIDATION_BAS,VALIDATION_MCC,VALIDATION_TRAIN_F1


### Hyper parameter tuning and classification

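The aim is to identify the best classifier and its hyper-parameters, using the **MAIN adata** (reference) as the training dataset and the **VALIDATION adata** (query) as the held-out evaluation set. Hyper-parameters are tuned separately for each cell type with patient-level stratified 5-fold cross-validation on the training dataset.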

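Then, with the best model and settings for each model family (e.g., LinearSVC, SVC, kNN, NN, GBM), we will classify the samples in the VALIDATION, EXCLUDED and EXTERNAL datasets. This notebook covers LinearSVC, SVC and kNN on the dataset selected by `query_dataset`.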

In [18]:
from sklearn.model_selection import StratifiedKFold

In [19]:
# Stratified 5-fold splitter; no shuffle or random_state is set, so scikit-learn's defaults apply.
sKf = StratifiedKFold(n_splits = 5)

In [20]:
# One row per reference sample with its disease label (first value found for that sample).
Pdata = REFERENCEpseudobulkFilt.obs.groupby(sample_id_col, observed=True).agg({y_true_col:'first'}).reset_index()

In [21]:
# Materialise the folds once as (train, test) row positions into Pdata, stratified by disease,
# so every cell type and every classifier family is tuned on the same sample-level split.
KfoldPatientSplit = list(sKf.split(X = Pdata[sample_id_col], y=Pdata[y_true_col]))

#### LinearSVC

In [22]:
# Settings shared by every LinearSVC, applied on top of the tuned hyper-parameters.
fixed_parameter_LinearSVC = ({
    'max_iter': 1000000,
    'dual': 'auto',
})

def LinearSVC_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         random_seed = 42,
                         optimization = 'F1', 
                         n_trials = 50,
                         n_job = 50,
                         patient_df = None):
    """Tune one LinearSVC per cell type with sample-level cross-validation.

    Parameters
    ----------
    adataTrain : AnnData
        Training data; ``obs`` must hold the three columns named below.
    KfoldPatientSplit : list of (train_idx, test_idx)
        Row positions into ``patient_df`` defining the folds.
    y_true_col, sample_id_col, cell_type_col : str
        ``obs`` columns with the true label, sample id and cell type.
    random_seed : int
        Seed of the TPE sampler. Optuna documents that results are hard to
        reproduce when trials run in parallel, even with a seeded sampler.
    optimization : str
        ``'BAS'`` maximises balanced accuracy; any other value maximises
        weighted F1.
    n_trials : int
        Trials per cell type.
    n_job : int
        Parallel jobs passed to ``study.optimize``.
    patient_df : pandas.DataFrame, optional
        One row per sample, in the order the folds were built on. When
        omitted it is derived from ``adataTrain.obs`` (one row per sample id
        with the first label found).

    Returns
    -------
    dict
        Cell-type label -> finished ``optuna.Study``.
    """
    if patient_df is None:
        patient_df = (adataTrain.obs.groupby(sample_id_col, observed=True)
                      .agg({y_true_col: 'first'}).reset_index())

    # The sample lists of each fold depend neither on the cell type nor on the
    # trial, so resolve them once.
    fold_samples = [(patient_df.iloc[train_idx][sample_id_col].tolist(),
                     patient_df.iloc[test_idx][sample_id_col].tolist())
                    for train_idx, test_idx in KfoldPatientSplit]

    LinearSVC_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")

        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        pred_col = f"disease_pred_{ct_i}"

        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
                'C': trial.suggest_float('C', 1e-3, 1e5, log=True)
            }

            class_params |= fixed_parameter_LinearSVC

            metric_list = []

            for pListTrain, pListTest in fold_samples:

                in_train = adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)
                in_test = adataTrain_ct_i.obs[sample_id_col].isin(pListTest)

                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),
                                                classifier=LinearSVC,
                                                cell_type=ct_i,
                                                y_true_col=y_true_col,
                                                sample_id_col=sample_id_col).fit(adataTrain_ct_i[in_train])

                y_pred_df = clf.predict(adataTrain_ct_i[in_test], y_true_col = y_true_col)

                if optimization == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df[pred_col])
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[pred_col], average='weighted')

                metric_list.append(M)

            # Mean over however many folds were supplied.
            return sum(metric_list) / len(metric_list)

        # Optimize hyperparameters with Optuna
        LinearSVC_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i,sampler=TPESampler(seed=random_seed))
        LinearSVC_cell_type_studies[ct_i].optimize(objective_CT, n_trials=n_trials, show_progress_bar=True, n_jobs=n_job)

    return LinearSVC_cell_type_studies

### EXECUTE THE FUNCTION ###
LinearSVC_cell_type_studies = LinearSVC_cellTypeOptimization(adataTrain = REFERENCEpseudobulkFilt, 
                                                               KfoldPatientSplit = KfoldPatientSplit,
                                                               patient_df = Pdata,
                                                               y_true_col = y_true_col,
                                                               sample_id_col = sample_id_col,
                                                               cell_type_col = cell_type_col,
                                                               optimization = optimization_metric,
                                                               n_trials = 100, n_job = n_job, random_seed = random_seed)


B


  0%|          | 0/100 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[k] = df_sub[k].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub[k] = df_sub[k].cat.remove_unused_categories()



DC


  0%|          | 0/100 [00:00<?, ?it/s]


ILC


  0%|          | 0/100 [00:00<?, ?it/s]


Mono


  0%|          | 0/100 [00:00<?, ?it/s]


Plasma


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD4_Naive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD4_NonNaive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD8_Naive


  0%|          | 0/100 [00:00<?, ?it/s]


T_CD8_NonNaive


  0%|          | 0/100 [00:00<?, ?it/s]


UTC


  0%|          | 0/100 [00:00<?, ?it/s]


pDC


  0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
# Fit one LinearSVC per cell type on the filtered reference, each with the best-trial
# parameters of its study, and keep them together as a voting ensemble.
clf_LinearSVC_TRAINED = (VotingDisease(classifier = LinearSVC, 
                                         cell_type_studies = LinearSVC_cell_type_studies, 
                                         fixed_params = fixed_parameter_LinearSVC, 
                                         cell_type_col = 'Level1', sample_id_col = 'sampleID', y_true_col = 'disease')
                           .fit(REFERENCEpseudobulkFilt))

In [24]:
### Mean, over cell types, of each study's best cross-validated score on the reference data
### (a cross-validation score, not a training-set score; the metric is the one the objective optimised)
optMetric = 0
for ct_i in LinearSVC_cell_type_studies.keys():
    optMetric += LinearSVC_cell_type_studies[ct_i].best_value
f1_LinearSVC_REFERENCE = optMetric / len(LinearSVC_cell_type_studies)
print(f"F1 on reference = {f1_LinearSVC_REFERENCE}")

F1 on reference = 0.6386147204419217


In [25]:
### Predict query dataset
LinearSVC_cellType_prediction_query_df = clf_LinearSVC_TRAINED.predict(QUERYpseudobulkFilt)

# Sample-level scores: true label against the majority vote across cell types
# (majority_pred is the string 'nan' for a tied vote, so a tie never matches the true label).
_truth = LinearSVC_cellType_prediction_query_df[y_true_col]
_vote = LinearSVC_cellType_prediction_query_df['majority_pred']

f1_LinearSVC_QUERY = f1_score(_truth, _vote, average='weighted')
BAS_LinearSVC_QUERY = balanced_accuracy_score(_truth, _vote)
MCC_LinearSVC_QUERY = matthews_corrcoef(_truth, _vote)
print(f"QUERY: F1 = {f1_LinearSVC_QUERY}, BAS = {BAS_LinearSVC_QUERY}, MCC: {MCC_LinearSVC_QUERY}")

QUERY: F1 = 0.9714940634556533, BAS = 0.9618464052287581, MCC: 0.9476261590688073




In [26]:
### Add results to dataframe
# Appends one row; the values must follow the column order used when METRICdf was created.
METRICdf.loc[METRICdf.shape[0],:] = [integration_method,embSpace,n_embedded,n_Patient_embedded,batch,annotation,'LinearSVC',
                                     f1_LinearSVC_QUERY, BAS_LinearSVC_QUERY,MCC_LinearSVC_QUERY,f1_LinearSVC_REFERENCE]

In [27]:
#### SAVE STUDY and FINAL MODEL AS PICKLE
# All three artefacts share one run tag; only the sub-folder and file prefix differ.
_run_tag = f"{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_{query_dataset}_LinearSVC"
_study_path = os.path.join(mainDir, f"study_objects/BestParams_{_run_tag}.pkl")
_model_path = os.path.join(mainDir, f"best_models/{_run_tag}.pkl")
_pred_path = os.path.join(mainDir, f"label_predicted_dataframes/{_run_tag}.pkl")

# Create the output folders if needed, so a fresh checkout does not fail with FileNotFoundError.
for _path in (_study_path, _model_path, _pred_path):
    os.makedirs(os.path.dirname(_path), exist_ok=True)

joblib.dump(LinearSVC_cell_type_studies, _study_path)
joblib.dump(clf_LinearSVC_TRAINED, _model_path)

LinearSVC_cellType_prediction_query_df.to_pickle(_pred_path)

#### SVC

In [28]:
# Settings shared by every SVC, applied on top of the tuned hyper-parameters.
fixed_parameter_SVC = ({
    'max_iter': -1,
})


def SVC_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         random_seed = 42,
                         optimization = 'F1', 
                         n_trials = 300,
                         n_job = -1,
                         patient_df = None):
    """Tune one kernel SVC per cell type with sample-level cross-validation.

    Parameters
    ----------
    adataTrain : AnnData
        Training data; ``obs`` must hold the three columns named below.
    KfoldPatientSplit : list of (train_idx, test_idx)
        Row positions into ``patient_df`` defining the folds.
    y_true_col, sample_id_col, cell_type_col : str
        ``obs`` columns with the true label, sample id and cell type.
    random_seed : int
        Seed of the TPE sampler. Optuna documents that results are hard to
        reproduce when trials run in parallel, even with a seeded sampler.
    optimization : str
        ``'BAS'`` maximises balanced accuracy; any other value maximises
        weighted F1.
    n_trials : int
        Trials per cell type.
    n_job : int
        Parallel jobs passed to ``study.optimize``.
    patient_df : pandas.DataFrame, optional
        One row per sample, in the order the folds were built on. When
        omitted it is derived from ``adataTrain.obs`` (one row per sample id
        with the first label found).

    Returns
    -------
    dict
        Cell-type label -> finished ``optuna.Study``.
    """
    if patient_df is None:
        patient_df = (adataTrain.obs.groupby(sample_id_col, observed=True)
                      .agg({y_true_col: 'first'}).reset_index())

    # The sample lists of each fold depend neither on the cell type nor on the
    # trial, so resolve them once.
    fold_samples = [(patient_df.iloc[train_idx][sample_id_col].tolist(),
                     patient_df.iloc[test_idx][sample_id_col].tolist())
                    for train_idx, test_idx in KfoldPatientSplit]

    SVC_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")

        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        pred_col = f"disease_pred_{ct_i}"

        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'kernel': trial.suggest_categorical("kernel", ['sigmoid','rbf','poly']),
                'class_weight': trial.suggest_categorical("class_weight", ['balanced',None]),
                'C': trial.suggest_float('C', 1e-3, 1e5, log=True),
            }

            class_params |= fixed_parameter_SVC
            # The polynomial degree is only sampled when the polynomial kernel was drawn.
            if class_params['kernel'] == 'poly':
                class_params.update({'degree': trial.suggest_int('degree', 2,6)})

            metric_list = []

            for pListTrain, pListTest in fold_samples:

                in_train = adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)
                in_test = adataTrain_ct_i.obs[sample_id_col].isin(pListTest)

                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),
                                                classifier=SVC,
                                                cell_type=ct_i,
                                                y_true_col=y_true_col,
                                                sample_id_col=sample_id_col).fit(adataTrain_ct_i[in_train])

                y_pred_df = clf.predict(adataTrain_ct_i[in_test], y_true_col = y_true_col)

                if optimization == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df[pred_col])
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[pred_col], average='weighted')

                metric_list.append(M)

            # Mean over however many folds were supplied.
            return sum(metric_list) / len(metric_list)

        # Optimize hyperparameters with Optuna
        SVC_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i,sampler=TPESampler(seed=random_seed))
        SVC_cell_type_studies[ct_i].optimize(objective_CT, n_trials=n_trials, show_progress_bar=True, n_jobs=n_job)

    return SVC_cell_type_studies

### EXECUTE THE FUNCTION ###
SVC_cell_type_studies = SVC_cellTypeOptimization(adataTrain = REFERENCEpseudobulkFilt, 
                                                   KfoldPatientSplit = KfoldPatientSplit,
                                                   patient_df = Pdata,
                                                   y_true_col = y_true_col,
                                                   sample_id_col = sample_id_col,
                                                   cell_type_col = cell_type_col,
                                                   optimization = optimization_metric,
                                                   n_trials = 300, n_job = n_job, random_seed = random_seed)


B


  0%|          | 0/300 [00:00<?, ?it/s]


DC


  0%|          | 0/300 [00:00<?, ?it/s]


ILC


  0%|          | 0/300 [00:00<?, ?it/s]


Mono


  0%|          | 0/300 [00:00<?, ?it/s]


Plasma


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD4_Naive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD4_NonNaive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD8_Naive


  0%|          | 0/300 [00:00<?, ?it/s]


T_CD8_NonNaive


  0%|          | 0/300 [00:00<?, ?it/s]


UTC


  0%|          | 0/300 [00:00<?, ?it/s]


pDC


  0%|          | 0/300 [00:00<?, ?it/s]

In [29]:
# Fit one SVC per cell type on the filtered reference, each with the best-trial
# parameters of its study, and keep them together as a voting ensemble.
clf_SVC_TRAINED = (VotingDisease(classifier = SVC, 
                                         cell_type_studies = SVC_cell_type_studies, 
                                         fixed_params = fixed_parameter_SVC, 
                                         cell_type_col = 'Level1', sample_id_col = 'sampleID', y_true_col = 'disease')
                           .fit(REFERENCEpseudobulkFilt))

In [30]:
### Mean, over cell types, of each study's best cross-validated score on the reference data
### (a cross-validation score, not a training-set score; the metric is the one the objective optimised)
optMetric = 0
for ct_i in SVC_cell_type_studies.keys():
    optMetric += SVC_cell_type_studies[ct_i].best_value
f1_SVC_REFERENCE = optMetric / len(SVC_cell_type_studies)
print(f"F1 on reference = {f1_SVC_REFERENCE}")

F1 on reference = 0.6052414657457176


In [31]:
### Predict query dataset
SVC_cellType_prediction_query_df = clf_SVC_TRAINED.predict(QUERYpseudobulkFilt)

# Sample-level scores: true label against the majority vote across cell types
# (majority_pred is the string 'nan' for a tied vote, so a tie never matches the true label).
_truth = SVC_cellType_prediction_query_df[y_true_col]
_vote = SVC_cellType_prediction_query_df['majority_pred']

f1_SVC_QUERY = f1_score(_truth, _vote, average='weighted')
BAS_SVC_QUERY = balanced_accuracy_score(_truth, _vote)
MCC_SVC_QUERY = matthews_corrcoef(_truth, _vote)
print(f"QUERY: F1 = {f1_SVC_QUERY}, BAS = {BAS_SVC_QUERY}, MCC: {MCC_SVC_QUERY}")

QUERY: F1 = 0.9027942836391512, BAS = 0.7920343137254902, MCC: 0.8520666440362853




In [32]:
### Add results to dataframe
# Appends one row; the values must follow the column order used when METRICdf was created.
METRICdf.loc[METRICdf.shape[0],:] = [integration_method,embSpace,n_embedded,n_Patient_embedded,batch,annotation,'SVC',
                                     f1_SVC_QUERY, BAS_SVC_QUERY,MCC_SVC_QUERY,f1_SVC_REFERENCE]

In [33]:
#### SAVE STUDY and FINAL MODEL AS PICKLE
# All three artefacts share one run tag; only the sub-folder and file prefix differ.
_run_tag = f"{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_{query_dataset}_SVC"
_study_path = os.path.join(mainDir, f"study_objects/BestParams_{_run_tag}.pkl")
_model_path = os.path.join(mainDir, f"best_models/{_run_tag}.pkl")
_pred_path = os.path.join(mainDir, f"label_predicted_dataframes/{_run_tag}.pkl")

# Create the output folders if needed, so a fresh checkout does not fail with FileNotFoundError.
for _path in (_study_path, _model_path, _pred_path):
    os.makedirs(os.path.dirname(_path), exist_ok=True)

joblib.dump(SVC_cell_type_studies, _study_path)
joblib.dump(clf_SVC_TRAINED, _model_path)

SVC_cellType_prediction_query_df.to_pickle(_pred_path)

#### kNN

In [34]:
# Settings shared by every kNN classifier, applied on top of the tuned hyper-parameters.
fixed_parameter_kNN = ({
    'n_jobs': 1
})

def kNN_cellTypeOptimization(adataTrain = None, 
                         KfoldPatientSplit = None,
                         y_true_col = 'disease',
                         sample_id_col = 'sampleID',
                         cell_type_col = 'Level1',
                         optimization = 'F1', 
                         n_job = 40,
                         patient_df = None):
    """Tune one kNN classifier per cell type with sample-level cross-validation.

    The search space is a finite grid and is explored with Optuna's
    ``BruteForceSampler``, so no trial count is given.

    Parameters
    ----------
    adataTrain : AnnData
        Training data; ``obs`` must hold the three columns named below.
    KfoldPatientSplit : list of (train_idx, test_idx)
        Row positions into ``patient_df`` defining the folds.
    y_true_col, sample_id_col, cell_type_col : str
        ``obs`` columns with the true label, sample id and cell type.
    optimization : str
        ``'BAS'`` maximises balanced accuracy; any other value maximises
        weighted F1.
    n_job : int
        Parallel jobs passed to ``study.optimize`` (previously ignored in
        favour of a hard-coded 40).
    patient_df : pandas.DataFrame, optional
        One row per sample, in the order the folds were built on. When
        omitted it is derived from ``adataTrain.obs`` (one row per sample id
        with the first label found).

    Returns
    -------
    dict
        Cell-type label -> finished ``optuna.Study``.
    """
    if patient_df is None:
        patient_df = (adataTrain.obs.groupby(sample_id_col, observed=True)
                      .agg({y_true_col: 'first'}).reset_index())

    # The sample lists of each fold depend neither on the cell type nor on the
    # trial, so resolve them once.
    fold_samples = [(patient_df.iloc[train_idx][sample_id_col].tolist(),
                     patient_df.iloc[test_idx][sample_id_col].tolist())
                    for train_idx, test_idx in KfoldPatientSplit]

    kNN_cell_type_studies = {}

    for ct_i in adataTrain.obs[cell_type_col].unique().tolist():

        print(f"\n{ct_i}")

        adataTrain_ct_i = adataTrain[adataTrain.obs[cell_type_col] == ct_i].copy()
        pred_col = f"disease_pred_{ct_i}"

        def objective_CT(trial):
            class_params = {
                'scaler': trial.suggest_categorical('scaler', [True, False]),
                'metric': trial.suggest_categorical("metric", ['cosine','euclidean']),
                'weights': trial.suggest_categorical("weights", ['uniform','distance']),
                'n_neighbors': trial.suggest_int('n_neighbors', 1, 5, log=False),
            }

            class_params |= fixed_parameter_kNN

            metric_list = []

            for pListTrain, pListTest in fold_samples:

                in_train = adataTrain_ct_i.obs[sample_id_col].isin(pListTrain)
                in_test = adataTrain_ct_i.obs[sample_id_col].isin(pListTest)

                clf = CellTypeDiseasePrediction(clf_params=class_params.copy(),
                                                classifier=KNeighborsClassifier,
                                                cell_type=ct_i,
                                                y_true_col=y_true_col,
                                                sample_id_col=sample_id_col).fit(adataTrain_ct_i[in_train])

                y_pred_df = clf.predict(adataTrain_ct_i[in_test], y_true_col = y_true_col)

                if optimization == 'BAS':
                    M = balanced_accuracy_score(y_pred_df[y_true_col], y_pred_df[pred_col])
                else:
                    M = f1_score(y_pred_df[y_true_col], y_pred_df[pred_col], average='weighted')

                metric_list.append(M)

            # Mean over however many folds were supplied.
            return sum(metric_list) / len(metric_list)

        # Optimize hyperparameters with Optuna
        # NOTE(review): check the BruteForceSampler documentation on running it with several parallel jobs.
        kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())
        kNN_cell_type_studies[ct_i].optimize(objective_CT, show_progress_bar=False, n_jobs=n_job)

    return kNN_cell_type_studies

### EXECUTE THE FUNCTION ###
kNN_cell_type_studies = kNN_cellTypeOptimization(adataTrain = REFERENCEpseudobulkFilt, 
                                                 KfoldPatientSplit = KfoldPatientSplit,
                                                 patient_df = Pdata,
                                                 y_true_col = y_true_col,
                                                 sample_id_col = sample_id_col,
                                                 cell_type_col = cell_type_col,
                                                 optimization = optimization_metric,
                                                 n_job = min(n_job,40))


B


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



DC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



ILC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



Mono


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



Plasma


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD4_Naive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD4_NonNaive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD8_Naive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



T_CD8_NonNaive


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



UTC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())



pDC


  kNN_cell_type_studies[ct_i] = op.create_study(direction='maximize', study_name=ct_i, sampler=BruteForceSampler())


In [35]:
# Fit one kNN classifier per cell type on the filtered reference, each with the best-trial
# parameters of its study, and keep them together as a voting ensemble.
clf_kNN_TRAINED = (VotingDisease(classifier = KNeighborsClassifier, 
                                         cell_type_studies = kNN_cell_type_studies, 
                                         fixed_params = fixed_parameter_kNN, 
                                         cell_type_col = 'Level1', sample_id_col = 'sampleID', y_true_col = 'disease')
                           .fit(REFERENCEpseudobulkFilt))

In [36]:
### Mean, over cell types, of each study's best cross-validated score on the reference data
### (a cross-validation score, not a training-set score; the metric is the one the objective optimised)
optMetric = 0
for ct_i in kNN_cell_type_studies.keys():
    optMetric += kNN_cell_type_studies[ct_i].best_value
f1_kNN_REFERENCE = optMetric / len(kNN_cell_type_studies)
print(f"F1 on reference = {f1_kNN_REFERENCE}")

F1 on reference = 0.5296988463059641


In [37]:
### Predict query dataset
kNN_cellType_prediction_query_df = clf_kNN_TRAINED.predict(QUERYpseudobulkFilt)

# Sample-level scores: true label against the majority vote across cell types
# (majority_pred is the string 'nan' for a tied vote, so a tie never matches the true label).
_truth = kNN_cellType_prediction_query_df[y_true_col]
_vote = kNN_cellType_prediction_query_df['majority_pred']

f1_kNN_QUERY = f1_score(_truth, _vote, average='weighted')
BAS_kNN_QUERY = balanced_accuracy_score(_truth, _vote)
MCC_kNN_QUERY = matthews_corrcoef(_truth, _vote)
print(f"QUERY: F1 = {f1_kNN_QUERY}, BAS = {BAS_kNN_QUERY}, MCC: {MCC_kNN_QUERY}")

QUERY: F1 = 0.8803927453571126, BAS = 0.7261574074074074, MCC: 0.8348382051893278




In [38]:
### Add results to dataframe
# Appends one row; the values must follow the column order used when METRICdf was created.
METRICdf.loc[METRICdf.shape[0],:] = [integration_method,embSpace,n_embedded,n_Patient_embedded,batch,annotation,'kNN',
                                     f1_kNN_QUERY, BAS_kNN_QUERY,MCC_kNN_QUERY,f1_kNN_REFERENCE]

In [39]:
#### SAVE STUDY and FINAL MODEL AS PICKLE
# All three artefacts share one run tag; only the sub-folder and file prefix differ.
_run_tag = f"{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_{query_dataset}_kNN"
_study_path = os.path.join(mainDir, f"study_objects/BestParams_{_run_tag}.pkl")
_model_path = os.path.join(mainDir, f"best_models/{_run_tag}.pkl")
_pred_path = os.path.join(mainDir, f"label_predicted_dataframes/{_run_tag}.pkl")

# Create the output folders if needed, so a fresh checkout does not fail with FileNotFoundError.
for _path in (_study_path, _model_path, _pred_path):
    os.makedirs(os.path.dirname(_path), exist_ok=True)

joblib.dump(kNN_cell_type_studies, _study_path)
joblib.dump(clf_kNN_TRAINED, _model_path)

kNN_cellType_prediction_query_df.to_pickle(_pred_path)

### Saving metrics dataframe

In [40]:
# Save the metrics table, creating the output folder if needed so a fresh
# checkout does not fail with FileNotFoundError.
_metrics_path = os.path.join(mainDir, f"metrics_dataframes/METRICdf_{integration_method}_{embSpace}_{n_embedded}nEmb_{n_Patient_embedded}nPEmb_{batch}_{annotation}_opt{optimization_metric}_{query_dataset}.pkl")
os.makedirs(os.path.dirname(_metrics_path), exist_ok=True)
METRICdf.to_pickle(_metrics_path)

In [41]:
# Final metrics table: one row per classifier family evaluated above.
METRICdf

Unnamed: 0,integration_method,embedded_space,n_embedded,n_Patient_embedded,batchKeys,annotation,classification_method,VALIDATION_F1,VALIDATION_BAS,VALIDATION_MCC,VALIDATION_TRAIN_F1
0,scPoli,cell,50,100,chemistry,Level1n2,LinearSVC,0.971494,0.961846,0.947626,0.638615
1,scPoli,cell,50,100,chemistry,Level1n2,SVC,0.902794,0.792034,0.852067,0.605241
2,scPoli,cell,50,100,chemistry,Level1n2,kNN,0.880393,0.726157,0.834838,0.529699
