In [1]:
CELL_TYPE = 'pDC'
N_TRIALS: int = 50
SPLIT_IDX = 0

In [15]:
import os
import sys
from pyprojroot.here import here

import pandas as pd
import anndata as ad
import numpy as np
import math
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

import optuna

import joblib
import pickle
import datetime

import collections

import xgboost
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as ssp
import joblib

from dotenv import load_dotenv

In [16]:
assert load_dotenv()

# LOAD DATASET

In [17]:
adata_ct = ad.read_h5ad(here(f'03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/data/{CELL_TYPE}_COMBAT2022_spectraGenes_log1p.h5ad'))

In [18]:
adata_ct

AnnData object with n_obs × n_vars = 1676 × 935
    obs: 'studyID', 'libraryID', 'sampleID', 'chemistry', 'disease', 'sex', 'binned_age', 'Level1', 'Level2', 'batch', 'COVID_severity'
    var: 'hgnc_id', 'symbol', 'locus_group', 'HUGO_status', 'highly_variable'
    uns: 'log1p'

In [19]:
adata_ct.obs.studyID.unique(), adata_ct.obs.Level1.unique()

(['COMBAT2022']
 Categories (1, object): ['COMBAT2022'],
 ['pDC']
 Categories (1, object): ['pDC'])

### Generate traint - test - validation splits (5-fold cross validation setting)

In [20]:
N_SPLITS = 5

In [21]:
left_out_splits = [s[1] for s in StratifiedGroupKFold(n_splits=N_SPLITS).split(X = adata_ct.obs.index, y=adata_ct.obs.disease, groups=adata_ct.obs.batch)]

In [22]:
test_split_n = SPLIT_IDX
validation_split_n = (SPLIT_IDX + 1) % N_SPLITS

test_idxs = left_out_splits[test_split_n]
val_idxs = left_out_splits[validation_split_n]

train_idxs = list()

for idx in np.setdiff1d(range(N_SPLITS), np.array([test_split_n,validation_split_n])):
    train_idxs.append(left_out_splits[idx])
train_idxs = np.concatenate(train_idxs)

In [23]:
train_idxs.shape, test_idxs.shape, val_idxs.shape, train_idxs.shape[0]+val_idxs.shape[0]+test_idxs.shape[0]

((1023,), (389,), (264,), 1676)

In [24]:
X_train = adata_ct[train_idxs,:].X
X_test = adata_ct[test_idxs, :].X
X_val = adata_ct[val_idxs, :].X
X_train.shape, X_test.shape, X_val.shape

((1023, 935), (389, 935), (264, 935))

In [25]:
y_train = adata_ct.obs.iloc[train_idxs].disease.values.astype(str)
y_test = adata_ct.obs.iloc[test_idxs].disease.values.astype(str)
y_val = adata_ct.obs.iloc[val_idxs].disease.values.astype(str)
y_train.shape, y_test.shape, y_val.shape 

((1023,), (389,), (264,))

In [26]:
np.unique(y_train), np.unique(y_val), np.unique(y_test)

(array(['COVID_CRIT', 'COVID_MILD', 'COVID_SEV', 'Flu', 'healthy'],
       dtype='<U10'),
 array(['COVID_CRIT', 'COVID_MILD', 'COVID_SEV', 'Flu', 'healthy'],
       dtype='<U10'),
 array(['COVID_CRIT', 'COVID_MILD', 'COVID_SEV', 'Flu', 'healthy'],
       dtype='<U10'))

In [27]:
lenc = LabelEncoder()
y_train_enc = lenc.fit_transform(y_train)
y_val_enc = lenc.transform(y_val)
y_test_enc = lenc.transform(y_test)

### GENERATE F1 

In [19]:
def custom_f1_score(y_true, y_pred):
    return -f1_score(y_true, y_pred.argmax(1), average='weighted')

In [20]:
eval_metric=custom_f1_score
eval_metric_name='custom_f1_score'

def objective(trial):
    params = {
        'sampling_method': 'gradient_based',
        'n_estimators': 1500,
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 250),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
    }
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, f'validation_0-{eval_metric_name}')
    xgb = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        early_stopping_rounds=20,
        callbacks=[pruning_callback],
        **params
    )
    xgb.fit(
        X_train, 
        y_train_enc, 
        verbose=0,
        eval_set=[(X_val, y_val_enc)],
    )
    trial.set_user_attr('best_iteration', xgb.best_iteration)

    return xgb.best_score

In [21]:
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True)

[I 2025-04-02 20:08:54,455] A new study created in memory with name: no-name-f2e3aa13-8bc5-4ecf-8bce-6a21c7775619
[I 2025-04-02 20:10:58,892] Trial 0 finished with value: -0.745543 and parameters: {'max_depth': 9, 'min_child_weight': 238, 'subsample': 0.7587945476302645, 'colsample_bynode': 0.6387926357773329, 'learning_rate': 0.0026368755339723046}. Best is trial 0 with value: -0.745543.


In [22]:
joblib.dump(study, here(f'03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/results/study/{SPLIT_IDX}_{CELL_TYPE}_xgboost.pkl'))

['/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis/results/study/0_Mono_xgboost.pkl']

In [23]:
n_estimators = int(study.best_trial.user_attrs['best_iteration']*1.2)
xgb = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        n_estimators=n_estimators,
        **study.best_trial.params
    )
xgb.fit(
    ssp.vstack((X_train, X_val)), 
    np.concatenate((y_train_enc, y_val_enc)), 
    eval_set=[(X_test, y_test_enc)],
    verbose=1,
)

[0]	validation_0-mlogloss:1.38401	validation_0-custom_f1_score:-0.69994
[1]	validation_0-mlogloss:1.38172	validation_0-custom_f1_score:-0.71104
[2]	validation_0-mlogloss:1.37942	validation_0-custom_f1_score:-0.72432
[3]	validation_0-mlogloss:1.37714	validation_0-custom_f1_score:-0.72384
[4]	validation_0-mlogloss:1.37491	validation_0-custom_f1_score:-0.72638
[5]	validation_0-mlogloss:1.37265	validation_0-custom_f1_score:-0.72637
[6]	validation_0-mlogloss:1.37039	validation_0-custom_f1_score:-0.72621
[7]	validation_0-mlogloss:1.36817	validation_0-custom_f1_score:-0.72612
[8]	validation_0-mlogloss:1.36591	validation_0-custom_f1_score:-0.72513
[9]	validation_0-mlogloss:1.36373	validation_0-custom_f1_score:-0.72547
[10]	validation_0-mlogloss:1.36148	validation_0-custom_f1_score:-0.72589
[11]	validation_0-mlogloss:1.35930	validation_0-custom_f1_score:-0.72487
[12]	validation_0-mlogloss:1.35717	validation_0-custom_f1_score:-0.72532
[13]	validation_0-mlogloss:1.35504	validation_0-custom_f1_sco

In [24]:
joblib.dump(xgb, (here(f'03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/results/best_model/{SPLIT_IDX}_{CELL_TYPE}_xgb.json')))

['/scratch_isilon/groups/singlecell/shared/projects/Inflammation-PBMCs-Atlas/03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis/results/best_model/0_Mono_xgb.json']

In [26]:
(pd.DataFrame(np.array((y_test, y_test_enc, xgb.predict(X_test))).T, columns=['y_true', 'y_true_code', 'y_pred'])
 .to_csv(here(f'03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/results/predictions/{SPLIT_IDX}_{CELL_TYPE}_pred_test.zip')))
(pd.DataFrame(np.array((y_train, y_train_enc, xgb.predict(X_train))).T, columns=['y_true', 'y_true_code', 'y_pred'])
 .to_csv(here(f'03_downstream_analysis/08_gene_importance/COMBAT_focus_analysis_COVIDseverity_NOsepsis/results/predictions/{SPLIT_IDX}_{CELL_TYPE}_pred_train.zip')))