In [14]:
# Run configuration for this notebook.
# Cell-type prefix used in input and output file names.
CELL_TYPE: str = "Mono"
# Number of Optuna trials to run.
N_TRIALS: int = 1
# Optional run label; an empty string means "no label".
RUN_NAME: str = "test"
# Base directory that relative result paths are joined onto.
BUCKET_DIRPATH: str = ""

In [15]:
# Turn a non-empty run label into a file-name prefix ("test" -> "test_").
# NOTE(review): this cell is not idempotent; re-running it appends another "_".
RUN_NAME = f"{RUN_NAME}_" if RUN_NAME != "" else RUN_NAME

In [2]:
import os
import sys
from pyprojroot.here import here

import pandas as pd
import anndata as ad
import numpy as np
import math
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

import optuna

import joblib
import pickle
import datetime

import collections

import xgboost
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as ssp
import joblib

from dotenv import load_dotenv

In [4]:
# Fail fast when load_dotenv() returns a falsy value.
# NOTE(review): presumably that means no .env file was found or it defined nothing; confirm against the python-dotenv docs.
# NOTE(review): assert statements are stripped under `python -O`, which would also skip the load_dotenv() call itself.
assert load_dotenv()

In [5]:
def here(fpath):
    """Return ``fpath`` joined onto the configured ``BUCKET_DIRPATH``.

    This definition shadows the ``here`` imported from ``pyprojroot.here``
    in the import cell, so every later ``here(...)`` call resolves against
    ``BUCKET_DIRPATH``.

    Parameters
    ----------
    fpath : str
        Path relative to ``BUCKET_DIRPATH``.

    Returns
    -------
    str
        ``os.path.join(BUCKET_DIRPATH, fpath)``.
    """
    base_dir = BUCKET_DIRPATH
    return os.path.join(base_dir, fpath)

# LOAD DATASET

In [6]:
# Both AnnData files are opened with the same reader options.
_H5AD_READ_KWARGS = dict(backed='r', chunk_size=25000)

# Cell-type specific object; its name is prefixed with CELL_TYPE.
adata_int_ct = ad.read_h5ad(
    f'{CELL_TYPE}_adataMerged_SPECTRAgenes.log1p.h5ad', **_H5AD_READ_KWARGS
)

# Second object, later sliced by rows and columns taken from adata_int_ct.
# NOTE(review): both paths are relative to the working directory and do not go through here(); confirm that is intended.
adata_unint = ad.read_h5ad(
    '04_MAIN_geneUniverse_noRBCnPlatelets.log1p.h5ad', **_H5AD_READ_KWARGS
)

In [12]:
# Integer positions of the adata_unint variables whose names also appear in adata_int_ct.
_shared_var_flags = adata_unint.var_names.isin(adata_int_ct.var_names)
var_names_mask = np.flatnonzero(_shared_var_flags)

In [16]:
# Saved index arrays for the train / validation / test splits, in that order.
_split_idx_paths = (
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_train_idxs.npy'),
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_val_idxs.npy'),
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_test_idxs.npy'),
)
train_idxs, val_idxs, test_idxs = (np.load(_path) for _path in _split_idx_paths)

In [17]:
# Shape of each split's index array, followed by the total number of indexed cells.
_split_shapes = tuple(_idxs.shape for _idxs in (train_idxs, val_idxs, test_idxs))
(*_split_shapes, sum(_shape[0] for _shape in _split_shapes))

((626471,), (217889,), (204789,), 1049149)

In [17]:
def _rows_in_unint(int_positions):
    """Map positional indices in ``adata_int_ct`` to row positions in ``adata_unint``.

    The observations at ``int_positions`` in ``adata_int_ct.obs`` are looked
    up by index label in ``adata_unint.obs``; the positions of the matching
    rows are returned in ascending order.
    """
    selected_labels = adata_int_ct.obs.iloc[int_positions].index
    return adata_unint.obs.index.isin(selected_labels).nonzero()[0]


mask_train = _rows_in_unint(train_idxs)
mask_val = _rows_in_unint(val_idxs)
mask_test = _rows_in_unint(test_idxs)

In [18]:
# Slice the adata_unint matrix down to each split's rows (mask_*) and to the
# columns selected by var_names_mask.
# NOTE(review): adata_unint was opened with backed='r', so these reads presumably pull the data from disk into memory; confirm the memory footprint.
X_train = adata_unint.X[mask_train, var_names_mask]
X_val = adata_unint.X[mask_val, var_names_mask]
X_test = adata_unint.X[mask_test, var_names_mask]
# Display the three shapes as a sanity check.
X_train.shape, X_val.shape, X_test.shape

((626471, 935), (217889, 935), (204789, 935))

In [19]:
def _disease_labels(row_positions):
    """Return the ``disease`` column of ``adata_unint.obs`` at ``row_positions`` as strings."""
    return adata_unint.obs.iloc[row_positions].disease.values.astype(str)


y_train = _disease_labels(mask_train)
y_test = _disease_labels(mask_test)
y_val = _disease_labels(mask_val)
# Display the label-vector shapes (train, test, val) as a sanity check.
y_train.shape, y_test.shape, y_val.shape

((626471,), (204789,), (217889,))

In [45]:
# The encoder is fitted on the training labels only; validation and test labels
# are then mapped with that same fitted encoder.
lenc = LabelEncoder()
y_train_enc = lenc.fit_transform(y_train)
y_val_enc, y_test_enc = (lenc.transform(_labels) for _labels in (y_val, y_test))

### GENERATE F1 

In [46]:
def custom_f1_score(y_true, y_pred, sample_weight=None):
    """Negated weighted F1 score, usable as an XGBoost custom ``eval_metric``.

    The score is negated so that a better model yields a smaller value,
    which matches the ``direction='minimize'`` study in this notebook.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Encoded ground-truth class labels.
    y_pred : array-like
        Either a 2-D ``(n_samples, n_classes)`` score matrix, in which case
        the predicted class is the arg-max of each row, or a 1-D vector of
        positive-class probabilities (binary objectives), in which case the
        predicted class is ``score > 0.5``.
    sample_weight : array-like of shape (n_samples,), optional
        Per-sample weights forwarded to ``f1_score``. The XGBoost
        scikit-learn wrapper passes this keyword when the evaluation set
        carries weights.

    Returns
    -------
    float
        ``-f1_score(y_true, predicted_labels, average='weighted')``.
    """
    scores = np.asarray(y_pred)
    if scores.ndim == 1:
        # Binary objectives hand over one probability per sample, so there is
        # no class axis to arg-max over.
        predicted_labels = (scores > 0.5).astype(int)
    else:
        predicted_labels = scores.argmax(1)

    if sample_weight is None:
        return -f1_score(y_true, predicted_labels, average='weighted')
    return -f1_score(
        y_true, predicted_labels, average='weighted', sample_weight=sample_weight
    )

In [47]:
# Metric callable handed to XGBoost, and the name used to build the
# 'validation_0-<name>' key that the pruning callback observes.
eval_metric = custom_f1_score
eval_metric_name = 'custom_f1_score'


def objective(trial):
    """Optuna objective: fit one XGBoost classifier and return its best validation score.

    Hyper-parameters are drawn from ``trial``; the classifier is fitted on
    ``X_train`` / ``y_train_enc`` with early stopping (20 rounds) against
    ``X_val`` / ``y_val_enc``. The best iteration is stored on the trial as the
    user attribute ``'best_iteration'``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to report pruning values.

    Returns
    -------
    float
        ``best_score`` of the fitted classifier.
    """
    # The suggest_* calls keep a fixed order so a seeded sampler draws the
    # same sequence of values.
    search_space = dict(
        sampling_method='gradient_based',
        n_estimators=1500,
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 250),
        subsample=trial.suggest_float('subsample', 0.1, 1.0),
        colsample_bynode=trial.suggest_float('colsample_bynode', 0.1, 1.0),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
    )

    observed_key = f'validation_0-{eval_metric_name}'
    pruner_cb = optuna.integration.XGBoostPruningCallback(trial, observed_key)

    classifier = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        early_stopping_rounds=20,
        callbacks=[pruner_cb],
        **search_space
    )
    classifier.fit(
        X_train,
        y_train_enc,
        eval_set=[(X_val, y_val_enc)],
        verbose=0,
    )

    # Kept so a later cell can size the final model from the stopping point.
    trial.set_user_attr('best_iteration', classifier.best_iteration)
    return classifier.best_score

In [49]:
# Seeded TPE sampler; the study minimizes the value returned by objective().
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='minimize')

# Each trial is logged to Weights & Biases under a run named after RUN_NAME and CELL_TYPE.
wandb_kwargs = dict(project="xgboost_unintegrated", name=f"{RUN_NAME}{CELL_TYPE}")
wandbc = optuna.integration.WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)

study.optimize(
    objective,
    n_trials=N_TRIALS,
    callbacks=[wandbc],
    gc_after_trial=True,
)

[I 2024-06-27 16:04:38,921] A new study created in memory with name: no-name-d26adfd8-a10c-4a48-9746-ae4ed6768c6c
  wandbc = optuna.integration.WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs)


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

[W 2024-06-27 16:05:45,811] Trial 0 failed with parameters: {'max_depth': 9, 'min_child_weight': 238, 'subsample': 0.7587945476302645, 'colsample_bynode': 0.6387926357773329, 'learning_rate': 0.0026368755339723046} because of the following error: XGBoostError('[16:05:45] /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/tree/updater_gpu_hist.cu:781: Exception in gpu_hist: [16:05:45] /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/c_api/../data/../common/device_helpers.cuh:431: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory\n- Free memory: 1552678912\n- Requested memory: 5351158672\n\nStack trace:\n  [bt] (0) /opt/conda/envs/xgboostt/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x71) [0x7f5e60641cf1]\n  [bt] (1) /opt/conda/envs/xgboostt/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&

XGBoostError: [16:05:45] /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/tree/updater_gpu_hist.cu:781: Exception in gpu_hist: [16:05:45] /home/conda/feedstock_root/build_artifacts/xgboost-split_1713397827678/work/src/c_api/../data/../common/device_helpers.cuh:431: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory
- Free memory: 1552678912
- Requested memory: 5351158672

Stack trace:
  [bt] (0) /opt/conda/envs/xgboostt/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x71) [0x7f5e60641cf1]
  [bt] (1) /opt/conda/envs/xgboostt/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x466) [0x7f5e60bfc7d6]
  [bt] (2) /opt/conda/envs/xgboostt/lib/libxgboost.so(+0x2cd5d1) [0x7f5e605dc5d1]
  [bt] (3) /opt/conda/envs/xgboostt/lib/libxgboost.so(thrust::detail::vector_base<unsigned char, dh::detail::XGBDefaultDeviceAllocatorImpl<unsigned char> >::fill_insert(thrust::detail::normal_iterator<thrust::device_ptr<unsigned char> >, unsigned long, unsigned char const&)+0x4c8) [0x7f5e60c52408]
  [bt] (4) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::HostDeviceVector<unsigned char>::Resize(unsigned long, unsigned char)+0x1bb) [0x7f5e60c52ddb]
  [bt] (5) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::EllpackPageImpl::EllpackPageImpl(xgboost::Context const*, xgboost::GHistIndexMatrix const&, xgboost::common::Span<xgboost::FeatureType const, 18446744073709551615ul>)+0x36f) [0x7f5e60cc67bf]
  [bt] (6) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::data::IterativeDMatrix::GetEllpackBatches(xgboost::Context const*, xgboost::BatchParam const&)+0x58e) [0x7f5e60cf279e]
  [bt] (7) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::tree::GradientBasedSampling::Sample(xgboost::Context const*, xgboost::common::Span<xgboost::detail::GradientPairInternal<float>, 18446744073709551615ul>, xgboost::DMatrix*)+0x105) [0x7f5e60f8dad5]
  [bt] (8) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::tree::GradientBasedSampler::Sample(xgboost::Context const*, xgboost::common::Span<xgboost::detail::GradientPairInternal<float>, 18446744073709551615ul>, xgboost::DMatrix*)+0x9d) [0x7f5e60f8537d]



Stack trace:
  [bt] (0) /opt/conda/envs/xgboostt/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x71) [0x7f5e60641cf1]
  [bt] (1) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::tree::GPUHistMaker::Update(xgboost::tree::TrainParam const*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, xgboost::common::Span<xgboost::HostDeviceVector<int>, 18446744073709551615ul>, std::vector<xgboost::RegTree*, std::allocator<xgboost::RegTree*> > const&)+0x3e9) [0x7f5e60fc77b9]
  [bt] (2) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::gbm::GBTree::BoostNewTrees(xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::DMatrix*, int, std::vector<xgboost::HostDeviceVector<int>, std::allocator<xgboost::HostDeviceVector<int> > >*, std::vector<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> >, std::allocator<std::unique_ptr<xgboost::RegTree, std::default_delete<xgboost::RegTree> > > >*)+0x821) [0x7f5e6090b1d1]
  [bt] (3) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::gbm::GBTree::DoBoost(xgboost::DMatrix*, xgboost::HostDeviceVector<xgboost::detail::GradientPairInternal<float> >*, xgboost::PredictionCacheEntry*, xgboost::ObjFunction const*)+0x50d) [0x7f5e6090c1fd]
  [bt] (4) /opt/conda/envs/xgboostt/lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, std::shared_ptr<xgboost::DMatrix>)+0x353) [0x7f5e60954673]
  [bt] (5) /opt/conda/envs/xgboostt/lib/libxgboost.so(XGBoosterUpdateOneIter+0x74) [0x7f5e60624834]
  [bt] (6) /opt/conda/envs/xgboostt/lib/python3.12/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7f5f1d00ca4a]
  [bt] (7) /opt/conda/envs/xgboostt/lib/python3.12/lib-dynload/../../libffi.so.8(+0x5fea) [0x7f5f1d00bfea]
  [bt] (8) /opt/conda/envs/xgboostt/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x13509) [0x7f5f1d7ba509]



In [None]:
# Persist the Optuna study object to disk.
_study_path = here(f'03_Downstream_Analysis/05_SHAP/results/99_unintegrated/study/{RUN_NAME}{CELL_TYPE}_xgboost.pkl')
joblib.dump(study, _study_path)

In [None]:
# best_iteration is a 0-based index, so the best model contains
# best_iteration + 1 trees. Scaling the raw index undercounts the trees and
# gives n_estimators == 0 when the best round was the very first one.
best_n_trees = study.best_trial.user_attrs['best_iteration'] + 1
# 20% more trees than the early-stopped optimum, because the final model is
# fitted on train + validation data together.
n_estimators = int(best_n_trees * 1.2)

xgb = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        n_estimators=n_estimators,
        # Fixed (not sampled) during tuning, so it is missing from
        # best_trial.params; the tuned subsample value was selected under it.
        sampling_method='gradient_based',
        **study.best_trial.params
    )
# The test split is passed as eval_set for monitoring only; no early stopping
# is configured on this estimator.
xgb.fit(
    ssp.vstack((X_train, X_val)), 
    np.concatenate((y_train_enc, y_val_enc)), 
    eval_set=[(X_test, y_test_enc)],
    verbose=1,
)

In [None]:
# Persist the fitted classifier with joblib.
# NOTE(review): the file name ends in '.json' but it is written by joblib.dump, not by the XGBoost JSON exporter; confirm downstream readers load it with joblib.
_model_path = here(f'03_Downstream_Analysis/05_SHAP/results/99_unintegrated/best_model/{RUN_NAME}{CELL_TYPE}_xgb.json')
joblib.dump(xgb, _model_path)

In [None]:
# One (labels, encoded labels, features, output path) entry per exported split;
# the test split is written first, then the training split.
_prediction_jobs = (
    (y_test, y_test_enc, X_test,
     here(f'03_Downstream_Analysis/05_SHAP/results/99_unintegrated/predictions/{RUN_NAME}{CELL_TYPE}_pred_test.zip')),
    (y_train, y_train_enc, X_train,
     here(f'03_Downstream_Analysis/05_SHAP/results/99_unintegrated/predictions/{RUN_NAME}{CELL_TYPE}_pred_train.zip')),
)

for _labels, _codes, _features, _out_path in _prediction_jobs:
    # Columns: original label, encoded label, model prediction.
    _stacked = np.array((_labels, _codes, xgb.predict(_features))).T
    _table = pd.DataFrame(_stacked, columns=['y_true', 'y_true_code', 'y_pred'])
    _table.to_csv(_out_path)