In [1]:
# Notebook-level configuration; every value below is read by later cells.
# NOTE(review): this looks like a parameters cell meant to be overridden by a
# notebook runner — confirm.

# Cell-type label; embedded in the input .h5ad file name and in every output file name.
CELL_TYPE: str = 'Mono'
# Number of folds requested from StratifiedGroupKFold; one fold becomes the
# validation set, another the test set, and the rest the training set.
N_SPLITS: int = 5
# Number of Optuna trials passed to study.optimize.
N_TRIALS: int = 1
# Optional run label; the next cell appends "_" so it can prefix output file names.
RUN_NAME: str = "run1"
# Base directory that here() joins output paths onto; "" leaves paths relative.
BUCKET_DIRPATH: str = ""

In [2]:
# Append a separator so RUN_NAME can be used directly as a file-name prefix;
# an empty RUN_NAME stays empty so output names get no leading underscore.
# NOTE: re-running this cell without re-running the config cell appends another "_".
RUN_NAME = f"{RUN_NAME}_" if RUN_NAME != "" else RUN_NAME

In [None]:
import os
import sys
#from pyprojroot.here import here

import pandas as pd
import anndata as ad
import numpy as np
import math
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

import optuna

import joblib
import pickle
import datetime

import collections

import xgboost
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as ssp
import joblib

from dotenv import load_dotenv

In [None]:
# Load variables from a .env file into the process environment.
# The call is made outside the assert so that it still runs when assertions
# are disabled (python -O); the assert only checks the reported outcome.
dotenv_loaded = load_dotenv()
assert dotenv_loaded, "load_dotenv() returned False: no .env file was found or no variables were set"

In [None]:
def here(fpath):
    """Return `fpath` joined onto the configured `BUCKET_DIRPATH` base directory.

    With the default empty `BUCKET_DIRPATH`, `os.path.join` returns `fpath`
    unchanged, so paths stay relative to the current working directory.
    """
    base_dir = BUCKET_DIRPATH
    return os.path.join(base_dir, fpath)

# LOAD DATASET

In [None]:
# Open the merged AnnData file for the configured cell type in read-only backed
# mode (backed='r'). The path is relative to the working directory and is not
# routed through here().
adata_path = f'{CELL_TYPE}_adataMerged_SPECTRAgenes.log1p.h5ad'
adata = ad.read_h5ad(adata_path, backed='r', chunk_size=25000)

In [None]:
# Display the per-cell metadata table; later cells read its `disease` and
# `sampleID` columns to build the splits.
adata.obs

### EXTRACT INDEXES FOR DATA SPLITTING (STRATIFIED BY DISEASE, GROUPED BY PATIENT)

### EXTRACT CELL COUNTS PER SPLIT (TO MAKE SURE THAT TRAIN:VAL:TEST RATIOS RESEMBLE 60:20:20, I.E. ONE FOLD EACH FOR VALIDATION AND TEST WHEN N_SPLITS = 5)

In [None]:
# Row positions of every cell; folds are expressed as positions into adata.obs.
all_idxs = np.arange(adata.obs.shape[0])

# Keep only the held-out part of each fold: N_SPLITS index arrays, stratified
# by disease, with sampleID used as the grouping key.
fold_splitter = StratifiedGroupKFold(n_splits=N_SPLITS)
left_out_splits = [
    held_out
    for _, held_out in fold_splitter.split(all_idxs, adata.obs.disease, adata.obs.sampleID)
]

# For every (validation fold, test fold) pair, record the smallest per-disease
# fraction of cells that lands in the training set and in the test set.
# NOTE(review): the test-fold range starts at 1, so fold 0 is never tried as
# the test set — confirm this is intended.
min_stats = []
for val_fold_idx, test_fold_idx in product(range(N_SPLITS), range(1, N_SPLITS)):
    if val_fold_idx == test_fold_idx:
        continue

    val_idxs = left_out_splits[val_fold_idx]
    test_idxs = left_out_splits[test_fold_idx]
    held_out_idxs = np.union1d(val_idxs, test_idxs)
    # Training cells are everything not held out for validation or test.
    train_idxs = np.delete(all_idxs, held_out_idxs)

    # One long-format count table per split, tagged with the split name.
    per_split_counts = [
        adata.obs.iloc[split_idxs]
        .groupby('disease', observed=False)
        .size()
        .to_frame(name='cell_count')
        .reset_index()
        .assign(split=split_name)
        for split_name, split_idxs in (('train', train_idxs), ('val', val_idxs), ('test', test_idxs))
    ]
    cell_count_df = pd.concat(per_split_counts).pivot(index='disease', columns='split', values='cell_count')

    # Convert counts to per-disease fractions across the three splits.
    split_fractions = cell_count_df.div(cell_count_df.sum(axis=1), axis=0)
    cell_plot_data = split_fractions.reset_index().melt(id_vars='disease')

    train_min = cell_plot_data.query('split=="train"').value.min()
    test_min = cell_plot_data.query('split=="test"').value.min()
    min_stats.append((val_fold_idx, test_fold_idx, train_min, test_min))

# RANK CANDIDATE SPLITS BY THE SMALLEST PER-DISEASE CELL FRACTION IN THE TRAINING AND TEST SETS: the highest-ranked split best fits the expected ratios

In [None]:
# One row per candidate (val fold, test fold) pair.
# NOTE: the 'test_mean' column holds the minimum per-disease test fraction
# collected above, not a mean; the name is kept as is.
split_rankings_df = pd.DataFrame(min_stats, columns=['val_idx', 'test_idx', 'train_min', 'test_mean'])

# Scale both statistics by their maximum so they are comparable.
split_rankings_df['train_min'] = split_rankings_df['train_min'] / split_rankings_df['train_min'].max()
split_rankings_df['test_mean'] = split_rankings_df['test_mean'] / split_rankings_df['test_mean'].max()

# The ranking score is the product of the two scaled statistics; sorted ascending,
# so the best candidate is the last row.
split_rankings_df['ranking'] = split_rankings_df['test_mean'] * split_rankings_df['train_min']
split_rankings_df = split_rankings_df.sort_values('ranking')
split_rankings_df

In [None]:
# Pick the candidate with the highest ranking score and rebuild its index arrays.
best_split = split_rankings_df.loc[split_rankings_df['ranking'].idxmax()]

# The row is extracted as a single Series, so the fold ids are cast back to
# int before they are used as list indices.
best_val_fold = best_split['val_idx'].astype(int)
best_test_fold = best_split['test_idx'].astype(int)

all_idxs = np.arange(adata.obs.shape[0])
val_idxs = left_out_splits[best_val_fold]
test_idxs = left_out_splits[best_test_fold]
held_out_idxs = np.union1d(val_idxs, test_idxs)
train_idxs = np.delete(all_idxs, held_out_idxs)

### CHECK NUMBER OF CELLS PER SPLIT

In [None]:
# Summarise the chosen split: per-disease cell counts and patient counts in
# each of train / val / test, plus the set of sampleIDs in each split.
split_indices = (('train', train_idxs), ('val', val_idxs), ('test', test_idxs))

# Cells per disease and split (wide: one row per disease, one column per split).
cell_count_df = pd.concat([
    adata.obs.iloc[split_idxs]
    .groupby('disease', observed=False)
    .size()
    .to_frame(name='cell_count')
    .reset_index()
    .assign(split=split_name)
    for split_name, split_idxs in split_indices
]).pivot(index='disease', columns='split', values='cell_count')
# Long-format per-disease fractions, as consumed by the bar plot.
cell_plot_data = cell_count_df.div(cell_count_df.sum(axis=1), axis=0).reset_index().melt(id_vars='disease')

# Distinct sampleIDs per disease and split.
pat_count_df = pd.concat([
    adata.obs.iloc[split_idxs]
    .groupby('disease', observed=False)
    .sampleID.nunique()
    .to_frame(name='patient_count')
    .reset_index()
    .assign(split=split_name)
    for split_name, split_idxs in split_indices
]).pivot(index='disease', columns='split', values='patient_count')
# Normalise once to per-disease fractions; the rows then already sum to 1, so
# no second normalisation is needed before melting.
pat_count_df = pat_count_df.div(pat_count_df.sum(axis=1), axis=0)
pat_plot_data = pat_count_df.reset_index().melt(id_vars='disease')

# sampleIDs present in each split, used by the leakage check in the next cell.
train_pats = set(adata.obs.iloc[train_idxs].sampleID)
test_pats = set(adata.obs.iloc[test_idxs].sampleID)
val_pats = set(adata.obs.iloc[val_idxs].sampleID)

In [None]:
# Guard against patient leakage: no sampleID may occur in more than one split.
# isdisjoint is used because comparing a set with `{}` (an empty dict literal)
# is always unequal, so a `!= {}` check can never fail.
assert val_pats.isdisjoint(test_pats), "validation and test splits share sampleIDs"
assert train_pats.isdisjoint(test_pats), "training and test splits share sampleIDs"
assert train_pats.isdisjoint(val_pats), "training and validation splits share sampleIDs"

### PLOT PATIENT AND CELL DISTRIBUTION ACROSS TRAIN, TEST, AND VALIDATION SETS PER DISEASE
### CHECK THAT THEY APPROXIMATE EXPECTATION

In [None]:
# Two side-by-side bar charts: per-disease fraction of cells (left) and of
# patients (right) in each split.
fig, (cell_ax, pat_ax) = plt.subplots(1, 2, figsize=(20, 6))
panels = (
    (cell_ax, cell_plot_data, 'Cell distribution'),
    (pat_ax, pat_plot_data, 'Patients distribution'),
)
for ax, plot_data, title in panels:
    sns.barplot(x='disease', y='value', hue='split', data=plot_data, ax=ax)
    ax.set_title(title)
    ax.tick_params(axis='x', labelrotation=45)
# The target directory is not created here and must already exist.
plt.savefig(here(f'03_Downstream_Analysis/05_SHAP/results/01_split_plots/{RUN_NAME}{CELL_TYPE}_split_quality.pdf'), bbox_inches='tight', dpi=300)

In [None]:
# Persist the row positions of each split so later steps can reload the exact
# same partition.
split_idx_files = {
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_train_idxs.npy'): train_idxs,
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_val_idxs.npy'): val_idxs,
    here(f'03_Downstream_Analysis/05_SHAP/results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_test_idxs.npy'): test_idxs,
}
for idx_path, split_idxs in split_idx_files.items():
    np.save(idx_path, split_idxs)

### SUBSET DATASET INTO TRAIN/TEST/VAL SPLITS

In [None]:
# Slice the expression matrix once per split (same order as before: train, test, val)
# and show the resulting shapes as a sanity check.
X_train, X_test, X_val = (
    adata.X[split_idxs] for split_idxs in (train_idxs, test_idxs, val_idxs)
)
X_train.shape, X_test.shape, X_val.shape

In [None]:
# Disease label of every cell in each split, converted to plain strings.
y_train, y_test, y_val = (
    adata.obs.iloc[split_idxs].disease.values.astype(str)
    for split_idxs in (train_idxs, test_idxs, val_idxs)
)
y_train.shape, y_test.shape, y_val.shape

In [None]:
# Fit the label -> integer mapping on the training labels only, then apply the
# same mapping to the validation and test labels.
lenc = LabelEncoder()
y_train_enc = lenc.fit_transform(y_train)
y_val_enc, y_test_enc = (lenc.transform(labels) for labels in (y_val, y_test))

### DEFINE THE F1-BASED EVALUATION METRIC

In [None]:
def custom_f1_score(y_true, y_pred):
    """Return the negated weighted F1 score of `y_pred` against `y_true`.

    `y_pred` is reduced to one predicted label per row with `argmax` over
    axis 1. The score is negated so that a minimiser prefers a higher F1.
    """
    predicted_labels = y_pred.argmax(1)
    weighted_f1 = f1_score(y_true, predicted_labels, average='weighted')
    return -weighted_f1

In [None]:
# Metric callable handed to XGBoost, and the name under which the pruning
# callback looks up its validation values.
eval_metric = custom_f1_score
eval_metric_name = 'custom_f1_score'


def objective(trial):
    """Train one XGBoost classifier with trial-suggested hyperparameters.

    Fits on the training split, evaluates on the validation split with early
    stopping, stores the best iteration on the trial as the `best_iteration`
    user attribute, and returns the model's `best_score` (the study minimises it).
    """
    # Settings shared by every trial.
    fixed_params = {
        'sampling_method': 'gradient_based',
        'n_estimators': 1500,
    }
    # Search space; the suggest calls are kept in their original order.
    tuned_params = {
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 250),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 5e-1, log=True),
    }
    # Lets Optuna stop unpromising trials based on the validation metric.
    pruner = optuna.integration.XGBoostPruningCallback(trial, f'validation_0-{eval_metric_name}')
    model = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        early_stopping_rounds=20,
        callbacks=[pruner],
        **fixed_params,
        **tuned_params,
    )
    model.fit(
        X_train,
        y_train_enc,
        verbose=0,
        eval_set=[(X_val, y_val_enc)],
    )
    # Remembered so the final model can size n_estimators from it.
    trial.set_user_attr('best_iteration', model.best_iteration)

    return model.best_score

In [None]:
# Seeded TPE sampler; the study minimises the objective's return value.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction='minimize')

# Report each trial to Weights & Biases under a run named after this notebook run.
wandbc = optuna.integration.WeightsAndBiasesCallback(
    wandb_kwargs={"project": "xgboost", "name": f"{RUN_NAME}{CELL_TYPE}"},
)
study.optimize(objective, n_trials=N_TRIALS, gc_after_trial=True, callbacks=[wandbc])

In [None]:
# Persist the finished Optuna study object with joblib so it can be reloaded later.
# The target directory is not created here and must already exist.
joblib.dump(study, here(f'03_Downstream_Analysis/05_SHAP/results/03_xgboost/study/{RUN_NAME}{CELL_TYPE}_xgboost.pkl'))

In [None]:
# Build Optuna's parallel-coordinate figure for the study and export it as a PDF.
# NOTE(review): write_image presumably needs a static-image export backend
# (e.g. kaleido) installed in the environment — confirm.
fig = optuna.visualization.plot_parallel_coordinate(study)
fig.write_image(here(f'03_Downstream_Analysis/05_SHAP/results/03_xgboost/parallel_coordinate/{RUN_NAME}{CELL_TYPE}_pcoord.pdf'))

In [None]:
# Refit the best configuration on train + validation data.
# The tree budget is the best trial's early-stopped iteration plus 20% headroom;
# it is clamped to at least one tree so a best_iteration of 0 cannot produce an
# empty model.
best_iteration = study.best_trial.user_attrs['best_iteration']
n_estimators = max(1, int(best_iteration * 1.2))
xgb = xgboost.XGBClassifier(
        device='gpu',
        eval_metric=eval_metric,
        n_estimators=n_estimators,
        # Fixed (non-suggested) setting used by every tuning trial. It is not part
        # of best_trial.params, so it must be passed explicitly for the final model
        # to use the same sampling scheme the hyperparameters were tuned under.
        sampling_method='gradient_based',
        **study.best_trial.params
    )
# The test split is passed as eval_set only to log the metric per round;
# no early stopping is configured for this fit.
xgb.fit(
    ssp.vstack((X_train, X_val)), 
    np.concatenate((y_train_enc, y_val_enc)), 
    eval_set=[(X_test, y_test_enc)],
    verbose=1,
)

In [None]:
# Serialize the fitted classifier with joblib.
# NOTE(review): joblib.dump writes a pickle, so the '.json' suffix does not
# describe the file's format. Readers must use joblib.load; writing real JSON
# would need xgb.save_model instead. Confirm what downstream consumers expect.
joblib.dump(xgb, (here(f'03_Downstream_Analysis/05_SHAP/results/03_xgboost/best_model/{RUN_NAME}{CELL_TYPE}_xgb.json')))

In [None]:
# Export true label, encoded label and predicted class for the test split and
# then the training split, one CSV per split (written in that order).
# NOTE: the model was refit on train + validation, so the training predictions
# are in-sample.
prediction_exports = (
    (y_test, y_test_enc, X_test,
     here(f'03_Downstream_Analysis/05_SHAP/results/03_xgboost/predictions/{RUN_NAME}{CELL_TYPE}_pred_test.zip')),
    (y_train, y_train_enc, X_train,
     here(f'03_Downstream_Analysis/05_SHAP/results/03_xgboost/predictions/{RUN_NAME}{CELL_TYPE}_pred_train.zip')),
)
for labels, label_codes, features, out_path in prediction_exports:
    # Stack the three per-cell vectors and transpose so each row is one cell.
    prediction_table = np.array((labels, label_codes, xgb.predict(features))).T
    pd.DataFrame(prediction_table, columns=['y_true', 'y_true_code', 'y_pred']).to_csv(out_path)

In [6]:
# Re-open the merged AnnData file in read-only backed mode for the
# post-hoc inspection of the test predictions below.
# NOTE(review): this reads from 'data/...' whereas the first load cell reads the
# same file name without a directory prefix, and the execution counts from here
# on are non-sequential — these cells look like they were run in a separate
# session; confirm the intended path.
adata = ad.read_h5ad(
    f'data/{CELL_TYPE}_adataMerged_SPECTRAgenes.log1p.h5ad',
    backed='r',
    chunk_size=25000
)

In [12]:
# Reload the saved test-split row positions.
# NOTE(review): the save cell writes to
# here('03_Downstream_Analysis/05_SHAP/results/02_preprocessing/...'), while this
# reads 'results/02_preprocessing/...' — the two only match for a specific
# working directory; confirm.
test_idxs = np.load(f'results/02_preprocessing/{RUN_NAME}{CELL_TYPE}_test_idxs.npy')

In [33]:
# Load the exported test-set predictions, keeping only the three data columns.
# NOTE(review): the export cell writes under '.../results/03_xgboost/predictions/',
# but this path has an extra 'targetY_disease' directory — confirm which location
# is current.
predictions = pd.read_csv(f'results/targetY_disease/03_xgboost/predictions/{RUN_NAME}{CELL_TYPE}_pred_test.zip')[['y_true','y_true_code','y_pred']]
predictions.head()

Unnamed: 0,y_true,y_true_code,y_pred
0,healthy,18,13
1,healthy,18,13
2,healthy,18,13
3,healthy,18,13
4,healthy,18,13


In [39]:
# Attach per-cell metadata for the test split to the predictions, row by row.
meta_cols = ['sampleID', 'sex', 'disease']
test_obs = adata.obs.iloc[test_idxs].reset_index()
predictions[meta_cols] = test_obs[meta_cols]

# The disease column is only a consistency check: it must agree with the
# exported y_true for every row, after which it is redundant and dropped.
assert (predictions.y_true == predictions.disease).all()
predictions.drop(columns='disease', inplace=True)

In [40]:
# Display the predictions table with the attached sampleID and sex columns.
predictions

Unnamed: 0,y_true,y_true_code,y_pred,sampleID,sex
0,healthy,18,13,Perez2022_IGTB1290_T0,female
1,healthy,18,13,Perez2022_IGTB1290_T0,female
2,healthy,18,13,Perez2022_IGTB1290_T0,female
3,healthy,18,13,Perez2022_IGTB1290_T0,female
4,healthy,18,13,Perez2022_IGTB1290_T0,female
...,...,...,...,...,...
204784,SLE,13,13,Perez2022_1771_T0,female
204785,SLE,13,13,Perez2022_1771_T0,female
204786,SLE,13,13,Perez2022_1771_T0,female
204787,SLE,13,13,Perez2022_1771_T0,female
