In [None]:
import numpy as np
import pandas as pd
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
import pickle
import joblib
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import warnings
# Blanket filter: every Python warning is ignored from this point on.
# NOTE(review): this also hides deprecation and "unused parameter" warnings
# emitted by the ML libraries; consider narrowing it by category or module.
warnings.filterwarnings('ignore')

In [None]:
# Load the VST-normalised expression matrix and the per-sample metadata.
# The notebook assumes vst_data has samples as rows and genes as columns, and
# that the metadata 'condition' column holds the class of each sample
# ('healthy' / 'unhealthy'). Both files use their first column as the index.
vst_data = pd.read_csv('data/normalised dataset/final_vst_normalised_data.csv', index_col=0)  # samples x genes
labels_raw = pd.read_csv('data/metadata/simple_metadata_for_pydeseq2.csv', index_col=0)['condition']  # condition column

# Basic data validation
assert vst_data.shape[0] == len(labels_raw), "Mismatch between samples and labels"

# scikit-learn pairs the rows of X with the entries of y by position, not by
# index, so equal lengths are not enough: the labels must be in the same sample
# order as the expression matrix. Verify every sample has a metadata row, then
# reorder the labels to follow vst_data.
if not labels_raw.index.equals(vst_data.index):
    unmatched_ids = vst_data.index[~vst_data.index.isin(labels_raw.index)]
    assert len(unmatched_ids) == 0, (
        f"{len(unmatched_ids)} sample(s) in the expression matrix have no metadata row, "
        f"e.g. {list(unmatched_ids[:5])}"
    )
    labels_raw = labels_raw.reindex(vst_data.index)

# Convert string labels to binary (healthy=0, unhealthy=1)
label_mapping = {'healthy': 0, 'unhealthy': 1}
labels = labels_raw.map(label_mapping)

# Check for missing values before checking the class set: .map() turns any
# condition that is not a key of label_mapping into NaN, and reporting that as
# "not binary" would hide the real cause.
unmapped_mask = labels.isnull()
assert not unmapped_mask.any(), (
    "Labels contain missing values: "
    f"{int(unmapped_mask.sum())} sample(s) have a condition outside {sorted(label_mapping)} "
    f"(values seen: {sorted(labels_raw[unmapped_mask].astype(str).unique())})"
)
assert set(labels.unique()) == {0, 1}, "Labels should be binary (0, 1) after encoding"

print(f"Data shape: {vst_data.shape}")
print(f"Labels distribution after encoding: {labels.value_counts()}")
print(f"Features (genes): {vst_data.shape[1]}")
print(f"Samples: {vst_data.shape[0]}")

In [None]:
# 'condition' is the name of the label column in the metadata; a column with
# that name inside the expression matrix is not a gene feature, so it is
# removed (with a notice) before any modelling.
has_condition_column = 'condition' in vst_data.columns
if has_condition_column:
    print("WARNING: Found a 'condition' column in the VST data. This is unexpected and will be dropped before optimization.")
    vst_data = vst_data.drop(columns=['condition'])
    print(f"Shape of VST data after dropping 'condition' column: {vst_data.shape}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import shap

# Baseline: fit an XGBoost classifier on two hand-picked genes, report hold-out
# accuracy, then inspect per-feature contributions with SHAP.

# Fixed hyperparameters for the baseline model.
# The stray `log=True` keyword was removed: XGBClassifier has no such
# parameter (it looks like a leftover from an Optuna
# `suggest_float(..., log=True)` call).
bst = XGBClassifier(
    n_estimators=250,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=4,
    reg_lambda=4,
    scale_pos_weight=1,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
    base_score=0.5,
)

# Ensembl gene IDs used as the only two input features.
selected_genes = ['ENSG00000000003', 'ENSG00000001629']

# The target must be the per-sample label vector itself. The previous
# `labels[labels]` indexed the Series with its own 0/1 values, which does not
# yield one target per sample in the order of the feature rows.
# `stratify=labels` keeps the class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    vst_data[selected_genes],
    labels,
    test_size=.2,
    random_state=42,
    stratify=labels,
)

bst.fit(X_train, y_train)

preds = bst.predict(X_test)

print(f"Test accuracy: {accuracy_score(y_test, preds):.3f}")

# SHAP attributions are computed on the held-out samples only.
explainer = shap.Explainer(bst)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)