# Classify B-ALL

- After every kernel restart, rerun the "Core" section.
- It's best to restart the kernel after each training run.

## Core (Always run)

Imports

In [None]:
import cudf
import cuml

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import time
import random

import shap

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc
from sklearn.utils import shuffle, resample
from xgboost import XGBClassifier, DMatrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

Global Variables

In [None]:
# Directory prefix that is prepended to every parquet file name loaded below.
path_to_data = "data/"

# Timestamp (seconds since the epoch) captured when this core cell is executed.
start_time = time.time()

### Preprocess

Load Datasets

In [None]:
# Read the B-ALL table and the healthy table, in that order, with cudf.
# According to the original note, sample names are stored as columns.
df_b_all, df_b_all_healthy = map(
    cudf.read_parquet,
    (f"{path_to_data}B_ALL.pq", f"{path_to_data}B_ALL_HEALTHY.pq"),
)

In [None]:
# Show the first rows of the healthy table as a quick visual check of the load.
df_b_all_healthy.head()

#### Merging

In [None]:
# Build a single table from the B-ALL and healthy cohorts and attach the class
# label in a 'condition' column (1 = B-ALL, 0 = healthy).
#
# Steps: keep protein-coding genes, drop the annotation columns, fill NaN with 0,
# keep numeric columns only, transpose, stack both cohorts, then label the rows.

# Alternative cohort loads, kept for reference (not used in this run):
# df_b_all = cudf.read_parquet(f"{path_to_data}B_ALL.pq") # Sample names is column
# df_b_all_healthy = cudf.read_parquet(f"{path_to_data}B_ALL_healthy.pq") # Sample names is column
# df_mixed_all = cudf.read_parquet(f"{path_to_data}ALL.pq") # Sample names is column
# df_aml_all = cudf.read_parquet(f"{path_to_data}AML.pq") # Sample names is column

# Column counts of each table, excluding the two annotation columns.
b_all_length = len(df_b_all.columns.drop(['gene_name', 'gene_type']))  # Exclude non-numeric columns
b_all_healthy_length = len(df_b_all_healthy.columns.drop(['gene_name', 'gene_type']))  # Exclude non-numeric columns
# mixed_all_length = len(df_mixed_all.columns.drop(['gene_name', 'gene_type']))  # Exclude non-numeric columns
# aml_all_length = len(df_aml_all.columns.drop(['gene_name', 'gene_type']))  # Exclude non-numeric columns
# total_length = b_all_length + mixed_all_length + aml_all_length
# total_length = b_all_length + mixed_all_length
total_length = b_all_length + b_all_healthy_length

print("B-ALL length:", len(df_b_all))
print("B-ALL Healthy length:", len(df_b_all_healthy))

df_b_all_filtered = df_b_all[df_b_all['gene_type'] == 'protein_coding']  # Filter for protein-coding genes
df_b_all_healthy_filtered = df_b_all_healthy[df_b_all_healthy['gene_type'] == 'protein_coding']  # Filter for protein-coding genes

print("Filtered B-ALL length:", len(df_b_all_filtered))
print("Filtered B-ALL Healthy length:", len(df_b_all_healthy_filtered))

df_b_all_filtered = df_b_all_filtered.drop(['gene_name', 'gene_type'], axis=1)  # Drop non-numeric columns
df_b_all_healthy_filtered = df_b_all_healthy_filtered.drop(['gene_name', 'gene_type'], axis=1)  # Drop non-numeric columns

# After the transpose each row corresponds to one original numeric column.
df_b_all_filtered = df_b_all_filtered.fillna(0).select_dtypes(include='number').T
df_b_all_healthy_filtered = df_b_all_healthy_filtered.fillna(0).select_dtypes(include='number').T

# df_b_all_filtered.head()

# NOTE(review): the columns of the transposed frames are the row labels of the
# source tables, so stacking them presumably relies on both parquet files listing
# the same genes under the same row labels - TODO confirm.
combined_df = cudf.concat([df_b_all_filtered, df_b_all_healthy_filtered], axis=0)

# combined_df.tail()

n_b_all_rows = len(df_b_all_filtered)
n_healthy_rows = len(df_b_all_healthy_filtered)

if (n_b_all_rows + n_healthy_rows) != combined_df.shape[0]:
    print(f"Expected number of rows: {n_b_all_rows + n_healthy_rows}, Actual number of rows: {combined_df.shape[0]}")
    raise ValueError("The number of rows in the combined DataFrame does not match the sum of B-ALL and B-ALL Healthy lengths.")

# The labels must follow the concat order above: the B-ALL rows come first (1),
# the healthy rows second (0). Building the list in the opposite order mislabels
# rows whenever the two cohorts differ in size.
combined_df['condition'] = [1] * n_b_all_rows + [0] * n_healthy_rows

# combined_df.head()

# merged_df = df_b_all.merge(df_mixed_all, on='gene_id', how='inner')
# merged_df = merged_df.merge(df_aml_all, on='gene_id', how='inner')

# merged_df = merged_df[merged_df['gene_type'] == 'protein_coding']  # Filter for protein-coding genes

# gene_names = merged_df['gene_name_x']

# merged_df_transposed = merged_df.fillna(0)  # Fill NaN values with 0
# merged_df_transposed = merged_df.select_dtypes(include='number').T

# if total_length != merged_df_transposed.shape[0]:
#     print(f"Expected number of rows: {total_length}, Actual number of rows: {merged_df_transposed.shape[0]}")
#     raise ValueError("The number of rows in the merged DataFrame does not match the sum of B-ALL and mixed ALL lengths.")

In [None]:
# Convert the cudf frame to pandas for further processing. The isinstance guard
# makes the cell safe to re-run: a pandas DataFrame has no .to_pandas().
if not isinstance(combined_df, pd.DataFrame):
    combined_df = combined_df.to_pandas()

# 'condition' is the label, not a feature, so leave it out of the reported count.
print(f"Amount of features in the merged DataFrame: {combined_df.shape[1] - int('condition' in combined_df.columns)}")

In [None]:
# merged_df_transposed = merged_df_transposed.to_pandas()

# gene_names = gene_names.to_pandas().reset_index(drop=True)

# print(f"Amount of features in the merged DataFrame: {merged_df_transposed.shape[1]}")

## XGBoost (Single Test Split)

PU Labeling

In [None]:
# Earlier labelling variant, kept for reference:
# y = pd.Series([1] * b_all_length + [0] * (mixed_all_length + aml_all_length), index=merged_df_transposed.index)

# Target vector: the 'condition' column of combined_df.
# NOTE(review): y is taken directly from combined_df (no copy) and a later cell
# resets its index in place - confirm that this aliasing is intended.
y = combined_df['condition']  # Use the 'condition' column as the target variable

# y = pd.Series([1] * b_all_length + [0] * (mixed_all_length), index=merged_df_transposed.index)

Sanity Check

In [None]:
# assert merged_df_transposed.shape[0] == len(y), "Mismatch: number of samples in X and labels in y"
# assert merged_df_transposed.index.equals(y.index), "Mismatch: index order between X and y"

Label shuffle

In [None]:
# y = y.sample(frac=1, random_state=42).reset_index(drop=True)

Log2 Normalizing

In [None]:
# log2(x + 1) transform of the feature columns only.
# The 'condition' label column is dropped first; leaving it in would hand the
# target to every downstream step (scaling, feature selection, the classifier)
# as an ordinary feature.
merged_df_normalized = np.log2(combined_df.drop(columns=['condition']) + 1)  # Log2 transformation

# merged_df_normalized = np.log2(merged_df_transposed + 1)  # Log2 transformation

Train Test Split

In [None]:
# merged_df_normalized, y = shuffle(merged_df_normalized, y, random_state=42)

# Replace the existing row labels of features and target with a fresh 0..n-1
# index (in place) before splitting.
merged_df_normalized.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# Stratified 80/20 split; every resulting piece is converted to a NumPy array.
x_train, x_test, y_train, y_test = (
    part.to_numpy()
    for part in train_test_split(
        merged_df_normalized,
        y,
        test_size=0.2,
        stratify=y,
        random_state=42,
    )
)

Undersampler and oversampler

In [None]:
# Rebalance the training split only: first cut class 0 down, then run SMOTE.
#
# The under-sampling target is capped at the number of class-0 rows actually
# present, because asking RandomUnderSampler for more rows than a class holds
# makes it raise a ValueError. With 350 or more class-0 rows the behaviour is
# unchanged.
n_class0_train = int(np.sum(y_train == 0))
rus = RandomUnderSampler(sampling_strategy={0: min(350, n_class0_train)}, random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)

# SMOTE with its default sampling strategy.
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

Scaler

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Noise

In [None]:
# noise = np.random.normal(0, 0.8, x_train.shape) * x_train

# x_train += noise

Variance Threshold (Remove constants)

In [None]:
# Fit the zero-variance filter on the training rows and apply the same column
# mask to both splits.
selector_vt = VarianceThreshold(threshold=0.0)
selector_vt.fit(x_train)

x_train, x_test = selector_vt.transform(x_train), selector_vt.transform(x_test)

PCA

In [None]:
# selector = PCA(n_components=0.95, random_state=42)
# x_train = selector.fit_transform(x_noisy)
# x_test = selector.transform(x_test)

SelectKBest

In [None]:
# Keep the 500 columns with the highest f_classif score, scored on the training
# split; the test split is reduced to the same columns.
selector = SelectKBest(k=500, score_func=f_classif)
selector.fit(x_train, y_train)

x_train = selector.transform(x_train)
x_test = selector.transform(x_test)

In [None]:
# train_fac = 0.1

# x_train, y_train = resample(
#     x_train, y_train, 
#     replace=False, 
#     n_samples=int(len(x_train) * train_fac),
#     stratify=y_train,
#     random_state=42
# )

# print(x_train.shape)

In [None]:
# gene_names_worst = gene_names.iloc[mixed_indices].reset_index(drop=True)

# gene_names = gene_names[selector.get_support(indices=True)].reset_index(drop=True)

Random features

In [None]:
# n_train, n_features = x_noisy.shape
# n_test = x_test.shape[0]

# x_train = np.random.normal(0, 1, size=(n_train, n_features))
# x_test = np.random.normal(0, 1, size=(n_test, n_features))

In [None]:
# Report the matrix shapes that go into the model.
print(f"Shape of x_train after feature selection: {x_train.shape}, Shape of x_test: {x_test.shape}")

In [None]:
# Guard rails before model fitting; each failed check raises ValueError.

# 1) Every feature matrix must have exactly one label per row.
if not (len(x_train) == len(y_train) and len(x_test) == len(y_test)):
    raise ValueError("Mismatch: number of samples in X_train/X_test and labels in y_train/y_test")

# 2) Both label values (1 and 0) must be present in the training labels.
if not ((y_train == 1).any() and (y_train == 0).any()):
    raise ValueError("Training set must contain both classes (B-ALL and non-B-ALL).")

# 3) Neither label container may be a pandas DataFrame.
if any(isinstance(labels, pd.DataFrame) for labels in (y_train, y_test)):
    raise ValueError("y_train and y_test must be Series, not DataFrames.")

Logistic Regression

In [None]:
# L2-regularised logistic regression, fitted on the training split.
# NOTE(review): max_iter=1 stops the solver after a single iteration - this looks
# like a deliberate handicap from an experiment; confirm before relying on the
# reported AUC.
lr = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=0.1,
    class_weight='balanced',
    max_iter=1,
    random_state=42,
).fit(x_train, y_train)

# Column 1 of predict_proba is used as the score for the test-set ROC AUC.
y_proba = lr.predict_proba(x_test)[:, 1]
auc_score = roc_auc_score(y_test, y_proba)

print(f"AUC: {auc_score:.3f}")

XGBClassifier

In [None]:
# scale_pos_weight = sum(y_train == 0) / sum(y_train == 1)

# xgb = XGBClassifier(
#     objective="binary:logistic",
#     eval_metric="auc",
#     use_label_encoder=False,
#     # scale_pos_weight=scale_pos_weight,  # Keep this for class imbalance
#     tree_method="hist",
#     device="cuda",
    
#     # Core parameters
#     n_estimators=500,        # More trees, let early stopping decide
#     learning_rate=0.05,      # Moderate learning rate
#     max_depth=4,             # Allow some complexity for gene interactions
    
#     # Regularization (important for high-dimensional genomics)
#     reg_lambda=5.0,          # L2 regularization
#     reg_alpha=1.0,           # L1 regularization (feature selection)
    
#     # Sampling (reduces overfitting)
#     subsample=0.8,           # Sample 80% of rows
#     colsample_bytree=0.8,    # Sample 80% of features per tree
#     colsample_bylevel=0.8,   # Additional feature sampling
    
#     # Early stopping
#     early_stopping_rounds=50,  # Stop if no improvement
    
#     # Reproducibility
#     random_state=42
# )

# xgb = XGBClassifier(
#     objective="binary:logistic",
#     eval_metric="auc",
#     use_label_encoder=False,
#     tree_method="hist",
#     device="cuda",
    
#     # Reduced model complexity
#     n_estimators=50,          # Much fewer trees (was 500)
#     learning_rate=0.3,        # Higher learning rate for faster, less precise learning
#     max_depth=2,              # Shallow trees (was 4)
    
#     # Increased regularization
#     reg_lambda=20.0,          # Much higher L2 regularization (was 5.0)
#     reg_alpha=10.0,           # Higher L1 regularization (was 1.0)
    
#     # More aggressive sampling
#     subsample=0.5,            # Sample only 50% of rows (was 0.8)
#     colsample_bytree=0.5,     # Sample only 50% of features per tree (was 0.8)
#     colsample_bylevel=0.5,    # More aggressive feature sampling (was 0.8)
    
#     # Early stopping (keep as is)
#     early_stopping_rounds=20, # Stop earlier (was 50)
    
#     # Reproducibility
#     random_state=42
# )

# xgb.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=True)

# gpu_x_test = DMatrix(x_test, y_test)

# y_pred = xgb.get_booster().predict(gpu_x_test)

# y_proba = xgb.predict_proba(x_test)[:, 1]  # Get probabilities for the positive class

# auc_score = roc_auc_score(y_test, y_proba)

# print(f"AUC: {auc_score:.3f}")

### Statistics and Plots

Check separator genes

In [None]:
# Rank every column of x_train by how well it separates the two classes on its
# own (single-feature ROC AUC against y_train), then box-plot the five best.
# "gene" below is the column position inside x_train, not a gene name.
print(x_train.shape)

gene_aucs = []

for i in range(x_train.shape[1]):
    try:
        auc_value = roc_auc_score(y_train, x_train[:, i])
    except ValueError:
        # roc_auc_score could not score this column; leave it out of the ranking.
        continue
    gene_aucs.append((i, auc_value))

gene_aucs = np.array(gene_aucs, dtype=[('index', int), ('auc', float)])

gene_aucs_df = pd.DataFrame({
    "gene": gene_aucs['index'],
    "auc": gene_aucs['auc']
})
# An AUC of 0.5 is chance level; values near 0 separate as well as values near 1
# (just with the opposite sign), so rank by distance from 0.5.
gene_aucs_df["auc_diff"] = abs(gene_aucs_df["auc"] - 0.5)  # How far from random
gene_aucs_df = gene_aucs_df.sort_values("auc_diff", ascending=False)

top_genes = gene_aucs_df.head(5)['gene'].astype(int).tolist()

for i in top_genes:
    df_plot = pd.DataFrame({
        'Expression': x_train[:, i],
        'Class': y_train
    })
    sns.boxplot(data=df_plot, x='Class', y='Expression')
    # Put the column index in the title so the five plots can be told apart.
    plt.title(f'Expression of Gene {i} by Class')
    plt.show()

Plot Prediction Score Distributions

In [None]:
# Pair each test-set score with its label so the scores can be split by class.
df_scores = pd.DataFrame({
    "B_ALL_score": y_proba,
    "Label": y_test
})

# One histogram per label value: label 1 first (blue), then label 0 (red).
# NOTE(review): the titles still use the "known / unlabeled" wording - confirm
# that it matches what label 0 represents in the current data.
for label_value, colour, title in (
    (1, "blue", "Predicted B-ALL Score — Known B-ALL Samples"),
    (0, "red", "Predicted B-ALL Score — Unlabeled Samples"),
):
    class_scores = df_scores.loc[df_scores["Label"] == label_value, "B_ALL_score"]

    plt.figure(figsize=(7, 4))
    sns.histplot(class_scores, color=colour, bins=25, kde=True)
    plt.title(title)
    plt.xlabel("Predicted Probability of B-ALL")
    plt.ylabel("Sample Count")
    plt.tight_layout()
    plt.show()

Precision recall curve

In [None]:
# Precision/recall pairs over all score thresholds on the test split, plus the
# area under that curve.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(recall, precision, marker='.', label=f'PR AUC = {pr_auc:.3f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

SHAP

In [None]:
# x_test_named = pd.DataFrame(x_test, columns=gene_names)

# x_test_named.head()

# Explain the fitted logistic regression: x_train is passed to the explainer as
# its data argument and SHAP values are computed for the test rows.
# (Tree-model variant kept for reference.)
# explainer = shap.TreeExplainer(xgb)
explainer = shap.LinearExplainer(
    lr,
    x_train,
    feature_perturbation="interventional",
)
shap_values = explainer.shap_values(x_test)

# Violin summary limited to 20 features.
shap.summary_plot(shap_values, x_test, max_display=20, plot_type="violin")

In [None]:
# Same SHAP values as a bar summary, limited to 20 features.
shap.summary_plot(shap_values, x_test, plot_type="bar", max_display=20)

## XGBoost (Stratified KFold)

### Preprocessing

#### PU Labeling

In [None]:
y = pd.Series([1] * b_all_length + [0] * (mixed_all_length), index=merged_df_transposed.index)

#### Sanity Check

In [None]:
assert merged_df_transposed.shape[0] == len(y), "Mismatch: number of samples in X and labels in y"
assert merged_df_transposed.index.equals(y.index), "Mismatch: index order between X and y"

### Training

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

x = merged_df_transposed

for fold, (train_index, val_index) in enumerate(skf.split(x, y)):
    print(f"Processing fold {fold + 1}...")

    x_train_fold, x_val_fold = x.iloc[train_index], x.iloc[val_index]
    y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

    x_train_fold = np.log2(x_train_fold + 1)  # Log2 transformation
    x_val_fold = np.log2(x_val_fold + 1)      # Log2 transformation

    scaler = StandardScaler()
    x_train_fold = scaler.fit_transform(x_train_fold)
    x_val_fold = scaler.transform(x_val_fold)

    selector = SelectKBest(score_func=f_classif, k=5)
    x_train_fold = selector.fit_transform(x_train_fold, y_train_fold)
    x_val_fold = selector.transform(x_val_fold)

    scale_pos_weight = sum(y_train_fold == 0) / sum(y_train_fold == 1)

    xgb = XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        use_label_encoder=False,
        scale_pos_weight=scale_pos_weight,  # Keep this for class imbalance
        tree_method="hist",
        device="cuda",
        
        # Core parameters
        n_estimators=500,        # More trees, let early stopping decide
        learning_rate=0.05,      # Moderate learning rate
        max_depth=4,             # Allow some complexity for gene interactions
        
        # Regularization (important for high-dimensional genomics)
        reg_lambda=5.0,          # L2 regularization
        reg_alpha=1.0,           # L1 regularization (feature selection)
        
        # Sampling (reduces overfitting)
        subsample=0.8,           # Sample 80% of rows
        colsample_bytree=0.8,    # Sample 80% of features per tree
        colsample_bylevel=0.8,   # Additional feature sampling
        
        # Early stopping
        early_stopping_rounds=50,  # Stop if no improvement
        
        # Reproducibility
        random_state=42
    )

    xgb.fit(x_train_fold, y_train_fold, eval_set=[(x_val_fold, y_val_fold)], verbose=False)

    y_proba = xgb.predict_proba(x_val_fold)[:, 1]  # Get probabilities for the positive class

    auc_fold = roc_auc_score(y_val_fold, y_proba)

    auc_scores.append(auc_fold)

    print(f"Fold {fold + 1}, AUC: {auc_fold:.3f}")

print(f"Mean AUC across all folds: {np.mean(auc_scores):.3f} ± {np.std(auc_scores):.3f}")

### Statistics and Plots

In [None]:
# Histogram of the per-fold AUC values with a dashed red line at their mean.
ax = sns.histplot(auc_scores, bins=5, kde=True)
ax.set_title("Distribution of AUC Scores Across Folds")
ax.set_xlabel("AUC")
ax.set_ylabel("Number of Folds")
ax.axvline(np.mean(auc_scores), color="red", linestyle="--", label="Mean AUC")
ax.legend()
plt.show()