# Classify B-ALL

- After every kernel restart, rerun the "Core" section.
- It's best to restart the kernel after running a training process.

## Core (Always run)

Imports

In [None]:
import cudf

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import time
import os

import shap
import optuna

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, auc, roc_curve, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

Global Variables

In [None]:
# Wall-clock timestamp captured when this cell runs.
# NOTE(review): no visible cell reads start_time; presumably meant for elapsed-time reporting.
start_time = time.time()

# Directory prefix for the parquet inputs. File names are appended directly in
# f-strings, so the trailing slash must be kept.
path_to_data = "data/"

### Preprocess

Load Datasets

In [None]:
# Load both expression tables as cuDF DataFrames. The merging cell expects
# 'gene_name' and 'gene_type' metadata columns next to the per-sample columns.
df_b_all = cudf.read_parquet(f"{path_to_data}B_ALL.pq") # samples are stored as columns
df_all = cudf.read_parquet(f"{path_to_data}ALL.pq") # samples are stored as columns

#### Merging

In [None]:
# Sample counts per dataset: every column except the two gene-metadata columns.
b_all_length = len(df_b_all.columns.drop(['gene_name', 'gene_type']))
all_length = len(df_all.columns.drop(['gene_name', 'gene_type']))

total_length = b_all_length + all_length

# Restrict both tables to protein-coding genes.
df_b_all_filtered = df_b_all[df_b_all['gene_type'] == 'protein_coding']
df_all_filtered = df_all[df_all['gene_type'] == 'protein_coding']

# Alternative (disabled): keep every gene type.
# df_b_all_filtered = df_b_all
# df_all_filtered = df_all

# Drop the metadata columns so that only per-sample expression values remain.
df_b_all_filtered = df_b_all_filtered.drop(['gene_name', 'gene_type'], axis=1)
df_all_filtered = df_all_filtered.drop(['gene_name', 'gene_type'], axis=1)

# Replace missing values with 0, keep numeric columns, and transpose so that
# rows are samples and columns are genes.
df_b_all_filtered = df_b_all_filtered.fillna(0).select_dtypes(include='number').T
df_all_filtered = df_all_filtered.fillna(0).select_dtypes(include='number').T

print("Filtered B-ALL length:", len(df_b_all_filtered))
print("Filtered B-ALL Healthy length:", len(df_all_filtered))

# Row order after concatenation: every B_ALL.pq sample first, then every ALL.pq sample.
combined_df = cudf.concat([df_b_all_filtered, df_all_filtered], axis=0)

# The label blocks must use the same order and block sizes as the concatenation
# above; sizing the first block with the ALL sample count mislabels samples
# whenever the two datasets differ in size.
# Encoding: B_ALL.pq samples -> 1, ALL.pq samples -> 0.
# NOTE(review): confirm that B-ALL is the intended positive class.
combined_df['condition'] = [1] * len(df_b_all_filtered) + [0] * len(df_all_filtered)

# Defensive check that no rows were lost or duplicated by the concatenation.
if (len(df_b_all_filtered) + len(df_all_filtered)) != combined_df.shape[0]:
    print(f"Expected number of rows: {len(df_b_all_filtered) + len(df_all_filtered)}, Actual number of rows: {combined_df.shape[0]}")
    raise ValueError("The number of rows in the combined DataFrame does not match the sum of B-ALL and B-ALL Healthy lengths.")

In [None]:
# Convert the merged cuDF table into a pandas DataFrame for the following cells.
combined_df = combined_df.to_pandas()

# The column count reported here still includes the 'condition' label column.
n_samples, n_columns = combined_df.shape
print(f"Amount of samples in the merged DataFrame: {n_samples}")
print(f"Amount of features in the merged DataFrame: {n_columns}")

Labeling

In [None]:
# Detach the 'condition' column: it becomes the target vector y and is removed
# from combined_df in place, so only expression features get normalized.
y = combined_df.pop('condition')

## LR (Single Test Split)

Label shuffle

In [None]:
# y = y.sample(frac=1, random_state=42).reset_index(drop=True)

Log2 Normalizing

In [None]:
# log2(x + 1) transform; the +1 pseudocount maps zero values to 0 instead of -inf.
merged_df_normalized = np.log2(combined_df + 1)  # Log2 transformation

Train Test Split

In [None]:
# Give features and labels the same fresh 0..n-1 index before splitting.
y.reset_index(drop=True, inplace=True)
merged_df_normalized.reset_index(drop=True, inplace=True)

# Stratified 80/20 hold-out split with a fixed seed.
split_parts = train_test_split(
    merged_df_normalized,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The following cells operate on plain NumPy arrays.
x_train, x_test, y_train, y_test = (part.to_numpy() for part in split_parts)

print(f"Training set shape: {x_train.shape}, Test set shape: {x_test.shape}")

Scaler

In [None]:
# Learn the standardization parameters from the training split only, then
# apply that same fitted transform to both splits.
scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

Variance Thresholding

In [None]:
# selector_vt = VarianceThreshold(threshold=0.0)

# x_train = selector_vt.fit_transform(x_train)
# x_test = selector_vt.transform(x_test)

Smote

In [None]:
# Resample the training split with SMOTE (5 nearest neighbours, fixed seed);
# the test split is not touched.
# NOTE(review): the resampled arrays are later passed to optimize_classifier, whose
# cross-validation folds are cut after this resampling, so fold scores may be optimistic.
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=5)

x_train, y_train = smote.fit_resample(x_train, y_train)

In [None]:
# Report the shapes again; only the training split was resampled above.
print(f"After SMOTE, training set shape: {x_train.shape}, Test set shape: {x_test.shape}")

Optuna

In [None]:
# Per-classifier log of (trial number, score) pairs, filled by record_trial_callback.
trial_results = {"xgboost": [], "random_forest": [], "logistic_regression": []}


def record_trial_callback(study, trial):
    """Store a finished trial's number and score under its classifier name.

    Intended to be passed to ``study.optimize(..., callbacks=[...])``.

    Args:
        study: The running study (unused; part of the callback signature).
        trial: Finished trial; ``params``, ``number`` and ``value`` are read.

    Trials that never sampled a ``"classifier"`` parameter or that have no
    value are skipped, so the log only contains comparable numeric scores.
    """
    classifier = trial.params.get("classifier")
    if classifier is None or trial.value is None:
        return
    trial_results[classifier].append((trial.number, trial.value))

def optimize_classifier(x, y, n_trials=20):
    """Search the feature count and classifier hyper-parameters with Optuna.

    Each trial samples ``k`` (number of features kept by ``SelectKBest`` with
    ``f_classif``) and one of three classifiers (XGBoost, random forest,
    logistic regression) with its hyper-parameters. The resulting pipeline is
    scored by the mean ROC-AUC of a 5-fold stratified cross-validation on
    ``x``/``y``. Every finished trial is also passed to
    ``record_trial_callback``.

    Args:
        x: 2-D feature array; ``x.shape[1]`` is read as the number of features.
        y: Labels for the rows of ``x``.
        n_trials: Number of Optuna trials to run.

    Returns:
        The finished Optuna study (maximizing ROC-AUC).

    Raises:
        ValueError: If a trial yields an unsupported classifier name.
    """
    # k is capped at 50 and at half of the available features. The bounds are
    # clamped so the range stays valid (low <= high, high >= 1) even when x
    # has fewer than 10 features.
    k_high = max(1, min(50, x.shape[1] // 2))
    k_low = min(5, k_high)

    def objective(trial):
        k = trial.suggest_int("k", k_low, k_high)
        classifier_choice = trial.suggest_categorical("classifier", ["xgboost", "random_forest", "logistic_regression"])

        if classifier_choice == "xgboost":
            params = {
                "max_depth": trial.suggest_int("max_depth", 3, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                # Enable GPU support:
                "tree_method": "hist",
                "device": "cuda",
                # NOTE(review): "predictor" and use_label_encoder are legacy options;
                # check them against the installed XGBoost version.
                "predictor": "gpu_predictor",
                "objective": "binary:logistic",
                "eval_metric": "auc",
            }
            model = xgb.XGBClassifier(**params, use_label_encoder=False, verbosity=0)
        elif classifier_choice == "random_forest":
            # Parameter names carry an "_rf" suffix so they do not collide with
            # the XGBoost parameters of the same study.
            params = {
                "n_estimators": trial.suggest_int("n_estimators_rf", 50, 300),
                "max_depth": trial.suggest_int("max_depth_rf", 3, 20),
                "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            }
            model = RandomForestClassifier(**params, random_state=42, class_weight="balanced")
        elif classifier_choice == "logistic_regression":
            c_value = trial.suggest_float("C", 1e-4, 1e2, log=True)
            model = LogisticRegression(C=c_value, solver="liblinear",
                                       random_state=42, class_weight="balanced", max_iter=1000)
        else:
            raise ValueError("Unsupported classifier selected.")

        # Feature selection is part of the pipeline, so it is refitted on the
        # training portion of every fold.
        pipeline = Pipeline([
            ("select_kbest", SelectKBest(score_func=f_classif, k=k)),
            ("classifier", model)
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        score = cross_val_score(pipeline, x, y, scoring="roc_auc", cv=cv).mean()

        # A NaN mean is reported as the worst score instead of being handed to Optuna.
        if np.isnan(score):
            print("NaN score encountered, returning a low score.")
            return 0.0
        return score

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, callbacks=[record_trial_callback])

    print("Best parameters:", study.best_params)
    print("Best ROC-AUC:", study.best_value)
    return study

# Run the optimization: 20 trials on the current training arrays. The returned
# study is kept for the later cells that read study_result.best_params.
study_result = optimize_classifier(x_train, y_train, n_trials=20)

Apply Optuna result

In [None]:
# Highest recorded ROC-AUC per classifier; a classifier without any recorded
# trial gets 0.
best_scores = {
    clf: max((score for _, score in recorded), default=0)
    for clf, recorded in trial_results.items()
}

# Bar chart comparing the best score of each classifier.
models = list(best_scores)
scores = list(best_scores.values())

plt.figure(figsize=(8, 6))
bars = plt.bar(models, scores, color=['blue', 'green', 'orange'])
plt.title("Best ROC-AUC per Classifier from Optuna Trials")
plt.xlabel("Classifier")
plt.ylabel("Best ROC-AUC")
plt.ylim(0, 1)

# Write each score just above its bar.
for bar, score in zip(bars, scores):
    label_x = bar.get_x() + bar.get_width()/2.0
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{score:.3f}', ha='center', va='bottom')

# Make sure the output directory exists before saving.
output_dir = "plots"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

plot_path = os.path.join(output_dir, "best_classifier_comparison.png")
plt.tight_layout()
plt.savefig(plot_path)
plt.show()

print(f"Plot saved to {plot_path}")

Sanity Checks

In [None]:
# Shapes of the training features and of their labels. The second value is
# y_train's shape, so the message names y_train.
print(f"Shape of x_train after feature selection: {x_train.shape}"
      f", Shape of y_train: {y_train.shape}")

In [None]:
# Every feature matrix needs exactly one label per row.
train_rows_match = x_train.shape[0] == y_train.shape[0]
test_rows_match = x_test.shape[0] == y_test.shape[0]
if not (train_rows_match and test_rows_match):
    raise ValueError("Mismatch: number of samples in X_train/X_test and labels in y_train/y_test")

# Both label values (1 and 0) must occur in the training labels.
if not (y_train == 1).any() or not (y_train == 0).any():
    raise ValueError("Training set must contain both classes (B-ALL and non-B-ALL).")

# Labels must be one-dimensional containers, never DataFrames.
if any(isinstance(labels, pd.DataFrame) for labels in (y_train, y_test)):
    raise ValueError("y_train and y_test must be Series, not DataFrames.")

Select K Best

In [None]:
best_params = study_result.best_params

# Keep the k highest-scoring features (ANOVA F-test), with k taken from the best
# Optuna trial. The selector is fitted on the training split and then applied
# unchanged to the test split.
select_k_best = SelectKBest(score_func=f_classif, k=best_params["k"])
x_train = select_k_best.fit_transform(x_train, y_train)
x_test = select_k_best.transform(x_test)

# The second value is y_train's shape, so the message names y_train.
print(f"Shape of x_train after SelectKBest: {x_train.shape}, Shape of y_train: {y_train.shape}")

Training

In [None]:
# Rebuild the classifier that won the Optuna search. best_params only holds the
# hyper-parameters of the winning classifier type, so the random-forest keys
# ("n_estimators_rf", ...) exist only when a random forest won; the dispatch
# below avoids a KeyError for the other two types.
best_classifier = best_params["classifier"]

if best_classifier == "random_forest":
    model = RandomForestClassifier(
        n_estimators=best_params["n_estimators_rf"],
        max_depth=best_params["max_depth_rf"],
        criterion=best_params["criterion"],
        random_state=42,
        class_weight="balanced"
    )
elif best_classifier == "xgboost":
    # Fixed settings mirror the ones used inside the Optuna objective.
    model = xgb.XGBClassifier(
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        n_estimators=best_params["n_estimators"],
        tree_method="hist",
        device="cuda",
        predictor="gpu_predictor",
        objective="binary:logistic",
        eval_metric="auc",
        use_label_encoder=False,
        verbosity=0
    )
elif best_classifier == "logistic_regression":
    model = LogisticRegression(
        C=best_params["C"],
        solver="liblinear",
        random_state=42,
        class_weight="balanced",
        max_iter=1000
    )
else:
    raise ValueError(f"Unsupported classifier in best_params: {best_classifier!r}")

model.fit(x_train, y_train)

# Hard predictions and class-1 probabilities on the held-out test split.
y_pred = model.predict(x_test)

y_proba = model.predict_proba(x_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC on test set: {roc_auc:.4f}")

In [None]:
# Fixed-configuration XGBoost run, independent of the Optuna result.
# Note: this cell rebinds `model`, so later cells that read `model` see this classifier.
xgb_settings = {
    "max_depth": 5,
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "tree_method": 'hist',
    "device": 'cuda',
    "predictor": 'gpu_predictor',
    "objective": 'binary:logistic',
    "eval_metric": 'auc',
    "use_label_encoder": False,
    "verbosity": 0,
}
model = xgb.XGBClassifier(**xgb_settings)
model.fit(x_train, y_train)

# Hard predictions and class-1 probabilities on the held-out test split.
y_pred_xgb = model.predict(x_test)
y_proba_xgb = model.predict_proba(x_test)[:, 1]

roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
print(f"ROC-AUC on test set (XGBoost): {roc_auc_xgb:.4f}")

### Statistics and Plots

ROC curve

In [None]:
# ROC curve of the probabilities stored in y_proba, with the area recomputed
# from the curve points.
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, marker='.', label=f'ROC AUC = {roc_auc:.3f}')
# Diagonal reference line of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--', color='red', label='Random Classifier')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

SHAP

In [None]:
# Explain the classifier currently bound to `model`, using the training matrix
# as background data.
explainer = shap.Explainer(model, x_train)
shap_values = explainer.shap_values(x_test)

# The layout of shap_values depends on the model type and the SHAP version:
#   * list of per-class arrays                -> element 1 is the positive class
#   * 3-D array (samples, features, classes)  -> slice class 1 on the last axis
#   * 2-D array (samples, features)           -> already a single set of values
# Indexing a NumPy array with [1] would select sample 1 rather than class 1,
# so each layout is handled explicitly.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
else:
    shap_array = np.asarray(shap_values)
    positive_class_shap = shap_array[..., 1] if shap_array.ndim == 3 else shap_array

# ---- ordinary SHAP summary for the positive class ----
shap.summary_plot(
    positive_class_shap,  # shape: n_samples x n_features
    x_test,
    plot_type="violin",
    max_display=20
)

In [None]:
# Bar-style summary of the 20 highest-ranked features, using the raw shap_values
# object produced by the SHAP cell above.
shap.summary_plot(shap_values, x_test, plot_type="bar", max_display=20)

# B_ALL Subtype Detection

## Core (Always Run)

Imports

In [None]:
import cudf
import cuml

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import time
import random

import shap
import umap

from sklearn.cluster import KMeans, HDBSCAN
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, silhouette_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, make_scorer
from sklearn.utils import shuffle, resample
from xgboost import XGBClassifier, DMatrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from statsmodels.stats.multitest import multipletests
import plotnine as pn
import patchworklib as pw
from scipy.spatial import ConvexHull
from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu
import mygene

Global Variables

In [None]:
# Wall-clock timestamp captured when this cell runs.
# NOTE(review): no visible cell reads start_time; presumably meant for elapsed-time reporting.
start_time = time.time()

# Directory prefix for the parquet inputs; the trailing slash is required
# because file names are appended directly in f-strings.
path_to_data = "data/"

Load Datasets

In [None]:
# Load the B-ALL expression table as a cuDF DataFrame. The preprocessing cell
# expects 'gene_name' and 'gene_type' metadata columns next to the sample columns.
df_b_all = cudf.read_parquet(f"{path_to_data}B_ALL.pq")  # samples are stored as columns

Preprocessing

In [None]:
# Keep protein-coding genes only.
is_protein_coding = df_b_all['gene_type'] == 'protein_coding'
df_b_all_filtered = df_b_all[is_protein_coding]

# Remove the gene-metadata columns, leaving per-sample expression values.
df_b_all_dropped = df_b_all_filtered.drop(columns=['gene_name', 'gene_type'])

# Fill missing values with 0, keep numeric columns, and transpose so that
# rows are samples and columns are genes.
numeric_expression = df_b_all_dropped.fillna(0).select_dtypes(include='number')
df_b_all_transposed = numeric_expression.T

GPU to CPU Conversion

In [None]:
# Convert the cuDF table into a pandas DataFrame; every later cell in this
# section works on the pandas object.
df_b_all_transposed = df_b_all_transposed.to_pandas()  # Convert to pandas DataFrame for further processing

print(f"Shape of df_b_all_transposed: {df_b_all_transposed.shape}")

### Normalization

log2

In [None]:
# log2(x + 1) transform; the +1 pseudocount maps zero values to 0 instead of -inf.
# The variable is overwritten, so re-running this cell applies the transform again.
df_b_all_transposed = np.log2(df_b_all_transposed + 1)  # Log2 transformation

Scaling

In [None]:
# Standardize every gene column over all samples. The fitted scaler object is
# not kept; only the transformed array is used afterwards.
x_scaled = StandardScaler().fit_transform(df_b_all_transposed)

Dimensionality reduction

In [None]:
# Project the standardized expression matrix onto its first 50 principal components.
# NOTE(review): assumes the data has at least 50 samples and 50 genes — confirm.
pca = PCA(n_components=50, random_state=42)
x_pca = pca.fit_transform(x_scaled)

## Clustering

KMeans operations

We first run KMeans once for every cluster count from 1 to 5; the optimal number of clusters is then chosen from the plots below.

In [None]:
# Column names for the PCA coordinates (PC1 ... PCn); the same for every run.
pc_columns = [f'PC{idx + 1}' for idx in range(x_pca.shape[1])]

clusters_list = []
optimal_labels_kmeans = None

# Fit KMeans for k = 1..5 and keep one labelled copy of the PCA coordinates per k.
for n_clusters in range(1, 6):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels_kmeans = kmeans.fit_predict(x_pca)

    # The k = 3 labelling is the one reused by the later analysis cells.
    if n_clusters == 3:
        optimal_labels_kmeans = labels_kmeans

    df_clusters = pd.DataFrame(x_pca, columns=pc_columns)
    # Store labels as strings so they act as categories in the plots.
    df_clusters['cluster'] = labels_kmeans.astype(str)

    clusters_list.append(df_clusters)

KMeans plots

A silhouette score between 0.51 and 0.70 is preferred.

(SOURCE: https://www.sciencedirect.com/science/article/pii/0377042787901257)

In [None]:
plots = []

# One silhouette score per clustering. The score is undefined for a single
# cluster, which is recorded as NaN.
# NOTE(review): the score is computed on PC1/PC2 only, while KMeans was fitted on
# all principal components — confirm this is intended.
silhouette_scores = [
    silhouette_score(df[['PC1', 'PC2']], df['cluster'])
    if df['cluster'].nunique() > 1
    else float('nan')
    for df in clusters_list
]

for i, df in enumerate(clusters_list):
    # Convex-hull outline per cluster in the PC1/PC2 plane.
    hull_dfs = []

    for cluster in df['cluster'].unique():
        points = df[df['cluster'] == cluster][['PC1', 'PC2']].values

        # A hull needs at least 3 points.
        if points.shape[0] < 3:
            continue

        try:
            hull = ConvexHull(points)
        except Exception:
            # Best effort: a cluster whose hull cannot be built is drawn without
            # an outline. Unlike a bare `except`, KeyboardInterrupt still propagates.
            continue

        hull_df = pd.DataFrame(points[hull.vertices], columns=['PC1', 'PC2'])
        hull_df['cluster'] = cluster
        hull_dfs.append(hull_df)

    # Cluster centroids, used to place the cluster-id text labels.
    centroids = df.groupby('cluster')[['PC1', 'PC2']].mean().reset_index()
    centroids['label'] = centroids['cluster'].astype(str)

    plot = (
        pn.ggplot(df, pn.aes('PC1', 'PC2'))
        + pn.geom_point(pn.aes(color='cluster', shape='cluster'), size=1.5, alpha=0.7)
    )

    # pd.concat raises on an empty list, so the outline layer is only added
    # when at least one hull exists.
    if hull_dfs:
        df_hulls = pd.concat(hull_dfs, ignore_index=True)
        plot = plot + pn.geom_polygon(df_hulls, pn.aes('PC1', 'PC2', fill='cluster', group='cluster'), alpha=0.15, show_legend=False, color='black', linetype='dashed')

    plot = (
        plot
        + pn.geom_text(data=centroids, mapping=pn.aes('PC1', 'PC2', label='label'), size=8, color='black')
        + pn.scale_fill_brewer(type='qual', palette='Set2')
        + pn.scale_color_brewer(type='qual', palette='Set2')
        + pn.theme_bw()
        + pn.ggtitle(f"Clusters = {i + 1}, Silhouette Score = {silhouette_scores[i]:.3f}")
    )

    plots.append(pw.load_ggplot(plot, figsize=(4, 4)))

# Grid layout: k = 5 and 4 in the first row, k = 3, 2 and 1 in the second.
g04 = ((plots[4] | plots[3]) / (plots[2] | plots[1] | plots[0]))
g04.savefig()

Differential Expression Analysis - Setup

Also adjust P-values (Benjamini-Hochberg FDR correction)

SOURCE: https://physiology.med.cornell.edu/people/banfelder/qbio/resources_2008/1.5_GenespringMTC.pdf

In [None]:
# pd.set_option('display.float_format', lambda x: f'{x:.3e}')

# Log-transformed expression table plus the k = 3 cluster label of every sample.
expression_df = df_b_all_transposed.copy()
expression_df['cluster'] = optimal_labels_kmeans

# One result frame per cluster (cluster vs. all remaining samples).
results = []

for cluster_id in expression_df['cluster'].unique():
    in_cluster = expression_df[expression_df['cluster'] == cluster_id].drop(columns=['cluster'])
    out_cluster = expression_df[expression_df['cluster'] != cluster_id].drop(columns=['cluster'])

    # Keep genes that vary in at least one of the two groups.
    valid_genes = (
        (in_cluster.var(skipna=True) > 0) |
        (out_cluster.var(skipna=True) > 0)
    )

    in_cluster = in_cluster.loc[:, valid_genes]
    out_cluster = out_cluster.loc[:, valid_genes]

    # Welch's t-test per gene (unequal variances).
    t_stats, p_vals = ttest_ind(in_cluster, out_cluster, axis=0, equal_var=False, nan_policy='omit')
    # Normalize to a plain float array; masked entries (if any) become NaN.
    p_vals = np.asarray(np.ma.filled(p_vals, np.nan), dtype=np.float64)

    mean_diff = in_cluster.mean() - out_cluster.mean()

    # Benjamini-Hochberg correction on the defined p-values only. An undefined
    # p-value (e.g. from a group with a single sample) can otherwise turn every
    # adjusted value into NaN; such genes keep NaN and sort last.
    adj_pvals = np.full(p_vals.shape, np.nan, dtype=np.float64)
    testable = np.isfinite(p_vals)
    if testable.any():
        adj_pvals[testable] = multipletests(p_vals[testable], method='fdr_bh')[1]

    # Lower bound keeps -log10(adj_p_value) finite in the volcano plots.
    adj_pvals = np.clip(adj_pvals, 1e-300, 1.0)

    res = pd.DataFrame({
        'gene': in_cluster.columns,
        'cluster': cluster_id,
        'mean_diff': mean_diff.values.astype(np.float64),
        'p_value': p_vals,
        'adj_p_value': adj_pvals
    }).sort_values(by='adj_p_value')

    results.append(res)

Differential Expression Analysis - Plots

These volcano plots show the mean difference and the statistical significance of each gene. The genes are not identified by name yet; that is done in a later step.

1. Far left or far right (large mean difference) -> Indicates strong expression change in that cluster
2. High on the y-axis (high −log₁₀ p-value) -> Indicates statistical significance

Cluster 0 and 2 are considered promising.

In [None]:
# Height of the adjusted-p = 0.05 reference line on the -log10 scale.
significance_line = -np.log10(0.05)

# One volcano plot per cluster: effect size against significance.
for cluster_result in results:
    cluster_label = cluster_result["cluster"].iloc[0]
    neg_log_padj = -np.log10(cluster_result['adj_p_value'])

    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=cluster_result['mean_diff'], y=neg_log_padj, alpha=0.6)
    plt.axhline(significance_line, color='red', linestyle='--')
    plt.title(f'Cluster {cluster_label} Volcano Plot')
    plt.xlabel('Mean Difference (Effect Size)')
    plt.ylabel('Significance (-log10 adjusted p-value)')
    plt.show()

Top genes extraction and visualization via a heatmap

In [None]:
def _zscore_sample(column):
    """Z-score a column using the sample standard deviation (pandas default ddof)."""
    return (column - column.mean()) / column.std()


def _zscore_population(column):
    """Z-score a column using the population standard deviation (ddof=0)."""
    return (column - column.mean()) / column.std(ddof=0)


# Stack the per-cluster result frames into one table.
dea_results = pd.concat(results, ignore_index=True)

# The 10 genes with the smallest adjusted p-value in each cluster.
ranked_dea = dea_results.sort_values('adj_p_value')
top_genes_per_cluster = ranked_dea.groupby('cluster').head(10)

# De-duplicated gene list across clusters.
top_gene_list = top_genes_per_cluster['gene'].unique().tolist()

# Expression of the top genes plus the cluster label of every sample.
expression_subset = expression_df[top_gene_list].copy()
expression_subset['cluster'] = optimal_labels_kmeans

# Group samples of the same cluster into contiguous blocks.
expression_sorted = expression_subset.sort_values('cluster')

# Per-sample matrix without the label column, z-scored per gene.
heatmap_data = expression_sorted.drop('cluster', axis=1)
heatmap_data_normalized = heatmap_data.apply(_zscore_sample, axis=0)

# Cluster-level view: mean expression per cluster, z-scored per gene across clusters.
cluster_means = expression_sorted.groupby('cluster')[top_gene_list].mean()
cluster_means_normalized = cluster_means.apply(_zscore_population, axis=0)

cluster_tick_labels = [f'Cluster {i}' for i in cluster_means.index]

plt.figure(figsize=(8, 10))
sns.heatmap(
    cluster_means_normalized.T,  # genes as rows, clusters as columns
    cmap='RdBu_r',
    center=0,
    annot=False,
    cbar_kws={'label': 'Z-score (cluster mean)'},
    xticklabels=cluster_tick_labels,
    yticklabels=True,
)
plt.title('Mean Gene Expression by Cluster')
plt.xlabel('Clusters')
plt.ylabel('Genes')
plt.tight_layout()
plt.show()

Annotation

In [None]:
# The 40 genes with the smallest adjusted p-value in each cluster.
top_genes_per_cluster = (
    dea_results
    .sort_values('adj_p_value')  # ensure it's sorted
    .groupby('cluster')
    .head(40)
)

# De-duplicated gene list across clusters.
top_gene_list = top_genes_per_cluster['gene'].unique().tolist()

mg = mygene.MyGeneInfo()

# Remove version numbers from Ensembl IDs if present (e.g., ENSG00000276672.1 → ENSG00000276672)
top_genes = [gene.split('.')[0] for gene in top_gene_list]

# Query annotations. The response gets its own name so that `results` (the list
# of per-cluster DEA frames used by the volcano-plot and concat cells) is not
# overwritten and those cells can be re-run afterwards.
annotation_hits = mg.querymany(top_genes, scopes='ensembl.gene', fields=['symbol', 'name', 'summary', 'entrezgene'], species='human')

annotation_df = pd.DataFrame(annotation_hits)
annotation_df = annotation_df[['query', 'symbol', 'name', 'summary']]
# Drop genes for which any of the four fields is missing.
annotation_df.dropna(inplace=True)

print(annotation_df["symbol"].tolist())

## Classifier

In [None]:
# Feature set: the 10 most significant genes of each cluster, de-duplicated
# (so at most 10 genes per cluster in total).
# NOTE(review): these genes were selected using the cluster labels of all samples,
# including those that later land in the test split — scores may be optimistic.
ranked_dea = dea_results.sort_values('adj_p_value')
top_genes_per_cluster = ranked_dea.groupby('cluster').head(10)

top_gene_list = top_genes_per_cluster['gene'].unique().tolist()

# Features are the selected gene columns; targets are the k = 3 KMeans labels.
X = expression_df[top_gene_list]
y = optimal_labels_kmeans

# Optional check (disabled): shuffle the labels in place.
# np.random.shuffle(y)  # Shuffle labels for demonstration

print(f"Using {len(top_gene_list)} unique genes as features")

print(f"Shape of X: {X.shape}")

In [None]:
# Standardize the selected gene features.
# NOTE(review): the scaler is fitted on all samples before the train/test split
# in the next cell, so test samples contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, so each cluster keeps its
# proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Random forest (100 trees, fixed seed) trained to predict the cluster label
# from the selected gene features.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [None]:
# One-hot encode the test labels so each cluster gets its own one-vs-rest column.
class_labels = sorted(set(y))
y_test_bin = label_binarize(y_test, classes=class_labels)
y_proba = clf.predict_proba(X_test)

auc_score = roc_auc_score(y_test_bin, y_proba, multi_class="ovr", average="macro")
print(f"Multiclass AUC (OvR, macro-average): {auc_score:.3f}")

n_classes = y_test_bin.shape[1]
fpr, tpr, roc_auc = {}, {}, {}

# One-vs-rest ROC curve and its area for every class column.
for class_idx in range(n_classes):
    class_fpr, class_tpr, _ = roc_curve(y_test_bin[:, class_idx], y_proba[:, class_idx])
    fpr[class_idx] = class_fpr
    tpr[class_idx] = class_tpr
    roc_auc[class_idx] = auc(class_fpr, class_tpr)

# Plot
plt.figure(figsize=(8, 6))
for class_idx, class_auc in roc_auc.items():
    plt.plot(fpr[class_idx], tpr[class_idx], label=f'Cluster {class_idx} (AUC = {class_auc:.2f})')

# Diagonal reference line of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Multiclass ROC Curve")
plt.legend()
plt.tight_layout()
plt.show()

Confusion Matrix

In [None]:
y_pred = clf.predict(X_test)

# Tick labels shared by both panels.
cluster_names = [f'Cluster {label}' for label in sorted(set(y))]

# Raw counts, and the same matrix normalized over each true-label row.
cm = confusion_matrix(y_test, y_pred)
cm_normalized = confusion_matrix(y_test, y_pred, normalize='true')

# Two panels side by side: counts on the left, row-normalized values on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

disp1 = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cluster_names)
disp1.plot(ax=ax1, cmap='Blues', values_format='d')
ax1.set_title('Confusion Matrix (Raw Counts)')

disp2 = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=cluster_names)
disp2.plot(ax=ax2, cmap='Blues', values_format='.2f')
ax2.set_title('Confusion Matrix (Normalized)')

plt.tight_layout()
plt.show()

# Breast Cancer

## Core (Always Run)

Imports

In [None]:
import cudf
import cuml

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import time
import random

import shap
import umap

from sklearn.cluster import KMeans, HDBSCAN
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, silhouette_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, make_scorer
from sklearn.utils import shuffle, resample
from xgboost import XGBClassifier, DMatrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from statsmodels.stats.multitest import multipletests
import plotnine as pn
import patchworklib as pw
from scipy.spatial import ConvexHull
from scipy.stats import ttest_ind, ks_2samp, mannwhitneyu
import mygene

Global Variables

In [None]:
# Wall-clock timestamp captured when this cell runs.
# NOTE(review): no visible cell reads start_time; presumably meant for elapsed-time reporting.
start_time = time.time()

# Directory prefix for the parquet inputs; the trailing slash is required
# because file names are appended directly in f-strings.
path_to_data = "data/"

Preprocess

In [None]:
# Load the two breast-cancer parquet files as cuDF DataFrames.
# NOTE(review): per the original comments, sample names are the columns — confirm
# the layout matches the B-ALL files.
df_breast_cancer = cudf.read_parquet(f"{path_to_data}breast_cancer.pq")  # samples are stored as columns
df_breast_cancer_healthy = cudf.read_parquet(f"{path_to_data}breast_cancer_healthy.pq")  # samples are stored as columns