In [3]:
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten, Input , BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
from tensorflow.keras.regularizers import l1, l2, l1_l2

import random
import tensorflow as tf

# --- Set Random Seed ---
# One shared seed is applied, in order, to NumPy's global RNG, the stdlib
# `random` module and TensorFlow's global generator.
SEED = 42
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(SEED)
del _seed_fn  # keep the module namespace free of the loop variable

# --- Generator Class ---
class GeneExpressionSequence(Sequence):
    """Keras ``Sequence`` that serves ``(features, labels)`` mini-batches.

    The feature matrix is reshaped once, at construction time, to
    ``(n_samples, n_features, 1)`` so every sample carries a trailing
    channel axis (the model in this file is built with an
    ``(n_features, 1)`` input shape).

    Args:
        X: Array-like of shape ``(n_samples, n_features)``.
        y: Array-like holding one label per row of ``X``.
        batch_size: Number of samples per batch; the last batch may be
            smaller.
        shuffle: If true, the sample order is shuffled on construction and
            again every time ``on_epoch_end`` is called.
        **kwargs: Forwarded unchanged to the ``Sequence`` constructor.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` and ``y``
            do not contain the same number of samples.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        # np.asarray is a no-op for ndarrays; for pandas objects it guarantees
        # that the fancy indexing in __getitem__ is positional, not label-based.
        X = np.asarray(X)
        y = np.asarray(y)

        self.X = X.reshape(-1, X.shape[1], 1)
        self.y = y
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.X))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X_batch, y_batch)`` pair for batch number ``index``."""
        start = index * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]
        return self.X[batch_indices], self.y[batch_indices]

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            # Uses NumPy's global RNG, so the order follows np.random.seed.
            np.random.shuffle(self.indices)

# --- Model Builder for Optuna ---
def build_model(trial, input_shape):
    """Build and compile the 1-D CNN binary classifier for one Optuna trial.

    Two hyperparameters are requested from ``trial``, in this order:
    ``"l2"`` (log-uniform float in [0.001, 0.1], used as the L2 kernel
    penalty of the convolution and the hidden dense layer) and
    ``"dense_units"`` (64 or 128 units in the hidden dense layer).

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.
        input_shape: Shape of one sample, passed to the ``Input`` layer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, using
        Adam (learning rate 0.001), binary cross-entropy loss and accuracy
        as the tracked metric.
    """
    l2_factor = trial.suggest_float("l2", 0.001, 0.1, log=True)
    hidden_units = trial.suggest_categorical("dense_units", [64, 128])
    weight_penalty = l2(l2_factor)

    network = Sequential([
        Input(shape=input_shape),
        Conv1D(
            filters=64,
            kernel_size=2,
            strides=2,
            activation='relu',
            kernel_regularizer=weight_penalty,
        ),
        BatchNormalization(),
        MaxPooling1D(pool_size=2, strides=2),
        Flatten(),
        Dense(hidden_units, activation='relu', kernel_regularizer=weight_penalty),
        Dropout(0.07),
        Dense(1, activation='sigmoid'),
    ])

    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# --- Load Variant Split ---
base_path = Path("/Users/tusharsingh/Work/Project/tcga-mldl/results/data/data_splits")
variant_name = "5pct_80"
data_path = base_path.joinpath(variant_name)


def _read_indexed_csv(filename):
    """Read ``filename`` from ``data_path`` with its first column as index."""
    return pd.read_csv(data_path.joinpath(filename), index_col=0)


X_train = _read_indexed_csv("X_train.csv")
X_test = _read_indexed_csv("X_test.csv")
# squeeze() collapses the single-column label frames into Series.
y_train = _read_indexed_csv("y_train.csv").squeeze()
y_test = _read_indexed_csv("y_test.csv").squeeze()

# --- Split Test into Val + Final ---
# Training data is used whole; only the test split is divided further.
X_train_np = X_train.values
y_train_np = y_train.values

# Half of the test split becomes the validation set and the other half the
# final held-out set, stratified on the test labels.
X_val_np, X_final_np, y_val_np, y_final_np = train_test_split(
    X_test.values,
    y_test.values,
    stratify=y_test,
    test_size=0.5,
    random_state=SEED,
)

# --- Objective Function for Optuna ---
def objective(trial):
    """Train one candidate model and return its validation accuracy.

    The model is built from the hyperparameters suggested by ``trial``,
    trained on the training split for at most 50 epochs with early stopping
    on ``val_loss`` (patience 10, best weights restored), and then scored on
    the validation split with a 0.5 decision threshold.

    Args:
        trial: Optuna trial used by ``build_model`` to sample hyperparameters.

    Returns:
        Accuracy of the thresholded predictions on the validation split.
    """
    # Release Keras global state left behind by earlier trials; without this
    # every trial's model stays referenced and memory grows across the study.
    tf.keras.backend.clear_session()

    model = build_model(trial, input_shape=(X_train_np.shape[1], 1))
    train_gen = GeneExpressionSequence(X_train_np, y_train_np,
                                       batch_size=32)

    # Validation order must stay fixed so predictions line up with y_val_np.
    val_gen = GeneExpressionSequence(X_val_np, y_val_np,
                                     batch_size=32,
                                     shuffle=False)

    early_stop = EarlyStopping(monitor='val_loss',
                               patience=10,
                               restore_best_weights=True,
                               verbose=0)
    model.fit(train_gen,
              validation_data=val_gen,
              epochs=50,
              callbacks=[early_stop], verbose=0)

    # verbose=0 keeps prediction as quiet as training; the default would
    # print a progress bar for every trial.
    y_pred = (model.predict(val_gen, verbose=0) > 0.5).astype(int).flatten()
    return accuracy_score(y_val_np, y_pred)

# --- Run Optuna Study ---
# Seed the sampler as well: the global seeds set earlier do not control
# Optuna's own RNG, so an unseeded study proposes different hyperparameters
# on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# --- Train Best Model ---
# Replaying the best trial makes build_model reuse its stored "l2" and
# "dense_units" values instead of sampling new ones.
final_model = build_model(study.best_trial, input_shape=(X_train_np.shape[1], 1))
train_gen = GeneExpressionSequence(X_train_np, y_train_np, batch_size=32)
# Validation data is never shuffled, matching the generator used in objective().
val_gen = GeneExpressionSequence(X_val_np, y_val_np, batch_size=32, shuffle=False)

early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
final_model.fit(train_gen, validation_data=val_gen, epochs=50, callbacks=[early_stop], verbose=1)

# --- Final Evaluation ---
# The "final" half of the test split was used neither for tuning nor for
# early stopping. shuffle=False keeps predictions aligned with y_final_np.
final_test_gen = GeneExpressionSequence(X_final_np, y_final_np, batch_size=32, shuffle=False)
y_pred_final = (final_model.predict(final_test_gen) > 0.5).astype(int).flatten()

print("\nFinal Test Results on Held-Out Set")
print("Accuracy:", round(accuracy_score(y_final_np, y_pred_final), 4))
print("\nClassification Report:\n", classification_report(y_final_np, y_pred_final, zero_division=0))

sns.heatmap(confusion_matrix(y_final_np, y_pred_final), annot=True, fmt='d', cmap="PuRd")
plt.title(f"Confusion Matrix - {variant_name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

[I 2025-08-05 13:19:36,127] A new study created in memory with name: no-name-43bfeb56-6134-4348-8b2a-36a952d160e7
[W 2025-08-05 13:19:43,967] Trial 0 failed with parameters: {'l2': 0.08091244139917213, 'dense_units': 128} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/tusharsingh/miniconda3/envs/ML_practice/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/lx/5td2g65s51s2qn_zywp584340000gn/T/ipykernel_56195/270488514.py", line 99, in objective
    model.fit(train_gen,
  File "/Users/tusharsingh/miniconda3/envs/ML_practice/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/tusharsingh/miniconda3/envs/ML_practice/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fi

KeyboardInterrupt: 