In [2]:
import optuna
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Flatten, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import Sequence
import random
import tensorflow as tf

# --- Set Random Seed ---
# A single seed shared by NumPy's global RNG, Python's `random` module and
# TensorFlow, applied in that order.
SEED = 42
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# --- Generator Class ---
class GeneExpressionSequence(Sequence):
    """Keras ``Sequence`` that serves ``(features, labels)`` mini-batches.

    The feature matrix is reshaped to ``(-1, n_features, 1)`` so that every
    sample carries a trailing channel axis of size 1.

    Args:
        X: Array-like with at least two dimensions; ``X.shape[1]`` is taken
            as the number of features per sample.
        y: Array-like of labels, one entry per sample.
        batch_size: Maximum number of samples per batch. The last batch of
            an epoch may be smaller.
        shuffle: When true, the sample order is shuffled once on
            construction and again on every ``on_epoch_end`` call, using
            NumPy's global RNG.
        **kwargs: Forwarded unchanged to the ``Sequence`` base class.

    Raises:
        ValueError: If ``X`` has fewer than two dimensions, if ``X`` and
            ``y`` do not contain the same number of samples, or if
            ``batch_size`` is smaller than 1.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True, **kwargs):
        super().__init__(**kwargs)

        # Coerce to ndarrays so that pandas inputs work too: a Series would
        # otherwise be indexed by *label* in __getitem__, not by position.
        features = np.asarray(X)
        labels = np.asarray(y)

        if features.ndim < 2:
            raise ValueError(
                f"X must have at least 2 dimensions, got shape {features.shape}"
            )
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.X = features.reshape(-1, features.shape[1], 1)
        self.y = labels
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y disagree on sample count: {len(self.X)} vs {len(self.y)}"
            )

        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.X))
        # Apply the initial shuffle (no-op when shuffle is False).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (a partial last batch counts)."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X_batch, y_batch)`` pair for batch number ``index``."""
        start = index * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]
        return self.X[batch_indices], self.y[batch_indices]

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

# --- Model Builder for Optuna ---
def build_model(trial, input_shape, *, dropout_rate=0.07, learning_rate=1e-3):
    """Build and compile a small 1-D CNN binary classifier for one trial.

    Architecture: ``Conv1D -> MaxPooling1D(2, 2) -> Flatten -> Dense(relu)
    -> Dropout -> Dense(1, sigmoid)``, compiled with Adam and binary
    cross-entropy, tracking accuracy.

    The following hyperparameters are requested from ``trial``:

    * ``num_filters``: one of 32, 64, 128
    * ``kernel_size``: one of 2, 4
    * ``stride``: integer in [1, 2]
    * ``dense_units``: one of 64, 128

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_int``
            (e.g. an Optuna trial).
        input_shape: Shape of a single sample, passed to ``Input``.
        dropout_rate: Rate for the dropout layer after the hidden dense
            layer. Not tuned; defaults to the previously hard-coded 0.07.
        learning_rate: Adam learning rate. Not tuned; defaults to the
            previously hard-coded 1e-3.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Keep the suggest_* calls in a fixed order so every trial queries the
    # same parameters in the same sequence.
    num_filters = trial.suggest_categorical("num_filters", [32, 64, 128])
    kernel_size = trial.suggest_categorical("kernel_size", [2, 4])
    stride = trial.suggest_int("stride", 1, 2)
    dense_units = trial.suggest_categorical("dense_units", [64, 128])

    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(
        Conv1D(
            filters=num_filters,
            kernel_size=kernel_size,
            strides=stride,
            activation='relu',
        )
    )
    model.add(MaxPooling1D(pool_size=2, strides=2))
    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# --- Load Variant Split ---
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
base_path = Path("/Users/tusharsingh/Work/Project/tcga-mldl/results/data/data_splits")
variant_name = "15pct_80"
data_path = base_path / variant_name

X_train = pd.read_csv(data_path / "X_train.csv", index_col=0)
X_test = pd.read_csv(data_path / "X_test.csv", index_col=0)
# Squeeze only the column axis: a bare .squeeze() would also collapse a
# single-row file down to a scalar, which breaks .values and the split below.
y_train = pd.read_csv(data_path / "y_train.csv", index_col=0).squeeze("columns")
y_test = pd.read_csv(data_path / "y_test.csv", index_col=0).squeeze("columns")

# --- Split Test into Val + Final ---
# The stored test split is halved (stratified on the label): one half is used
# as the validation set, the other is kept as the final held-out set.
X_val_np, X_final_np, y_val_np, y_final_np = train_test_split(
    X_test.values,
    y_test.values,
    test_size=0.5,
    stratify=y_test,
    random_state=SEED,
)
X_train_np, y_train_np = X_train.values, y_train.values

# --- Objective Function for Optuna ---
def objective(trial):
    """Optuna objective: train one candidate model and score it.

    Builds a model from the trial's hyperparameters, fits it on the training
    split for at most 50 epochs (early stopping on ``val_loss`` with
    patience 10, best weights restored), then thresholds the validation
    predictions at 0.5.

    Args:
        trial: The Optuna trial supplying hyperparameters to ``build_model``.

    Returns:
        Accuracy of the trained model on the validation split.
    """
    # A fresh model is created on every trial; clear Keras' global state so
    # memory held by earlier trials' models does not pile up over the study.
    tf.keras.backend.clear_session()

    model = build_model(trial, input_shape=(X_train_np.shape[1], 1))
    train_gen = GeneExpressionSequence(X_train_np, y_train_np, batch_size=128)
    # Validation order must stay fixed so predictions line up with y_val_np.
    val_gen = GeneExpressionSequence(
        X_val_np, y_val_np, batch_size=128, shuffle=False
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=0,
    )
    model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=50,
        callbacks=[early_stop],
        verbose=0,
    )

    # verbose=0 keeps the per-trial progress bar out of the study log,
    # matching the silent fit() above.
    val_probs = model.predict(val_gen, verbose=0)
    y_pred = (val_probs > 0.5).astype(int).flatten()
    return accuracy_score(y_val_np, y_pred)

# --- Run Optuna Study ---
# Seed the sampler explicitly: without it the sequence of suggested
# hyperparameters differs from run to run even though SEED is set for
# NumPy, `random` and TensorFlow.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=50)

# --- Train Best Model ---
# Rebuild the architecture from the best trial's stored hyperparameters and
# retrain it from scratch with the same schedule used during the search.
final_model = build_model(study.best_trial, input_shape=(X_train_np.shape[1], 1))
train_gen = GeneExpressionSequence(X_train_np, y_train_np, batch_size=128)
# shuffle=False for consistency with the validation generator in objective();
# validation data gains nothing from being reshuffled every epoch.
val_gen = GeneExpressionSequence(X_val_np, y_val_np, batch_size=128, shuffle=False)

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
final_model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=50,
    callbacks=[early_stop],
    verbose=1,
)

# --- Final Evaluation ---
# Score the retrained model on the held-out half of the test split.
final_test_gen = GeneExpressionSequence(
    X_final_np,
    y_final_np,
    batch_size=128,
    shuffle=False,  # keep predictions aligned with y_final_np
)
final_probs = final_model.predict(final_test_gen)
y_pred_final = (final_probs > 0.5).astype(int).flatten()

final_accuracy = accuracy_score(y_final_np, y_pred_final)
final_report = classification_report(y_final_np, y_pred_final, zero_division=0)
final_cm = confusion_matrix(y_final_np, y_pred_final)

print("\n📊 Final Test Results on Held-Out Set")
print("Accuracy:", round(final_accuracy, 4))
print("\nClassification Report:\n", final_report)

# Confusion matrix as an annotated heatmap.
sns.heatmap(final_cm, annot=True, fmt='d', cmap="PuRd")
plt.title(f"Confusion Matrix - {variant_name}")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


[I 2025-08-02 22:45:40,704] A new study created in memory with name: no-name-b8b8c644-3e30-4ff0-a0ff-747f46280eb4


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step


[I 2025-08-02 22:45:45,693] Trial 0 finished with value: 0.7450980392156863 and parameters: {'num_filters': 32, 'kernel_size': 12, 'stride': 2, 'dense_units': 128, 'dropout_rate': 0.09645630396838567, 'learning_rate': 0.00023167862312872177}. Best is trial 0 with value: 0.7450980392156863.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


[I 2025-08-02 22:45:55,182] Trial 1 finished with value: 0.7450980392156863 and parameters: {'num_filters': 64, 'kernel_size': 12, 'stride': 1, 'dense_units': 64, 'dropout_rate': 0.09772510535255806, 'learning_rate': 0.00044136506816796947}. Best is trial 0 with value: 0.7450980392156863.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


[I 2025-08-02 22:46:03,745] Trial 2 finished with value: 0.7647058823529411 and parameters: {'num_filters': 32, 'kernel_size': 4, 'stride': 2, 'dense_units': 256, 'dropout_rate': 0.08713025662595388, 'learning_rate': 0.00011212710145451034}. Best is trial 2 with value: 0.7647058823529411.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


[I 2025-08-02 22:46:10,939] Trial 3 finished with value: 0.7450980392156863 and parameters: {'num_filters': 32, 'kernel_size': 12, 'stride': 2, 'dense_units': 256, 'dropout_rate': 0.07147479454929678, 'learning_rate': 0.00019507928883512267}. Best is trial 2 with value: 0.7647058823529411.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


[I 2025-08-02 22:46:14,813] Trial 4 finished with value: 0.7647058823529411 and parameters: {'num_filters': 32, 'kernel_size': 8, 'stride': 2, 'dense_units': 128, 'dropout_rate': 0.0961339447730744, 'learning_rate': 0.0008268221162117252}. Best is trial 2 with value: 0.7647058823529411.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step


[I 2025-08-02 22:46:19,965] Trial 5 finished with value: 0.7254901960784313 and parameters: {'num_filters': 32, 'kernel_size': 12, 'stride': 2, 'dense_units': 128, 'dropout_rate': 0.07252507276449362, 'learning_rate': 0.0002470007167831865}. Best is trial 2 with value: 0.7647058823529411.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


[I 2025-08-02 22:46:26,944] Trial 6 finished with value: 0.7450980392156863 and parameters: {'num_filters': 32, 'kernel_size': 8, 'stride': 2, 'dense_units': 256, 'dropout_rate': 0.09883616088813701, 'learning_rate': 0.00027937235319243386}. Best is trial 2 with value: 0.7647058823529411.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step


[I 2025-08-02 22:47:18,383] Trial 7 finished with value: 0.7843137254901961 and parameters: {'num_filters': 128, 'kernel_size': 12, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09563390827925078, 'learning_rate': 0.00020795371139822587}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step


[I 2025-08-02 22:47:28,475] Trial 8 finished with value: 0.7450980392156863 and parameters: {'num_filters': 64, 'kernel_size': 12, 'stride': 2, 'dense_units': 128, 'dropout_rate': 0.07240693674440399, 'learning_rate': 0.0006573313575045435}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


[I 2025-08-02 22:47:42,296] Trial 9 finished with value: 0.7843137254901961 and parameters: {'num_filters': 32, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09123854401912826, 'learning_rate': 0.0007971523775618525}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step


[I 2025-08-02 22:48:05,042] Trial 10 finished with value: 0.7647058823529411 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 64, 'dropout_rate': 0.08073217388332228, 'learning_rate': 0.00013066297400830202}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


[I 2025-08-02 22:49:12,717] Trial 11 finished with value: 0.7647058823529411 and parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09052994030908802, 'learning_rate': 0.0004364887488542906}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step


[I 2025-08-02 22:50:06,711] Trial 12 finished with value: 0.7450980392156863 and parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09123386376304468, 'learning_rate': 0.0001653374010885668}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step


[I 2025-08-02 22:51:06,664] Trial 13 finished with value: 0.7647058823529411 and parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08187211761074216, 'learning_rate': 0.00044323906487469134}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step


[I 2025-08-02 22:51:54,609] Trial 14 finished with value: 0.7843137254901961 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09270649488084848, 'learning_rate': 0.00034981028924524097}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step


[I 2025-08-02 22:52:14,006] Trial 15 finished with value: 0.6862745098039216 and parameters: {'num_filters': 64, 'kernel_size': 12, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08671566036921316, 'learning_rate': 0.0009590106963682386}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


[I 2025-08-02 22:52:33,496] Trial 16 finished with value: 0.6666666666666666 and parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 64, 'dropout_rate': 0.09366347975215515, 'learning_rate': 0.0006368855076345559}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


[I 2025-08-02 22:53:44,932] Trial 17 finished with value: 0.7843137254901961 and parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08897258991086733, 'learning_rate': 0.00017084383804986294}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


[I 2025-08-02 22:54:01,774] Trial 18 finished with value: 0.7450980392156863 and parameters: {'num_filters': 32, 'kernel_size': 12, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08283884502692593, 'learning_rate': 0.0005854147587865572}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step


[I 2025-08-02 22:54:11,388] Trial 19 finished with value: 0.7058823529411765 and parameters: {'num_filters': 64, 'kernel_size': 4, 'stride': 1, 'dense_units': 64, 'dropout_rate': 0.07662463632080184, 'learning_rate': 0.0003393379726657309}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step


[I 2025-08-02 22:55:05,421] Trial 20 finished with value: 0.7450980392156863 and parameters: {'num_filters': 128, 'kernel_size': 12, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.0997708417119193, 'learning_rate': 0.00013957140007387652}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step


[I 2025-08-02 22:56:23,312] Trial 21 finished with value: 0.7647058823529411 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09281660329786014, 'learning_rate': 0.0003471178253614649}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step


[I 2025-08-02 22:57:50,637] Trial 22 finished with value: 0.7647058823529411 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09422127332177972, 'learning_rate': 0.00020516027459048034}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


[I 2025-08-02 22:58:56,990] Trial 23 finished with value: 0.7843137254901961 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08928768520594765, 'learning_rate': 0.0005122209004257172}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step


[I 2025-08-02 23:00:20,026] Trial 24 finished with value: 0.7058823529411765 and parameters: {'num_filters': 128, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.09483537648339056, 'learning_rate': 0.0002914898554529026}. Best is trial 7 with value: 0.7843137254901961.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step


[I 2025-08-02 23:00:32,872] Trial 25 finished with value: 0.7647058823529411 and parameters: {'num_filters': 32, 'kernel_size': 4, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.0924707806158096, 'learning_rate': 0.00035106103864998133}. Best is trial 7 with value: 0.7843137254901961.
[W 2025-08-02 23:01:22,796] Trial 26 failed with parameters: {'num_filters': 128, 'kernel_size': 8, 'stride': 1, 'dense_units': 256, 'dropout_rate': 0.08475338206386861, 'learning_rate': 0.0008150808705696853} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/tusharsingh/miniconda3/envs/ML_practice/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/lx/5td2g65s51s2qn_zywp584340000gn/T/ipykernel_294/771052429.py", line 101, in objective
    model.fit(train_gen,
  File "/Users/tusharsingh/miniconda3/envs/ML_practice/lib/python3.11/site

KeyboardInterrupt: 