In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow import keras
import keras_tuner as kt
from sklearn.model_selection import train_test_split

In [None]:
# Load the train/test dataset and encode the 'Name' label as 1 ('Selected') / 0 ('Random').
# The path uses forward slashes so the notebook also runs outside Windows
# (a backslash is an ordinary filename character on POSIX systems).
dataframe = pd.read_csv('Post_WEKA/Output Data.csv')
dataframe['Name'] = dataframe['Name'].replace({'Selected': 1, 'Random': 0})

# Split dataset into features (X: every column but the last) and target (Y: last column).
# NOTE(review): this assumes the encoded 'Name' label is the last column — confirm against the CSV.
dataset = dataframe.values
X = dataset[:, 0:-1].astype(float)
# Cast the target too: `dataframe.values` can be an object array when the frame mixes
# dtypes, and object-typed labels are not usable by Keras or the sklearn metrics.
# Any label other than 'Selected'/'Random' fails loudly here instead of later.
Y = dataset[:, -1].astype(float)

# Split data into training and test sets (0.8 for train, 0.2 for test); fixed seed for repeatability
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In [4]:
def model_builder(hp):
    """Build and compile a tunable fully-connected binary classifier.

    Hyperparameters requested from ``hp``:
      * ``mlplayers``     - number of hidden Dense layers, integer in [2, 8].
      * ``units``         - hidden layer width, 32..512 in steps of 32. The same
                            name is requested for every hidden layer, so the
                            layers presumably share one width (hyperparameters
                            are looked up by name) - confirm this is intended.
      * ``learning_rate`` - Adam learning rate, one of 1e-2, 1e-3, 1e-4.

    Args:
        hp: Hyperparameter container supplied by the tuner; must provide
            ``Int`` and ``Choice``.

    Returns:
        The compiled ``keras.Model`` (binary cross-entropy loss, accuracy metric).
    """
    # The network expects exactly 476 input features per sample.
    inputs = keras.Input(shape=(476,))

    # Stack of ReLU hidden layers; depth is chosen by the tuner.
    hidden = inputs
    num_hidden_layers = hp.Int("mlplayers", 2, 8)
    for _ in range(num_hidden_layers):
        layer_width = hp.Int("units", 32, 512, step=32)
        hidden = keras.layers.Dense(units=layer_width, activation="relu")(hidden)

    # Fixed 20% dropout ahead of the classifier head to curb overfitting.
    hidden = keras.layers.Dropout(0.2)(hidden)

    # Single sigmoid unit: probability of the positive class.
    outputs = keras.layers.Dense(units=1, activation="sigmoid")(hidden)
    model = keras.Model(inputs=inputs, outputs=outputs)

    # The learning rate is tuned alongside the architecture.
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    adam = keras.optimizers.Adam(learning_rate=hp_learning_rate)
    model.compile(optimizer=adam, loss="binary_crossentropy", metrics=["accuracy"])

    return model

In [5]:
# Hyperband tuner over the `model_builder` search space, ranking trials by
# validation accuracy. `overwrite=True` discards any earlier results stored
# under `directory`, so every run of this cell starts a fresh search.
# NOTE(review): `max_epochs` is left at the library default; Hyperband derives
# each trial's epoch budget from it — confirm the default budget is intended.
tuner = kt.Hyperband(
    model_builder,
    overwrite=True,
    factor=3,
    objective="val_accuracy",
    directory="/tmp/tb",
)

# Stop a trial once the *validation* loss has not improved for 5 epochs.
# Monitoring the training loss (as before) keeps training while the model
# overfits, because training loss normally keeps decreasing; the search is run
# with a validation split, so `val_loss` is available to monitor.
stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
# Run the hyperparameter search on the training split. The last 20% of the
# training rows are held out as validation data for the tuner's objective;
# early stopping and TensorBoard logging are attached to every trial.
# NOTE(review): Hyperband tuners typically schedule per-trial epochs themselves,
# so `epochs=30` may not be the effective budget — confirm.
tuner.search(
    x_train, y_train,
    callbacks=[stop_early, keras.callbacks.TensorBoard("/tmp/tb_logs")],
    epochs=30,
    validation_split=0.2,
)

In [None]:
# Take the top-ranked hyperparameter set from the finished search and build
# a fresh (untrained) model from it.
best_hp = tuner.get_best_hyperparameters()[0]
model = model_builder(best_hp)

# Re-compile with Adam at the tuned learning rate, binary cross-entropy loss
# and accuracy as the reported metric (model_builder compiles the model too).
model.compile(
    loss="binary_crossentropy",
    metrics=["accuracy"],
    optimizer=keras.optimizers.Adam(learning_rate=best_hp.get('learning_rate')),
)

# Fit on the full training split for 6 epochs.
history = model.fit(x_train, y_train, epochs=6)

# Score the trained model on the held-out test split.
test_loss, test_accuracy = model.evaluate(x_test, y_test)

# Report the held-out loss and accuracy, one per line.
print(f"Test Loss: {test_loss}", f"Test Accuracy: {test_accuracy}", sep="\n")

In [None]:
# Load the Alternative Test Data (from the external study) into numpy arrays for model testing.
# Forward slashes keep the path usable on non-Windows systems as well.
dataframe = pd.read_csv('Alternative_Test_Data/Alt_Test_Data.csv')

# These are the rows that hold the positive group miRNA (the rest hold control miRNA)
selected_rows = [1, 2, 3, 5, 6, 8, 11, 12, 15, 17]
# Derive the positive count from the row list so the labels and the number of
# sampled negatives stay in sync if the list is ever edited.
n_positive = len(selected_rows)

# Features are taken from columns 1..second-to-last and cast to float, matching
# the dtype of the training features.
# NOTE(review): assumes column 0 and the last column are non-feature columns — confirm.
x_test_alt = dataframe.iloc[selected_rows, 1:-1].values.astype(float)
y_test_alt = np.ones(n_positive)

# Negative class is represented by 0
negative_class_mask = y_test == 0

# Get the negative class records from the held-out test split; labels are cast
# to float so the combined label vector has a numeric dtype.
x_test_negative = x_test[negative_class_mask]
y_test_negative = y_test[negative_class_mask].astype(float)

# Concatenate the alternative positives with an equal number of negatives.
# If the test split holds fewer negatives than positives, all of them are used
# and the combined set is unbalanced.
x_test_combined = np.concatenate((x_test_alt, x_test_negative[:n_positive]), axis=0)
y_test_combined = np.concatenate((y_test_alt, y_test_negative[:n_positive]), axis=0)

In [15]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

from numpy import mean

# Repeated-training experiment: train the best architecture `trials` times from
# scratch and score every run on the combined (alternative positives + held-out
# negatives) test set.
histories = []
test_accuracies = []
test_losses = []
trials = 10
best_hp = tuner.get_best_hyperparameters()[0]
tensorboard_callback = keras.callbacks.TensorBoard(log_dir="/tmp/tb_logs")
auroc_scores = []
mcc_scores = []
f1_scores = []
confusion_matrices = []
# One per-epoch training-accuracy curve per trial; stacked once after the loop.
trial_accuracy_curves = []


# Iterate over the number of trials to train and evaluate the model
for trial in range(trials):
    # Build the model with the best hyperparameters found by the tuner
    model = model_builder(best_hp)
    # Compile the model with Adam optimizer and binary crossentropy loss
    # (repeats the settings model_builder already applies)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=best_hp.get('learning_rate')),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    # Train the model on the training data
    history = model.fit(x_train, y_train, epochs=15, callbacks=[tensorboard_callback])
    histories.append(history)
    # Record the per-epoch training accuracy for this trial
    trial_accuracy = history.history['accuracy']
    trial_accuracy_curves.append(trial_accuracy)

    # Predict probabilities on the test set and calculate ROC AUC score
    y_pred_proba = model.predict(x_test_combined)
    roc_auc = roc_auc_score(y_test_combined, y_pred_proba)
    auroc_scores.append(roc_auc)

    # Convert probabilities to binary predictions (threshold 0.5) and calculate confusion matrix
    y_pred_int = (y_pred_proba > 0.5).astype(int)
    confusion_matrix_trial = confusion_matrix(y_test_combined, y_pred_int)
    confusion_matrices.append(confusion_matrix_trial)

    # Evaluate the model on the test set
    test_loss, test_accuracy = model.evaluate(x_test_combined, y_test_combined)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

    # Calculate F1 score and Matthews correlation coefficient
    f1 = f1_score(y_test_combined, y_pred_int)
    f1_scores.append(f1)
    mcc = matthews_corrcoef(y_test_combined, y_pred_int)
    mcc_scores.append(mcc)

# Stack the curves into a (trials, epochs) array in one step. Building the array
# once avoids re-copying it every iteration and keeps it 2-D even for a single
# trial, so the axis-0 mean below is always a per-epoch average across trials.
total_accuracies = np.vstack(trial_accuracy_curves)

# Calculate the average accuracy across all trials (one value per epoch)
average_accuracy = np.mean(total_accuracies, axis=0)

# Append average values of the performance metrics: after this point the LAST
# element of each metric list is the mean over the preceding per-trial values.
test_accuracies.append(np.mean(test_accuracies))
test_losses.append(np.mean(test_losses))
auroc_scores.append(np.mean(auroc_scores))
mcc_scores.append(np.mean(mcc_scores))
f1_scores.append(np.mean(f1_scores))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/1

In [None]:
# Constructing the Confusion Matrices

from tabulate import tabulate

# Axis labels are the same for every figure, so build them once.
# Index 0 is the negative (control) class and index 1 the positive class.
class_labels = ['Control', 'Associated']
tick_marks = np.arange(len(class_labels))

# Draw one heatmap per recorded confusion matrix.
for i, cm in enumerate(confusion_matrices):
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    # Number the figures 1..N; the title was previously fixed to "Trial 2"
    # for every figure because the f-string had no placeholder.
    plt.title(f"Confusion Matrix - Trial {i + 1}")
    plt.colorbar()
    plt.xticks(tick_marks, class_labels)
    plt.yticks(tick_marks, class_labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()