In [8]:
import pandas as pd
import numpy as np
import shelve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_curve, auc


from keras_tuner import RandomSearch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [15]:
# Assay identifiers to model. Each entry is used further down as a column name
# of `cell_painting_data` (the per-compound label column for that assay).
assay_list = ['588458', '588334', '2642', '2156', '2330', '2216', '743015', '504444', '894', '720635', '1688', '2599', '602340', '2796', '504652', '651658', '720582', '624256', '1531', '588852', '485270', '743012', '777', '504582', '504660', '2553', '743014', '1822', '938', '1529', '651610', '624466', '932', '720648', '2540', '2098', 'Novartis1', 'Novartis2', '2685', '485294', '2517', '504333', '881', '504339', '504466', '504332', '504327', '488953', '1851_2', '1851_4', '1851_1', '1851_3', '1851_5', '449750', '504847', '504834', '540317', '588453', '588590', '588795', '504845', '588856', '504832', '588855', '121', '624032', '119', '624296', '651965', '624170', '651820', '624297', '651635', '624417', '624202', '624287', '624288', '651644', '652104', '720579', '720533', '720542', '720580', '720504', '720532', '1159524', '1117304', '1117305']

# Read the filtered Cell Painting table and discard the "Unnamed: 0" column
# (presumably a leftover index column from an earlier CSV export -- confirm).
cell_painting_data = (
    pd.read_csv("../Datasets/Cell_Painting_filtered_9875.csv")
    .drop(columns=["Unnamed: 0"])
)

# Feature columns are selected purely by position: everything from column 89 on.
# NOTE(review): the 89 offset is tied to the current CSV layout -- re-check it
# whenever the leading (non-feature) columns of the file change.
cp_features_list = cell_painting_data.columns[89:].tolist()
cp_features_list

['Cells_AreaShape_MedianRadius',
 'Cells_Correlation_Correlation_DNA_AGP',
 'Cells_Correlation_Correlation_ER_AGP',
 'Cells_Correlation_K_AGP_DNA',
 'Cells_Correlation_K_DNA_AGP',
 'Cells_Correlation_K_DNA_ER',
 'Cells_Correlation_K_DNA_Mito',
 'Cells_Correlation_K_ER_DNA',
 'Cells_Correlation_K_Mito_DNA',
 'Cells_Correlation_K_RNA_DNA',
 'Cells_Correlation_K_RNA_Mito',
 'Cells_Granularity_14_AGP',
 'Cells_Granularity_14_ER',
 'Cells_Granularity_14_Mito',
 'Cells_Granularity_14_RNA',
 'Cells_Granularity_15_AGP',
 'Cells_Granularity_15_ER',
 'Cells_Granularity_15_Mito',
 'Cells_Granularity_15_RNA',
 'Cells_Granularity_16_AGP',
 'Cells_Granularity_16_ER',
 'Cells_Granularity_16_Mito',
 'Cells_Granularity_16_RNA',
 'Cells_Granularity_5_ER',
 'Cells_Granularity_5_Mito',
 'Cells_Granularity_5_RNA',
 'Cells_Granularity_6_RNA',
 'Cells_Granularity_7_RNA',
 'Cells_Intensity_IntegratedIntensityEdge_ER',
 'Cells_Intensity_IntegratedIntensityEdge_Mito',
 'Cells_Intensity_IntegratedIntensityEdge_R

In [18]:
def cellnet(hp):
    """Build and compile a binary classifier for a Keras Tuner search.

    Args:
        hp: Keras Tuner hyperparameter container. Two kinds of values are
            sampled from it: ``num_layers`` (1 to 3 hidden layers) and, for
            each hidden layer ``i``, ``units_{i}`` (32 to 256 in steps of 32).

    Returns:
        A compiled ``Sequential`` model with a 184-wide input, ReLU hidden
        layers and a single sigmoid output unit.
    """
    network = Sequential()

    # The input width is fixed at 184: one node per Cell Painting feature.
    network.add(Input(shape=(184,)))

    # Hidden stack: depth and per-layer width are both tunable.
    hidden_layer_count = hp.Int('num_layers', 1, 3)
    for layer_idx in range(hidden_layer_count):
        layer_width = hp.Int(
            f'units_{layer_idx}', min_value=32, max_value=256, step=32
        )
        network.add(Dense(units=layer_width, activation='relu'))

    # Output layer: one sigmoid unit producing a score in [0, 1].
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', AUC()],
    )
    return network

In [21]:
def train_cellnet(cell_painting_data, assay):
    """Tune, fit and evaluate a CellNet classifier for one assay.

    The rows are split 80/20 into train/test, a random hyperparameter search
    is run on the training part (with a further 20% held out for validation),
    and the best model is scored on the test part. Per-compound test
    predictions are appended to the module-level ``final_prediction_for_cp``
    DataFrame and the test AUC and chosen threshold are printed.

    Args:
        cell_painting_data: DataFrame containing the columns named in
            ``cp_features_list`` plus the ``assay`` label column. Rows are
            expected to have a label for ``assay`` (the caller filters nulls).
        assay: Name of the label column to model.

    Returns:
        DataFrame with one row per test compound and the columns ``assay``,
        ``index``, ``predicted_value``, ``final_value``, ``expected_value``
        and ``match``. The same rows are appended to
        ``final_prediction_for_cp``.
    """
    global final_prediction_for_cp

    X = cell_painting_data[cp_features_list].to_numpy()
    y = cell_painting_data[assay].to_numpy()
    indices = cell_painting_data.index.to_numpy()  # original row labels

    # Split features, labels and row labels together so that each prediction
    # can be traced back to its source row.
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
        X, y, indices, test_size=0.2, random_state=42
    )

    tuner = RandomSearch(
        cellnet,
        objective='val_accuracy',
        max_trials=10,  # number of different configurations to try
        executions_per_trial=3,  # number of times each configuration is trained
        directory='cellNetTrained',  # where logs and checkpoints are stored
        # One project per assay. With a shared project name the tuner reloads
        # the finished search of whichever assay ran first, so later assays
        # would be "tuned" and predicted with a model fitted to other labels.
        project_name=f'cellNet_tuning_{assay}',
    )

    tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

    best_model = tuner.get_best_models(num_models=1)[0]

    # Single sigmoid output -> flatten (n, 1) scores to shape (n,).
    predicted_values = best_model.predict(X_test).flatten()

    fpr, tpr, thresholds = roc_curve(y_test, predicted_values)

    # Exact ROC AUC from the empirical curve.
    roc_auc = auc(fpr, tpr)

    # Pick the cut-off that maximises Youden's J statistic (tpr - fpr).
    # NOTE(review): the threshold is selected on the test split itself, so
    # `final_value` / `match` (and any accuracy derived from them) are
    # optimistic. Consider choosing it on a validation split instead.
    optimal_idx = np.argmax(tpr - fpr)
    threshold = thresholds[optimal_idx]

    final_values = (predicted_values >= threshold).astype(int)

    results = pd.DataFrame({
        'assay': assay,
        'index': indices_test,
        'predicted_value': predicted_values,
        'final_value': final_values,
        'expected_value': y_test,
        'match': final_values == y_test
    })

    # Accumulate into the notebook-wide results table.
    if final_prediction_for_cp is None:
        final_prediction_for_cp = results
    else:
        final_prediction_for_cp = pd.concat([final_prediction_for_cp, results], ignore_index=True)

    # Keras' AUC metric, reported next to the sklearn value for comparison.
    auc_metric = AUC()
    auc_metric.update_state(y_test, predicted_values)
    auc_score = auc_metric.result().numpy()

    print(f'Test AUC: {auc_score}')
    print(f"Assay: {assay}, Test AUC-ROC: {roc_auc}")
    print(f"Best Threshold: {threshold}")

    return results


In [22]:
# Reset the shared results table before (re)running the training loop.
final_prediction_for_cp = None

for assay in assay_list:
    # Keep only the compounds that have a recorded value for this assay, and
    # only the columns needed downstream: label, compound identifier, features.
    has_assay_value = cell_painting_data[assay].notnull()
    wanted_columns = [assay, "InChICode_standardised"] + cp_features_list
    labelled_rows = cell_painting_data[has_assay_value][wanted_columns]

    train_cellnet(labelled_rows, assay)

    # Stop after the first assay: only one assay is trained per run of this cell.
    break
    

Trial 10 Complete [00h 00m 09s]
val_accuracy: 0.7314814726511637

Best val_accuracy So Far: 0.7592592636744181
Total elapsed time: 00h 01m 40s
Test AUC: 0.45478111505508423
Assay: 588458, Test AUC-ROC: 0.4585253456221198
Best Threshold: 0.003145115217193961


In [23]:
# Metrics
#
# Summary statistics over every row accumulated in `final_prediction_for_cp`.
# Boolean masks are built once and reused for all three figures.
hit_mask = final_prediction_for_cp['match'] == True
positive_mask = final_prediction_for_cp['expected_value'] == 1
negative_mask = final_prediction_for_cp['expected_value'] == 0

# Share of test rows whose thresholded prediction equals the label.
accuracy = hit_mask.mean()
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

# Sensitivity: correctly predicted positives over all positives (0 if none).
true_positives = int((positive_mask & hit_mask).sum())
total_positives = positive_mask.sum()
if total_positives:
    true_positive_rate = true_positives / total_positives
else:
    true_positive_rate = 0
print(f"True Positive Rate: {true_positive_rate * 100:.2f}%")

# Specificity: correctly predicted negatives over all negatives (0 if none).
true_negatives = int((negative_mask & hit_mask).sum())
total_negatives = negative_mask.sum()
if total_negatives:
    true_negative_rate = true_negatives / total_negatives
else:
    true_negative_rate = 0
print(f"True Negative Rate: {true_negative_rate * 100:.2f}%")

# len(final_prediction_for_fp[(final_prediction_for_fp['match'] == True) & (final_prediction_for_fp['expected_value'] == 0)])

# # len(final_prediction_for_fp)

Overall Accuracy: 41.11%
True Positive Rate: 100.00%
True Negative Rate: 14.52%
