In [395]:
import pandas as pd
import numpy as np
import shelve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
from sklearn.metrics import roc_curve, auc


from keras_tuner import RandomSearch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [396]:
# Configurations
#
# Message levels are single-bit flags; MASK is the set of levels that get printed.
# Logging statements in this notebook are guarded with `(LEVEL & MASK)`.

DEBUG = 1
INFO = 2
ERROR = 4
MASK = DEBUG | INFO | ERROR  # begin with every level switched on

# Every XOR below flips one level inside MASK. Comment a line out (or repeat it
# in a later cell) to flip that level back.
MASK ^= DEBUG  # DEBUG messages: off
# MASK ^= INFO  # INFO messages: left on
MASK ^= ERROR  # ERROR messages: off

In [397]:
# Initializations
tf.keras.backend.clear_session()  # reset Keras' global state before building models

# Assay identifiers are read from the shelve store under the 'assay_list' key.
assay_list = None
with shelve.open('../DataStore/store') as store:
    assay_list = store['assay_list']

# Load the fingerprint table and discard the index column saved with the CSV.
chemical_fingerprint_data = pd.read_csv(
    '../Datasets/fingerprint_filtered_9875.csv'
).drop(columns=["Unnamed: 0"])


# Preprocessing; array string to an np array
def _fingerprint_from_text(text):
    """Parse a bracketed, whitespace-separated run of integers into an np array."""
    tokens = text.strip('[]').split()
    return np.array([int(token) for token in tokens])


chemical_fingerprint_data['fp'] = chemical_fingerprint_data['fp'].apply(_fingerprint_from_text)
chemical_fingerprint_data


Unnamed: 0,InChICode_standardised,588458,588334,2642,2156,2330,2216,743015,504444,894,...,720579,720533,720542,720580,720504,720532,1159524,1117304,1117305,fp
0,InChI=1S/C14H13N5O5S2/c1-2-5-3-25-12-8(11(21)1...,1.0,,,,,,,,,...,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,InChI=1S/C22H21NO2S/c23-20(21(24)25)16-26-22(1...,1.0,,,,,,,,,...,,,,,,,,,0.0,"[1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,InChI=1S/C15H8O7/c16-6-3-8-12(10(18)4-6)14(20)...,1.0,,,,,,,,1.0,...,,,,,,0.0,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,InChI=1S/C8H10O4/c1-5(2)8(10)6(11-3)4-7(9)12-8...,1.0,,,,,,,,0.0,...,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,InChI=1S/C14H14O4S2/c15-5-7-19-13-11(17)9-3-1-...,1.0,,,,1.0,,,1.0,1.0,...,,,,,,,,,,"[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9871,InChI=1S/C11H8O3/c1-7(12)9-6-8-4-2-3-5-10(8)14...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9872,InChI=1S/C15H18N2/c1-2-7-14-12(5-1)13-6-3-4-11...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9873,InChI=1S/C33H44N4O4/c1-22-18-37(23(2)20-38)32(...,,,,,,,,,,...,,,,,,,,1.0,1.0,"[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9874,InChI=1S/C18H16NO/c1-14-11-12-19(17-10-6-5-9-1...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [398]:
def chemnet(hp):
    """Build and compile the tunable binary classifier used by the tuner.

    Parameters
    ----------
    hp
        Hyperparameter source exposing ``Int(name, ...)``. Two kinds of
        values are requested from it: ``num_layers`` (1 to 3) and, for each
        hidden layer ``i``, ``units_{i}`` (32 to 256 in steps of 32).

    Returns
    -------
    A compiled ``Sequential`` model: a 157-wide input, ``num_layers`` ReLU
    hidden layers and a single sigmoid output unit, trained with Adam on
    binary cross-entropy and reporting accuracy and AUC.
    """
    network = Sequential()

    # The input width is fixed at 157: per the original note, 157 fingerprint
    # features (out of 2048) are fed to the network.
    network.add(Input(shape=(157,)))

    hidden_layer_count = hp.Int('num_layers', 1, 3)
    for layer_idx in range(hidden_layer_count):
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32)
        network.add(Dense(units=width, activation='relu'))

    # Output layer: one sigmoid unit for the binary label.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', AUC()],
    )
    return network

In [399]:
def train_chemnet(fingerprint_data, assay):
    """Tune, train and evaluate a `chemnet` classifier for a single assay.

    Parameters
    ----------
    fingerprint_data : pd.DataFrame
        Rows to learn from. Must contain an ``'fp'`` column holding one
        equal-length array per row (they are stacked into the feature matrix)
        and a column named ``assay`` holding the labels.
    assay
        Name of the label column; also used in log messages, in the result
        rows and in the tuner project name.

    Side effects
    ------------
    * Appends one row per held-out compound to the notebook-level DataFrame
      ``final_prediction_for_fp`` (created when it is missing or ``None``).
    * Writes Keras Tuner trials under ``chemNetTrained/``.
    * Prints progress/metrics according to ``MASK``.

    Returns
    -------
    None
    """
    global final_prediction_for_fp

    if DEBUG & MASK:
        print("Assay : ", assay, "\n", fingerprint_data['fp'].values)

    X = np.stack(fingerprint_data['fp'].values)
    y = fingerprint_data[assay].values
    indices = fingerprint_data.index.values  # original row labels, carried through the split

    # Split features, labels and row labels together so predictions can be
    # traced back to the source rows.
    X_train, X_test, y_train, y_test, _indices_train, indices_test = train_test_split(
        X, y, indices, test_size=0.2, random_state=42
    )

    if INFO & MASK:
        print(f'Assay: {assay}\nTotal Compounds: {len(X)}')
    if DEBUG & MASK:
        print(f'Train, Test Shapes: {[i.shape for i in [X_train, X_test, y_train, y_test]]}')

    tuner = RandomSearch(
        chemnet,
        objective='val_accuracy',
        max_trials=10,  # number of different configurations to try
        executions_per_trial=3,  # number of times each configuration is trained
        directory='chemNetTrained',  # where logs and checkpoints are stored
        # One project per assay. With a shared project name the tuner (which
        # resumes an existing project by default) would reload the trials of
        # a previously tuned assay, skip the search, and hand back a model
        # that was fitted on another assay's data.
        project_name=f'fingerprint_tuning_{assay}',
    )

    tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

    best_model = tuner.get_best_models(num_models=1)[0]

    # One sigmoid output per row -> flatten to a 1-D score vector.
    predicted_values = best_model.predict(X_test).flatten()

    fpr, tpr, thresholds = roc_curve(y_test, predicted_values)
    roc_auc = auc(fpr, tpr)

    # Operating point: the threshold that maximises Youden's J (TPR - FPR).
    # NOTE(review): the threshold is selected with the test labels and then
    # scored on the same rows, so `final_value` / `match` are optimistic.
    optimal_idx = np.argmax(tpr - fpr)
    threshold = thresholds[optimal_idx]

    final_values = (predicted_values >= threshold).astype(int)

    results = pd.DataFrame({
        'assay': assay,
        'index': indices_test,
        'predicted_value': predicted_values,
        'final_value': final_values,
        'expected_value': y_test,
        'match': final_values == y_test
    })

    # Look the accumulator up defensively: a bare read would raise NameError
    # when the notebook has not initialised it yet.
    previous_results = globals().get('final_prediction_for_fp')
    if previous_results is None:
        final_prediction_for_fp = results
    else:
        final_prediction_for_fp = pd.concat([previous_results, results], ignore_index=True)

    # AUC as computed by the Keras metric, reported next to scikit-learn's value.
    auc_metric = AUC()
    auc_metric.update_state(y_test, predicted_values)
    auc_score = auc_metric.result().numpy()

    if INFO & MASK:
        print(f'Test AUC: {auc_score}')
        print(f"Assay: {assay}, Test AUC-ROC: {roc_auc}")
        print(f"Best Threshold: {threshold}")
    


In [400]:
# Reset the accumulator so that re-running this cell does not append to the
# results of a previous run.
final_prediction_for_fp = None

for assay in assay_list:
    # Keep only the rows whose bioactivity score for this `assay` is present
    indices_of_not_null_values = chemical_fingerprint_data[assay].notna()
    filtered_assay_without_null_values = chemical_fingerprint_data.loc[
        indices_of_not_null_values, ["fp", assay, "InChICode_standardised"]
    ]

    # Bitwise flag test. A logical `and` here is truthy whenever MASK is
    # non-zero, which prints the frame even when DEBUG is switched off.
    if DEBUG & MASK:
        print(filtered_assay_without_null_values)

    train_chemnet(filtered_assay_without_null_values, assay)
    break  # only the first assay in `assay_list` is processed

Trial 10 Complete [00h 00m 05s]
val_accuracy: 0.7777777910232544

Best val_accuracy So Far: 0.7870370546976725
Total elapsed time: 00h 00m 59s
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Test AUC: 0.6650345325469971
Assay: 588458, Test AUC-ROC: 0.6653225806451613
Best Threshold: 0.07998804748058319


  trackable.load_own_variables(weights_store.get(inner_path))


In [401]:
# Metrics
#
# Summary statistics over every row collected in `final_prediction_for_fp`:
# overall accuracy plus the hit rate within the expected-positive and the
# expected-negative rows.

matched_mask = final_prediction_for_fp['match'] == True
positive_mask = final_prediction_for_fp['expected_value'] == 1
negative_mask = final_prediction_for_fp['expected_value'] == 0

accuracy = matched_mask.mean()
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

# Correct predictions among the rows whose expected value is 1.
true_positives = len(final_prediction_for_fp[positive_mask & matched_mask])
total_positives = positive_mask.sum()
if total_positives:
    true_positive_rate = true_positives / total_positives
else:
    true_positive_rate = 0  # no positive rows: report 0 instead of dividing by zero
print(f"True Positive Rate: {true_positive_rate * 100:.2f}%")

# Correct predictions among the rows whose expected value is 0.
true_negatives = len(final_prediction_for_fp[negative_mask & matched_mask])
total_negatives = negative_mask.sum()
if total_negatives:
    true_negative_rate = true_negatives / total_negatives
else:
    true_negative_rate = 0  # no negative rows: report 0 instead of dividing by zero
print(f"True Negative Rate: {true_negative_rate * 100:.2f}%")

Overall Accuracy: 56.67%
True Positive Rate: 85.71%
True Negative Rate: 43.55%


In [402]:
# Persist the collected predictions; the DataFrame index is written as the
# first (unnamed) CSV column.
predictions_csv_path = '../Datasets/finger_print_predictions.csv'
final_prediction_for_fp.to_csv(predictions_csv_path)