In [278]:
import pandas as pd
import numpy as np
import shelve
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

from keras_tuner import RandomSearch
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [279]:
# Logging configuration
#
# Every message category owns one bit, so any combination of categories fits
# in a single integer and is tested with `CATEGORY & MASK`.
DEBUG = 1
INFO = 2
ERROR = 4

# Begin with all three categories switched on.
MASK = DEBUG | INFO | ERROR

# Each `MASK ^= CATEGORY` line below flips that category's bit. Comment a line
# out (or in) to change whether the category is shown. The same statement can
# be placed in any later cell to flip the category from that point on; because
# XOR flips the bit, running such a line twice restores the previous state.
MASK ^= DEBUG    # flips DEBUG: currently hidden
# MASK ^= INFO   # left commented out: INFO stays visible
MASK ^= ERROR    # flips ERROR: currently hidden

In [280]:
# Initializations
# Clear Keras' global state before any model is built in this session.
tf.keras.backend.clear_session()

# Read the list of assay identifiers from the shelve store.
assay_list = None
with shelve.open('../DataStore/store') as store:
    assay_list = store['assay_list']

# Load the fingerprint table and discard the column that holds the index
# written out when the CSV was saved.
chemical_fingerprint_data = pd.read_csv('../Datasets/fingerprint_filtered_9875.csv')
chemical_fingerprint_data = chemical_fingerprint_data.drop(columns=["Unnamed: 0"])


# Preprocessing
def _parse_fingerprint(text):
    """Turn a string such as '[0 1 0 ...]' into a NumPy array of ints.

    The surrounding square brackets are stripped and the remainder is split
    on whitespace; every token is converted with `int`.
    """
    tokens = text.strip('[]').split()
    return np.array(list(map(int, tokens)))


chemical_fingerprint_data['fp'] = chemical_fingerprint_data['fp'].apply(_parse_fingerprint)
chemical_fingerprint_data


Unnamed: 0,InChICode_standardised,588458,588334,2642,2156,2330,2216,743015,504444,894,...,720579,720533,720542,720580,720504,720532,1159524,1117304,1117305,fp
0,InChI=1S/C14H13N5O5S2/c1-2-5-3-25-12-8(11(21)1...,1.0,,,,,,,,,...,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,InChI=1S/C22H21NO2S/c23-20(21(24)25)16-26-22(1...,1.0,,,,,,,,,...,,,,,,,,,0.0,"[1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, ..."
2,InChI=1S/C15H8O7/c16-6-3-8-12(10(18)4-6)14(20)...,1.0,,,,,,,,1.0,...,,,,,,0.0,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,InChI=1S/C8H10O4/c1-5(2)8(10)6(11-3)4-7(9)12-8...,1.0,,,,,,,,0.0,...,,,,,,,,,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,InChI=1S/C14H14O4S2/c15-5-7-19-13-11(17)9-3-1-...,1.0,,,,1.0,,,1.0,1.0,...,,,,,,,,,,"[1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9871,InChI=1S/C11H8O3/c1-7(12)9-6-8-4-2-3-5-10(8)14...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9872,InChI=1S/C15H18N2/c1-2-7-14-12(5-1)13-6-3-4-11...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9873,InChI=1S/C33H44N4O4/c1-22-18-37(23(2)20-38)32(...,,,,,,,,,,...,,,,,,,,1.0,1.0,"[1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9874,InChI=1S/C18H16NO/c1-14-11-12-19(17-10-6-5-9-1...,,,,,,,,,,...,,,,,,,,0.0,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [281]:
def chemnet(hp, input_dim=157):
    """Build and compile a tunable dense network for binary classification.

    The function is written to be used as a hypermodel builder: it is called
    with a hyperparameter container and returns a freshly compiled model.

    Args:
        hp: Hyperparameter container exposing `Int(name, ...)`. Two kinds of
            values are requested from it: `num_layers` (range 1..3) and one
            `units_{i}` per hidden layer (32..512 in steps of 32).
        input_dim: Number of fingerprint features fed to the network.
            Defaults to 157, the number of fingerprint features kept for
            input (out of 2048), so existing `chemnet(hp)` calls behave as
            before.

    Returns:
        A compiled `Sequential` model: `num_layers` ReLU hidden layers followed
        by a single sigmoid unit, trained with Adam on binary cross-entropy
        and reporting accuracy and AUC.
    """
    model = Sequential()
    model.add(Input(shape=(input_dim,)))

    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32),
            activation='relu'
        ))

    # Sigmoid is chosen for binary classification (in the last layer).
    # It can be changed later to a float value predictor so that the output
    # can be fed into `bionet`; if the activation is changed, reconsider the
    # loss function as well.
    model.add(Dense(1, activation='sigmoid'))

    # The AUC metric gets an explicit name: an unnamed metric is given an
    # auto-generated name that changes with every model built in the same
    # session, which makes the logged metric key differ from trial to trial.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', AUC(name='auc')])
    return model

In [282]:
def train_chemnet(fingerprint_data, assay):
    """Tune, train and evaluate a `chemnet` model for a single assay.

    The data is split 80/20 into train and test sets (fixed `random_state`),
    a random hyperparameter search is run on the training part (with a further
    20% validation split), and the best model is scored on the test part.

    Progress messages are gated by the module-level `MASK` bit mask and the
    `DEBUG` / `INFO` category bits.

    Args:
        fingerprint_data: Table with an `'fp'` column holding one equal-length
            fingerprint array per row and a column named `assay` holding the
            target values. Presumably the targets are 0/1 labels without
            missing values -- confirm against the caller.
        assay: Name of the target column; also used to build the tuner's
            project name.

    Returns:
        A tuple `(best_model, auc_score)`: the best model found by the search
        and its AUC on the held-out test split.
    """
    if DEBUG & MASK:
        print("Assay : ", assay , "\n", fingerprint_data['fp'].values)

    # Stack the per-row fingerprint arrays into one 2-D feature matrix.
    X = np.stack(fingerprint_data['fp'].values)
    y = fingerprint_data[assay]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if INFO & MASK:
        print(f'Assay: {assay}\nTotal Compounds: {len(X)}')
    if DEBUG & MASK:
        print(f'Train, Test Shapes: {[i.shape for i in [X_train, X_test, y_train, y_test]]}')

    tuner = RandomSearch(
        chemnet,
        objective='val_accuracy',
        max_trials=5,  # Number of different configurations to try
        executions_per_trial=3,  # Number of times to train each configuration
        directory='chemNetTrained',  # Directory to save logs and models
        # One project per assay: with a shared project name the tuner would
        # reload the trials already stored for a previous assay instead of
        # searching on this assay's data.
        project_name=f'fingerprint_tuning_{assay}'
    )

    tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

    best_model = tuner.get_best_models(num_models=1)[0]

    # Score the best model on the held-out test split.
    auc_metric = AUC()
    auc_metric.update_state(y_test, best_model.predict(X_test))
    auc_score = auc_metric.result().numpy()

    # Bitwise test, like the other messages: a logical `and` here would print
    # whenever MASK is non-zero, even with the INFO category disabled.
    if INFO & MASK:
        print(f'Test AUC: {auc_score}')

    return best_model, auc_score


In [283]:
# Train on the compounds that have a recorded bioactivity score for the assay.
for assay in assay_list:
    # Keep the rows whose value for `assay` is present, and only the
    # fingerprint column plus that assay's column.
    filtered_assay_without_null_values = chemical_fingerprint_data.loc[
        chemical_fingerprint_data[assay].notna(), ["fp", assay]
    ]
    train_chemnet(filtered_assay_without_null_values, assay)
    # Only the first assay in `assay_list` is processed.
    break

Trial 5 Complete [00h 00m 05s]
val_accuracy: 0.7824074228604635

Best val_accuracy So Far: 0.7824074228604635
Total elapsed time: 00h 00m 27s


  trackable.load_own_variables(weights_store.get(inner_path))






[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Test AUC: 0.6552419662475586
