In [1]:
import sys
import warnings
import numpy as np
import pandas as pd
from loaders.Loaders import CSVLoader
from splitters.splitters import SingletaskStratifiedSplitter
from metrics.Metrics import Metric
from metrics.metricsFunctions import roc_auc_score, precision_score, accuracy_score, confusion_matrix, classification_report
from tensorflow.keras.layers import Dense, Dropout, GaussianNoise, Conv1D, Flatten, Reshape, LSTM
from models.kerasModels import KerasModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adadelta, Adam, RMSprop

import tensorflow as tf
# Record the TensorFlow build this notebook was executed with.
print(tf.__version__)

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings(action='ignore')

2.7.0


# CHECK GPU

In [2]:
# Show the GPUs TensorFlow can see (stable, non-experimental name of the same API).
tf.config.list_physical_devices('GPU')

2022-01-17 17:53:39.996292: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:40.004547: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:40.005057: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# Report how many GPUs are visible, then abort early unless the first GPU is usable.
print(
    "Num GPUs Available: ",
    len(tf.config.experimental.list_physical_devices('GPU')),
)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    # The rest of the notebook is expected to run on the GPU; stop here otherwise.
    raise SystemError('GPU device not found')
print('Using GPU: {}'.format(device_name))

Num GPUs Available:  1
Using GPU: /device:GPU:0


2022-01-17 17:53:40.026046: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-17 17:53:40.026789: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:40.027281: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:40.027716: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

# Load

In [4]:
# Location of the input CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
rdk_path = '/home/laptop16/Desktop/Mestrado/2_ano/SIB/Grupo/CODE_SIB/src/smiles/dataset/binary_class/rdk_fs_all.csv'

# Integer identifiers of the 205 columns used as features (ascending order).
bits = [
    0, 5, 25, 34, 36, 55, 77, 78, 101, 103,
    105, 106, 113, 117, 128, 129, 142, 175, 188, 194,
    195, 196, 197, 200, 203, 211, 231, 270, 274, 278,
    281, 291, 309, 335, 365, 382, 386, 388, 400, 406,
    411, 419, 427, 438, 443, 461, 471, 492, 498, 515,
    516, 522, 532, 549, 563, 586, 589, 594, 608, 620,
    632, 636, 640, 668, 671, 674, 676, 684, 697, 701,
    704, 707, 719, 728, 734, 738, 742, 759, 779, 791,
    798, 799, 838, 841, 847, 853, 857, 865, 870, 876,
    883, 887, 927, 935, 940, 955, 962, 972, 985, 992,
    1016, 1024, 1037, 1043, 1045, 1051, 1072, 1097, 1104, 1113,
    1123, 1130, 1133, 1134, 1145, 1155, 1180, 1184, 1222, 1238,
    1255, 1265, 1287, 1288, 1295, 1302, 1305, 1338, 1341, 1342,
    1346, 1347, 1369, 1374, 1375, 1377, 1389, 1400, 1403, 1420,
    1421, 1451, 1456, 1472, 1479, 1483, 1492, 1494, 1500, 1501,
    1520, 1522, 1528, 1533, 1536, 1542, 1545, 1548, 1556, 1579,
    1599, 1605, 1612, 1652, 1656, 1667, 1670, 1672, 1676, 1681,
    1682, 1685, 1686, 1699, 1705, 1707, 1727, 1732, 1734, 1771,
    1778, 1792, 1818, 1820, 1823, 1832, 1846, 1849, 1861, 1867,
    1872, 1876, 1885, 1889, 1892, 1915, 1921, 1931, 1942, 1971,
    1973, 1992, 1995, 2003, 2041,
]

# The same identifiers as strings, in the same order.
str_bits = [str(bit) for bit in bits]

In [5]:
# Describe how to read the CSV: molecule column, label column and the selected feature columns.
loader = CSVLoader(
    rdk_path,
    mols_field='smiles',
    labels_fields='activity',
    features_fields=str_bits,
)

In [6]:
# Build the dataset object from the loader configured in the previous cell.
dataset = loader.create_dataset()
# Report the dataset dimensions; the recorded output lists the mols, features and labels shapes.
dataset.get_shape()

Mols_shape:  39990
Features_shape:  (39990, 205)
Labels_shape:  (39990,)


# Data Split

In [7]:
# 60 / 20 / 20 split into train, validation and test subsets.
# NOTE(review): no seed is passed here — confirm whether the splitter is deterministic.
splitter = SingletaskStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset=dataset,
    frac_train=0.6,
    frac_valid=0.2,
    frac_test=0.2,
)

(23994, 205)
(7998, 205)
(7998, 205)


# Model

## DNN

In [8]:
# Number of feature columns; also used as the default `input_dim` of the builder below.
input_dim = train_dataset.X.shape[1]

def create_model(optimizer='adam', dropout=0.5, input_dim=input_dim):
    """Build, compile and summarise the fully connected binary classifier.

    Args:
        optimizer: Optimizer name or instance handed to ``model.compile``.
        dropout: Dropout rate applied after the first dense layer.
        input_dim: Number of input features. The default is bound to the
            value of the module-level ``input_dim`` when this function is defined.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side effect).
    """
    network = Sequential()
    layer_stack = (
        Dense(1024, input_dim=input_dim, activation='relu'),
        Dropout(dropout),
        Dense(256, activation='relu'),
        Dense(64, activation='relu'),
        Dense(8, activation='relu'),
        # Single sigmoid unit: probability of the positive class.
        Dense(1, activation='sigmoid'),
    )
    for layer in layer_stack:
        network.add(layer)

    network.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [9]:
# Feature count of the training matrix, echoed as a sanity check.
input_dim = train_dataset.X.shape[1]
print(f'Input dim: {input_dim}')

# Wrap the DNN builder.
# NOTE(review): presumably the extra keyword arguments are forwarded to Keras / the builder — confirm in models.kerasModels.
model = KerasModel(
    create_model,
    epochs=30,
    verbose=1,
    optimizer='adam',
)

Input dim: 205


In [10]:
# Sanity-check the shapes of the training features and labels before fitting.
print(f'Train data size: {train_dataset.X.shape}\nTrain labels size: {train_dataset.y.shape}')

# Train the model built in the previous cell on the training split.
model.fit(train_dataset)

Train data size: (23994, 205)
Train labels size: (23994,)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1024)              210944    
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense_1 (Dense)             (None, 256)               262400    
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 8)                 520       
                                                                 
 dense_4 (Dense)             (None, 1)                 9         
                                                                

2022-01-17 17:53:42.313729: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:42.314211: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:42.314613: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:42.315063: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-17 17:53:42.315743: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from S

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
# Wrap each scoring function in a Metric, in the order they should be reported.
metrics = [
    Metric(score_fn)
    for score_fn in (
        roc_auc_score,
        precision_score,
        accuracy_score,
        confusion_matrix,
        classification_report,
    )
]


# Score the model on the training and test splits.
# NOTE(review): in the recorded output the returned dict holds 5998.5 for
# 'confusion_matrix' and None for 'classification_report'; read those two
# from the printed output rather than from the dict.
print('training set score:', model.evaluate(train_dataset, metrics))
print('test set score:', model.evaluate(test_dataset, metrics))

roc_auc_score: 
 0.9517019343004838
precision_score: 
 0.932051997766967
accuracy_score: 
 0.9516962573976827
confusion_matrix: 
 [[11148   852]
 [  307 11687]]
classification_report: 
               precision    recall  f1-score   support

           0       0.97      0.93      0.95     12000
           1       0.93      0.97      0.95     11994

    accuracy                           0.95     23994
   macro avg       0.95      0.95      0.95     23994
weighted avg       0.95      0.95      0.95     23994

training set score: {'roc_auc_score': 0.9517019343004838, 'precision_score': 0.932051997766967, 'accuracy_score': 0.9516962573976827, 'confusion_matrix': 5998.5, 'classification_report': None}
roc_auc_score: 
 0.8001910955477738
precision_score: 
 0.7802146523565096
accuracy_score: 
 0.8002000500125032
confusion_matrix: 
 [[3056  942]
 [ 656 3344]]
classification_report: 
               precision    recall  f1-score   support

           0       0.82      0.76      0.79      3998
  

## CNN

In [12]:
def make_cnn_model(input_dim=input_dim,
                   g_noise = 0.05,
                   DENSE=128,
                   DROPOUT=0.5,
                   C1_K=8,
                   C1_S=32,
                   C2_K=16,
                   C2_S=32,
                   activation='relu',
                   loss='binary_crossentropy',
                   optimizer='adadelta',
                   learning_rate=0.01,
                   metrics='accuracy'):
    """Build, compile and summarise a 1-D convolutional binary classifier.

    The flat feature vector is reshaped to ``(input_dim, 1)`` so that two
    ``Conv1D`` layers can slide along the feature axis.

    Args:
        input_dim: Number of input features. The default is bound to the
            value of the module-level ``input_dim`` when this function is defined.
        g_noise: Standard deviation handed to the ``GaussianNoise`` input layer.
        DENSE: Units of the dense layer that follows the convolutions.
        DROPOUT: Dropout rate applied to the flattened convolution output.
        C1_K: Number of filters of the first convolution.
        C1_S: Kernel size of the first convolution.
        C2_K: Number of filters of the second convolution.
        C2_S: Kernel size of the second convolution.
        activation: Activation used by the convolutions and the hidden dense layer.
        loss: Loss passed to ``model.compile``.
        optimizer: ``'adadelta'``, ``'adam'`` or ``'rmsprop'`` to build that
            optimizer with ``learning_rate``; the historical misspelling
            ``'rsmprop'`` is still accepted. Any other value (another Keras
            optimizer name or an optimizer instance) is passed to
            ``model.compile`` unchanged, in which case ``learning_rate`` is not applied.
        learning_rate: Learning rate for the three optimizers listed above.
        metrics: Metrics passed to ``model.compile``.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a side effect).
    """
    model = Sequential()
    # Adding a bit of GaussianNoise also works as regularization
    model.add(GaussianNoise(g_noise, input_shape=(input_dim,)))
    # Conv1D expects (steps, channels): treat every feature as one step with a single channel
    model.add(Reshape((input_dim, 1)))
    # First two Conv1D arguments are number of filters + kernel size
    model.add(Conv1D(C1_K, C1_S, activation=activation, padding="same"))
    model.add(Conv1D(C2_K, C2_S, activation=activation, padding="same"))
    model.add(Flatten())
    model.add(Dropout(DROPOUT))
    model.add(Dense(DENSE, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    # Optimizers that are built here so that `learning_rate` is honoured.
    # 'rsmprop' is kept as an alias: earlier versions only matched that
    # misspelling, so 'rmsprop' silently fell through and ignored `learning_rate`.
    optimizer_classes = {
        'adadelta': Adadelta,
        'adam': Adam,
        'rmsprop': RMSprop,
        'rsmprop': RMSprop,
    }
    if isinstance(optimizer, str) and optimizer in optimizer_classes:
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        opt = optimizer_classes[optimizer](learning_rate=learning_rate)
    else:
        opt = optimizer

    model.compile(loss=loss, optimizer=opt, metrics=metrics)
    model.summary()

    return model

In [13]:
# Feature count of the training matrix, echoed as a sanity check.
input_dim = train_dataset.X.shape[1]
print(input_dim)

# Wrap the CNN builder; all of its hyper-parameters keep their defaults.
model = KerasModel(
    make_cnn_model,
    epochs=30,
    verbose=1,
)

205


In [14]:
# Sanity-check the shapes of the training features and labels before fitting.
print(train_dataset.X.shape, train_dataset.y.shape)

# Train the model built in the previous cell on the training split.
model.fit(train_dataset)

(23994, 205) (23994,)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gaussian_noise (GaussianNoi  (None, 205)              0         
 se)                                                             
                                                                 
 reshape (Reshape)           (None, 205, 1)            0         
                                                                 
 conv1d (Conv1D)             (None, 205, 8)            264       
                                                                 
 conv1d_1 (Conv1D)           (None, 205, 16)           4112      
                                                                 
 flatten (Flatten)           (None, 3280)              0         
                                                                 
 dropout_1 (Dropout)         (None, 3280)              0         
                                

2022-01-17 17:57:28.664397: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8201


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [15]:
# Wrap each scoring function in a Metric, in the order they should be reported.
metrics = [
    Metric(score_fn)
    for score_fn in (
        roc_auc_score,
        precision_score,
        accuracy_score,
        confusion_matrix,
        classification_report,
    )
]

# Score the model on the training and test splits.
# NOTE(review): in the recorded output the returned dict holds 5998.5 for
# 'confusion_matrix' and None for 'classification_report'; read those two
# from the printed output rather than from the dict.
print('training set score:', model.evaluate(train_dataset, metrics))
print('test set score:', model.evaluate(test_dataset, metrics))

roc_auc_score: 
 0.7317737410371854
precision_score: 
 0.7186123348017621
accuracy_score: 
 0.7317662749020588
confusion_matrix: 
 [[8423 3577]
 [2859 9135]]
classification_report: 
               precision    recall  f1-score   support

           0       0.75      0.70      0.72     12000
           1       0.72      0.76      0.74     11994

    accuracy                           0.73     23994
   macro avg       0.73      0.73      0.73     23994
weighted avg       0.73      0.73      0.73     23994

training set score: {'roc_auc_score': 0.7317737410371854, 'precision_score': 0.7186123348017621, 'accuracy_score': 0.7317662749020588, 'confusion_matrix': 5998.5, 'classification_report': None}
roc_auc_score: 
 0.7171695222611306
precision_score: 
 0.701530612244898
accuracy_score: 
 0.7171792948237059
confusion_matrix: 
 [[2711 1287]
 [ 975 3025]]
classification_report: 
               precision    recall  f1-score   support

           0       0.74      0.68      0.71      3998
     

## LSTM

In [16]:
input_dim = train_dataset.X.shape[1]

def create_lstm_model(optimizer='adam', input_dim=input_dim):
    # create model
    model = Sequential()
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    model.add(LSTM(32))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    model.summary()
    return model

In [17]:
input_dim = train_dataset.X.shape[1]
print(input_dim)
model = KerasModel(make_cnn_model, epochs=30, verbose=1)

205


In [18]:
# Sanity-check the shapes of the training features and labels before fitting.
print(train_dataset.X.shape, train_dataset.y.shape)

# Train whichever model the previous cell constructed on the training split.
model.fit(train_dataset)

(23994, 205) (23994,)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gaussian_noise_1 (GaussianN  (None, 205)              0         
 oise)                                                           
                                                                 
 reshape_1 (Reshape)         (None, 205, 1)            0         
                                                                 
 conv1d_2 (Conv1D)           (None, 205, 8)            264       
                                                                 
 conv1d_3 (Conv1D)           (None, 205, 16)           4112      
                                                                 
 flatten_1 (Flatten)         (None, 3280)              0         
                                                                 
 dropout_2 (Dropout)         (None, 3280)              0         
                                

In [19]:
# Wrap each scoring function in a Metric, in the order they should be reported.
metrics = [
    Metric(score_fn)
    for score_fn in (
        roc_auc_score,
        precision_score,
        accuracy_score,
        confusion_matrix,
        classification_report,
    )
]

# Score the most recently trained model on the training and test splits.
# NOTE(review): in the recorded output the returned dict holds 5998.5 for
# 'confusion_matrix' and None for 'classification_report'; read those two
# from the printed output rather than from the dict.
print('training set score:', model.evaluate(train_dataset, metrics))
print('test set score:', model.evaluate(test_dataset, metrics))

roc_auc_score: 
 0.734354844088711
precision_score: 
 0.7259568993245417
accuracy_score: 
 0.7343502542302243
confusion_matrix: 
 [[8592 3408]
 [2966 9028]]
classification_report: 
               precision    recall  f1-score   support

           0       0.74      0.72      0.73     12000
           1       0.73      0.75      0.74     11994

    accuracy                           0.73     23994
   macro avg       0.73      0.73      0.73     23994
weighted avg       0.73      0.73      0.73     23994

training set score: {'roc_auc_score': 0.734354844088711, 'precision_score': 0.7259568993245417, 'accuracy_score': 0.7343502542302243, 'confusion_matrix': 5998.5, 'classification_report': None}
roc_auc_score: 
 0.7209231490745373
precision_score: 
 0.7091812588736394
accuracy_score: 
 0.7209302325581395
confusion_matrix: 
 [[2769 1229]
 [1003 2997]]
classification_report: 
               precision    recall  f1-score   support

           0       0.73      0.69      0.71      3998
      