# Demo_DFNN-based_classifier

In this demo, we construct a deep feedforward neural network (DFNN)-based classifier for predicting the activity of caspase-6 inhibitors. The DFNN model is implemented with Keras.

## Import modules 

In [None]:
## import very basic modules
import os
# './' is the current directory, so this leaves the working directory as-is;
# the relative paths used in this notebook resolve against it.
os.chdir('./')
import numpy as np
import pandas as pd
import sys
# Make modules located in ./release/ importable.
# NOTE(review): presumably this is where utils_DFNN, utils_ML and predict live — confirm.
sys.path.append('./release/')

In [None]:
# For Keras
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import regularizers
from keras.models import Model, load_model
from keras.layers import Dense, Dropout, Activation

In [None]:
# for DFNN analysis
from utils_DFNN import print_stats, drow_history_acc, drow_history_loss
from utils_DFNN import drow_roc_curve, drow_recall
from utils_DFNN import drow_precision_recal, drow_precision

## Molecular characterization

In [None]:
# for data preprocessing
from predict import PredictorData
from utils_ML import get_desc, get_fp
from mordred import Calculator, descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split

A total of 200 RDKit molecular descriptors were adopted.

In [None]:
# Names (first element of each entry) of every descriptor listed in
# rdkit.Chem.Descriptors._descList.
descs = [entry[0] for entry in Descriptors._descList]

# Descriptor calculator configured with that full list of names.
# NOTE(review): `calc` is not referenced by the other cells shown in this
# notebook (features are produced via get_fp) — confirm it is still needed.
calc = MoleculeDescriptors.MolecularDescriptorCalculator(list(descs))
# print(len(descs), descs)       # check the details of descs

Import modeling set

In [None]:
# Load the modeling set; get_fp is handed over as the feature extractor.
pred_data = PredictorData(path='./data/Modeling_set.csv', get_features=get_fp)

# Relabel the -1 class (the "inactive" class per the original note) as 0,
# modifying the label container in place.
inactive_mask = pred_data.y == -1
pred_data.y[inactive_mask] = 0

In [None]:
# Features and labels of the modeling set.
x, y = pred_data.x, pred_data.y

# Hold out 40% of the modeling set for validation; the fixed seed makes the
# partition reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=2768,
)

##  DFNN Classifiers construction 

Herein, the DFNN model is built with the Keras Sequential API. It consists of an input layer and four dense (densely connected) layers combined with three dropout layers. The RMSprop algorithm is used as the optimizer.

In [None]:
# Base layer width: the first hidden layer uses `lsize` units and the later
# hidden layers are sized as fractions of it (lsize/15 and lsize/4).
lsize = 128 #,128

In [None]:
# Hidden-layer widths, all derived from the base size `lsize`.
units_first = lsize
units_second = int(lsize / 15)
units_third = int(lsize / 4)

# Layer stack: two Dense -> BatchNormalization -> Dropout groups, then an
# L2-regularised Dense -> Dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(units_first, input_dim=x_train.shape[1], activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.25),
    Dense(units_second, activation='relu'),
    BatchNormalization(),
    Dropout(rate=0.25),
    Dense(units_third, kernel_regularizer=regularizers.l2(0.1), activation='relu'),
    Dropout(rate=0.1),
    Dense(1, activation='sigmoid'),
])

# Compile model (optimizer: SGD, RMSprop, Adadelta, Adagrad, Adam, Adamax, Nadam)
model.compile(
    loss='binary_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy'],
)
model.summary()

### Training the DFNN model

In [None]:
# Save the model to disk whenever the monitored quantity improves
# (ModelCheckpoint monitors 'val_loss' unless told otherwise).
checkpointer = ModelCheckpoint(filepath="model_NFFN.h5",
                               verbose=0,
                               save_best_only=True)

# ReduceLROnPlateau only lowers the learning rate while it is above `min_lr`.
# The optimizer is created from the string 'RMSprop', i.e. with Keras' default
# learning rate of 0.001, so a floor of 0.05 would keep both callbacks from
# ever acting. The floor therefore has to sit below the starting rate.
# NOTE(review): the accuracy metric is logged as 'acc'/'val_acc' by Keras < 2.3
# and as 'accuracy'/'val_accuracy' afterwards — confirm against the installed
# version, otherwise these monitors are skipped with a warning.
reduce_lr = ReduceLROnPlateau(monitor='acc', factor=0.2, patience=1, min_lr=1e-5)
reduce_val_lr = ReduceLROnPlateau(monitor='val_acc', factor=0.4, patience=1, min_lr=1e-5)

# Stop once the training loss has not improved for 3 consecutive epochs.
es = EarlyStopping(monitor='loss', patience=3, verbose=1, mode='auto')

callbacks_list = [es, reduce_lr, reduce_val_lr, checkpointer]

# Fit the model
# Validation data is a 25% slice of x_train held out by Keras
# (validation_split); x_val / y_val are not passed to fit here.
history = model.fit(x_train, y_train, epochs=10, batch_size=512,
                    validation_split=0.25, verbose=2,
                    callbacks=callbacks_list)

#### Plotting history_loss

In [None]:
# Hand the training history returned by model.fit to the project's
# loss-plotting helper (imported from utils_DFNN).
drow_history_loss(history)

# Predicting data

Loading pre-trained model (optional)

In [None]:
# Replace the in-memory `model` with a model loaded from disk.
# NOTE(review): this file name differs from the checkpoint path used during
# training ("model_NFFN.h5"); load that file instead to evaluate the model
# trained in this notebook — confirm which one is intended.
model = load_model('model_NFFN-RMSprop.h5')

### Performance on training set

In [None]:
# Raw model outputs for the training set.
predictions = model.predict(x_train)
# Threshold the outputs at 0.5 to obtain boolean class predictions.
y_pred_train = (predictions > 0.5)
# Report statistics of the thresholded predictions against the true labels.
print_stats(y_train, y_pred_train)

In [None]:
# NOTE(review): the thresholded labels (y_pred_train) are passed here, not the
# continuous model outputs in `predictions`. If drow_roc_curve wraps a
# score-based ROC routine, this yields a single operating point — confirm.
drow_roc_curve(y_train, y_pred_train)

### Performance on validation set

In [None]:
# Raw model outputs for the held-out validation split.
predictions = model.predict(x_val)
# Threshold the outputs at 0.5 to obtain boolean class predictions.
y_pred_val = (predictions > 0.5)
# Report statistics of the thresholded predictions against the true labels.
print_stats(y_val, y_pred_val)

In [None]:
# NOTE(review): the thresholded labels (y_pred_val) are passed here, not the
# continuous model outputs in `predictions`. If drow_roc_curve wraps a
# score-based ROC routine, this yields a single operating point — confirm.
drow_roc_curve(y_val, y_pred_val)

### Performance on independent test set

Data import and descriptors calculation

In [None]:
# Calculating molecular descriptors
# Calculating molecular descriptors
calc = MoleculeDescriptors.MolecularDescriptorCalculator(
    [entry[0] for entry in Descriptors._descList]
)

# Import dataset
vali_data = PredictorData(
    path='./data/Validation_set.csv',
    get_features=get_fp,
)

# define x_test and y_test
x_test = vali_data.x
# Relabel the -1 class as 0 in place before exposing the labels as y_test.
inactive_mask = vali_data.y == -1
vali_data.y[inactive_mask] = 0
y_test = vali_data.y

DFNN performance on the independent test set

In [None]:
# Raw model outputs for the independent test set.
predictions = model.predict(x_test)
# Threshold the outputs at 0.5 to obtain boolean class predictions.
y_pred_test = (predictions > 0.5)
# Report statistics of the thresholded predictions against the true labels.
print_stats(y_test, y_pred_test)

In [None]:
# NOTE(review): the thresholded labels (y_pred_test) are passed here, not the
# continuous model outputs in `predictions`. If drow_roc_curve wraps a
# score-based ROC routine, this yields a single operating point — confirm.
drow_roc_curve(y_test, y_pred_test)