In [1]:
###Loading packages
import numpy as np
import pandas as pd
import math
import itertools
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras import optimizers
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.utils import class_weight

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score,cohen_kappa_score

from keras import backend as K
import tensorflow as tf

# Seed NumPy's global random generator with a fixed value (1).
from numpy.random import seed
seed(1)
# Seed TensorFlow with a fixed value (2).
# NOTE(review): the top-level `tensorflow.set_random_seed` import is assumed to match the
# installed TensorFlow release — confirm against the environment used for this notebook.
from tensorflow import set_random_seed
set_random_seed(2)


Using TensorFlow backend.


In [2]:
def create_model(optimizer, activation):
    """Build and compile the fully connected binary classifier.

    The network takes 978 input features and stacks Dense layers of width
    512, 256, 128, 64, 32, 16 and 8, followed by a single sigmoid unit.
    The first three Dense layers are each followed by batch normalisation,
    and the first two additionally by 20% dropout.

    Parameters
    ----------
    optimizer : passed straight to ``model.compile``.
    activation : activation used by every hidden Dense layer.

    Returns
    -------
    The compiled ``Sequential`` model (binary cross-entropy loss, tracking
    ``accuracy`` and ``monitor_f``).
    """
    net = Sequential()

    ### first block: declares the input shape
    net.add(Dense(512, activation=activation, input_shape=(978,)))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    ### second block
    net.add(Dense(256, activation=activation))
    net.add(BatchNormalization())
    net.add(Dropout(0.2))

    ### third block: normalised, but without dropout
    net.add(Dense(128, activation=activation))
    net.add(BatchNormalization())

    ### plain Dense layers narrowing down to the output
    for width in (64, 32, 16, 8):
        net.add(Dense(width, activation=activation))
    net.add(Dense(1, activation='sigmoid'))

    ###compile model
    net.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=['accuracy', monitor_f],
    )
    return net


###fit model
def fit_model(X_train, y_train, X_test, y_test,n,model_path,model):
    """Train ``model`` with balanced class weights and checkpoint the best epoch.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : data passed to Keras as ``validation_data``.
    n : number of training epochs.
    model_path : file the checkpoint callback writes to.
    model : compiled Keras model; it must report the ``monitor_f`` metric,
        because the checkpoint monitors ``val_monitor_f``.

    Returns
    -------
    None. The model is trained in place and the best checkpoint is saved to
    ``model_path``.
    """
    ###balanced class weight
    classes = np.unique(y_train)
    # Keyword arguments work on both old and new scikit-learn releases;
    # recent releases no longer accept these arguments positionally.
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    # Keras expects ``class_weight`` as a {class label: weight} mapping. The bare
    # array returned by scikit-learn is not a supported form, so the balancing
    # would not be applied as intended.
    class_weights = {cls: weight for cls, weight in zip(classes.tolist(), weights.tolist())}
    ###define checkpoint for the best model
    # Only overwrite the file when the validation monitor value improves (higher is better).
    checkpoint = ModelCheckpoint(model_path, verbose=1, monitor='val_monitor_f',save_best_only=True, mode='max')
    ###fit model
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=n, class_weight=class_weights,callbacks=[checkpoint])


###predict the independent validation set
def predict_validation(X, y, model_path, model, optimizer):
    """Load saved weights into ``model`` and score it on ``(X, y)``.

    Parameters
    ----------
    X : feature matrix to predict on.
    y : true labels for ``X``.
    model_path : weight file loaded into ``model``.
    model : Keras model whose architecture matches the saved weights.
    optimizer : optimizer used when re-compiling the model after loading.

    Returns
    -------
    The list produced by ``measurements(y, predicted_classes, predicted_probabilities)``.
    """
    model.load_weights(model_path)
    model.compile(loss='binary_crossentropy',optimizer=optimizer)
    # Run the network once and derive the class labels from the probabilities,
    # instead of a second forward pass through ``predict_classes`` (which newer
    # Keras/TensorFlow releases no longer provide).
    y_pred = model.predict(X)
    if y_pred.shape[-1] > 1:
        # multi-unit output: pick the most probable class
        y_pred_class = y_pred.argmax(axis=-1)
    else:
        # single sigmoid unit: threshold at 0.5
        y_pred_class = (y_pred > 0.5).astype('int32')
    result = measurements(y, y_pred_class, y_pred)
    return result


In [3]:
def measurements(y_test, y_pred, y_pred_prob):
    """Compute the classification metrics reported for a model.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted class labels.
    y_pred_prob : predicted scores, used only for the ROC AUC.

    Returns
    -------
    list
        ``[auc, mcc, f1, kappa, accuracy, balanced_accuracy, sensitivity, specificity]``
    """
    accuracy = accuracy_score(y_test, y_pred)
    recall = metrics.recall_score(y_test, y_pred)

    # Confusion-matrix cells; only the negatives are needed for the specificity
    tn, fp, _fn, _tp = confusion_matrix(y_test, y_pred).ravel()
    true_negative_rate = tn / (tn + fp)

    matthews = metrics.matthews_corrcoef(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    f_score = f1_score(y_test, y_pred)
    kappa_score = cohen_kappa_score(y_test, y_pred)

    # Balanced accuracy is the mean of sensitivity and specificity
    balanced_accuracy = (recall + true_negative_rate) / 2

    return [
        roc_auc,
        matthews,
        f_score,
        kappa_score,
        accuracy,
        balanced_accuracy,
        recall,
        true_negative_rate,
    ]

In [4]:
def evaluation(y_true, y_pred):
    """Count confusion-matrix entries with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so each element becomes a
    0/1 indicator before the counts are summed.

    Returns
    -------
    tuple
        ``(TP, TN, FP, FN)`` as backend tensors.
    """
    predicted_positive = K.round(K.clip(y_pred, 0, 1))
    predicted_negative = 1 - predicted_positive

    actual_positive = K.round(K.clip(y_true, 0, 1))
    actual_negative = 1 - actual_positive

    # Each count is the sum of an element-wise product of two indicators
    true_pos = K.sum(actual_positive * predicted_positive)
    true_neg = K.sum(actual_negative * predicted_negative)
    false_pos = K.sum(actual_negative * predicted_positive)
    false_neg = K.sum(actual_positive * predicted_negative)

    return true_pos, true_neg, false_pos, false_neg

In [5]:
###monitor function for finding the best models
def monitor_f(y_true, y_pred):
    """Keras metric used to pick the best checkpoint: ``specificity * (MCC + 0.05)``.

    Parameters
    ----------
    y_true : tensor of true labels.
    y_pred : tensor of predicted scores.

    Returns
    -------
    Scalar backend tensor with the monitor value.
    """
    TP, TN, FP, FN = evaluation(y_true, y_pred)
    # Guard the denominator the same way the MCC line does: a batch with no
    # negative samples has TN + FP == 0, and the resulting NaN would propagate
    # into the epoch-level value of this metric.
    specificity = TN/(TN+FP + K.epsilon())
    mcc = ((TP*TN)-(FP*FN))/(K.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)) + K.epsilon())
    return specificity* (mcc + 0.05)

In [6]:
def print_result(model_name, purpose, result, nth):
    """Print a bold header followed by the eight metrics in ``result``.

    Parameters
    ----------
    model_name, purpose, nth : values interpolated into the header line.
    result : indexable of eight numbers, in the order
        AUC, MCC, F1, Kappa, Accuracy, BA, Sensitivity, Specificity.
    """
    header = '\033[1mOptimized {} model {} performance for {}: \033[0m'
    print(header.format(model_name, purpose, nth))

    # One template per metric, in the same order as the entries of `result`
    row_templates = (
        "AUC:         {0:.3f}",
        "MCC:         {0:.3f}",
        "F1:          {0:.3f}",
        "Kappa:       {0:.3f}",
        "Accuracy:    {0:.3f}",
        "BA:          {0:.3f}",
        "Sensitivity: {0:.3f}",
        "Specificity: {0:.3f}",
    )
    for position, template in enumerate(row_templates):
        print(template.format(result[position]))

### Import dataset

In [7]:
### Import the pickle dataset: drug_split_index.pickle
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
dataset_path = r'''C:\Users\Ting.Li\OneDrive - FDA\Documents\2019\projects\L1000\revision\org_data\drug_split_index.pickle'''
df = pd.read_pickle(dataset_path)

In [8]:
# Preview the first five rows of the loaded table
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,sig_id,CompoundName,DILIst.1,cell_id,pert_idose,pert_time_y,model_0,model_1,model_2,...,9738,6793,7358,58472,50865,23200,51293,10962,10153,874
0,0,DOS039_A549_24H:BRD-K81418486:0.1,VORINOSTAT,0,A549,100 nM,24,Test,Test,Test,...,0.62255,1.0562,-0.14135,0.7882,-0.11335,2.35545,0.2334,-0.594,-1.06115,-1.31655
1,1,CPC019_HT29_6H:BRD-K81418486:10,VORINOSTAT,0,HT29,10 ÂµM,6,Test,Test,Test,...,4.4186,-1.834802,-0.223164,-0.255458,5.733514,0.381488,-0.328168,1.290495,0.04982,0.984403
2,2,CPC006_NCIH1694_6H:BRD-K81418486:10,VORINOSTAT,0,NCIH1694,10 ÂµM,6,Test,Test,Test,...,-0.1636,0.3601,1.28305,-0.25085,0.5431,1.08195,-1.38025,-3.2754,-0.53095,-0.97735
3,3,CPC002_HA1E_24H:BRD-A76528577-065-01-2:10,VINCRISTINE,1,HA1E,10 ÂµM,24,Training,Training,Training,...,-4.1861,3.1962,-5.17785,0.70785,-0.4577,1.12585,-5.50515,-1.54205,4.93105,-2.72
4,4,CPC011_VCAP_6H:BRD-K55696337-003-16-0:10,TOPOTECAN,0,VCAP,10 ÂµM,6,Training,Training,Training,...,-0.4546,5.7552,-0.1787,0.8583,0.7385,0.7271,-1.6301,0.6459,-1.4513,-2.5655


### Evaluate the 50 optimized models

In [9]:
# Names of the 50 split columns / weight files: 'model_0' ... 'model_49'
models = ['model_{}'.format(index) for index in range(50)]

In [10]:
### Create the model once; every iteration only swaps in a different set of weights
model = create_model('Adam', 'elu')
# base_path is the path for the 50 DNN models with drug based splitting methods.
base_path = r'C:\Users\Ting.Li\OneDrive - FDA\Documents\2019\projects\L1000\revision\model_weights_50'
for var in models:
    ### Rows flagged as 'Test' in this split's column
    test = df[df[var] == 'Test']
    ### Weight file saved for this split
    model_path = base_path + '\\' + var + '_weights.h5'
    ### The last 978 columns are the model inputs; 'DILIst.1' holds the labels
    test_features = test.iloc[:, -978:].values
    test_labels = test.loc[:, 'DILIst.1'].values
    test_scores = predict_validation(test_features, test_labels, model_path, model, 'Adam')
    print_result('DNN', 'testing', test_scores, var)

[1mOptimized DNN model testing performance for model_0: [0m
AUC:         0.834
MCC:         0.504
F1:          0.654
Kappa:       0.423
Accuracy:    0.690
BA:          0.770
Sensitivity: 0.971
Specificity: 0.568
[1mOptimized DNN model testing performance for model_1: [0m
AUC:         0.829
MCC:         0.522
F1:          0.741
Kappa:       0.497
Accuracy:    0.742
BA:          0.761
Sensitivity: 0.879
Specificity: 0.644
[1mOptimized DNN model testing performance for model_2: [0m
AUC:         0.824
MCC:         0.501
F1:          0.660
Kappa:       0.466
Accuracy:    0.738
BA:          0.774
Sensitivity: 0.862
Specificity: 0.687
[1mOptimized DNN model testing performance for model_3: [0m
AUC:         0.816
MCC:         0.477
F1:          0.689
Kappa:       0.465
Accuracy:    0.735
BA:          0.747
Sensitivity: 0.791
Specificity: 0.702
[1mOptimized DNN model testing performance for model_4: [0m
AUC:         0.810
MCC:         0.492
F1:          0.683
Kappa:       0.452
Accura

[1mOptimized DNN model testing performance for model_39: [0m
AUC:         0.742
MCC:         0.349
F1:          0.592
Kappa:       0.343
Accuracy:    0.686
BA:          0.683
Sensitivity: 0.672
Specificity: 0.694
[1mOptimized DNN model testing performance for model_40: [0m
AUC:         0.742
MCC:         0.385
F1:          0.616
Kappa:       0.360
Accuracy:    0.678
BA:          0.705
Sensitivity: 0.782
Specificity: 0.627
[1mOptimized DNN model testing performance for model_41: [0m
AUC:         0.740
MCC:         0.408
F1:          0.665
Kappa:       0.386
Accuracy:    0.686
BA:          0.708
Sensitivity: 0.807
Specificity: 0.609
[1mOptimized DNN model testing performance for model_42: [0m
AUC:         0.738
MCC:         0.360
F1:          0.629
Kappa:       0.338
Accuracy:    0.661
BA:          0.686
Sensitivity: 0.778
Specificity: 0.593
[1mOptimized DNN model testing performance for model_43: [0m
AUC:         0.736
MCC:         0.383
F1:          0.616
Kappa:       0.346
A