In [1]:
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections

from tqdm import tqdm
from functions import ANALYSIS
from architecture import ARCHITECTURE
import architecture

from collections import Counter
from IPython.display import Image

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
#from tensorflow.compat.v1.keras.layers import CuDNLSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import tensorflow.keras.backend as K
import keras

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
import tensorflow as tf

# TF1-compat session configuration: allow_growth asks TensorFlow to allocate
# GPU memory on demand rather than reserving it all when the session starts.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# Experiment tag; reused below as the prefix of the checkpoint file names and
# as the base name of the pickled results file.
EXP = 'CD101T10_RIVAL_CTD'

# Project helper objects. `obj` supplies create_dict / integer_encoding /
# roc / aupr to the cells below; `arch_obj` is not referenced in the cells
# visible here.
obj = ANALYSIS(EXP)
arch_obj = ARCHITECTURE()

In [2]:
# # Extract CTD from protein descriptors
# def getCTD(C,T,D):
#     a1, a2, a3 = [], [], []
#     for target in tqdm(DTI_index['target']):
#         c = C[C['CHEMBL_ID']==target]
#         t = T[T['CHEMBL_ID']==target]
#         d = D[D['CHEMBL_ID']==target]
#         a1.append(c.values[0])
#         a2.append(t.values[0])
#         a3.append(d.values[0])
#     # Dump to dfs
#     df_C = pd.DataFrame(a1, columns=C.columns)
#     df_C = df_C[df_C.columns.difference(['Unnamed: 0','#','uniprot_ID','CHEMBL_ID'])]
#     df_T = pd.DataFrame(a2, columns=T.columns)
#     df_T = df_T[df_T.columns.difference(['Unnamed: 0','#','uniprot_ID','CHEMBL_ID'])]
#     df_D = pd.DataFrame(a3, columns=D.columns)
#     df_D = df_D[df_D.columns.difference(['Unnamed: 0','#','uniprot_ID','CHEMBL_ID'])]

#     CTD = pd.concat([df_C,df_T,df_D], axis=1)
#     return CTD

# C = pd.read_csv('data/fset_protein/CTDC.csv')
# D = pd.read_csv('data/fset_protein/CTDD.csv')
# T = pd.read_csv('data/fset_protein/CTDT.csv')
# CTD = getCTD(C,T,D)
# CTD.to_csv('data/CTD.csv')

In [3]:
# Sequence tables passed to obj.integer_encoding in the encoding cell.
P = pd.read_csv('data/ProtSeqForModel.csv')
D = pd.read_csv('data/DrugSeqForModel.csv')

# Drug descriptor array stored as .npy.
des_drug = np.load('data/DrugDesForModel.npy')

# CTD feature table without the 'Unnamed: 0' column written by to_csv.
# Index.difference also returns the remaining column labels in sorted order.
CTD = pd.read_csv('data/CTD.csv').pipe(
    lambda frame: frame[frame.columns.difference(['Unnamed: 0'])]
)

# Keep only the columns used downstream, in this explicit order.
drug_smiles = pd.read_csv('data/drug_smiles.csv').loc[
    :, ['drug', 'smile', 'seq_char_count']
]
DTI_index = pd.read_csv('data/DTI_index.csv').loc[
    :, ['target', 'drug', 'IC50', 'unit', 'activity', 'target_uniprot']
]

In [4]:
# Encode amino acids and SMILES characters as integers.
codes_target = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
                'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_dict_target = obj.create_dict(codes_target)

# Alphabet of every character that occurs in any SMILES string.
# It is sorted on purpose: iterating a set of strings has no stable order
# across interpreter runs (str hashing is randomised per process), so without
# sorting the list handed to create_dict -- and hence the integer encoding --
# could differ after a kernel restart and no longer match saved checkpoints.
codes_drug = sorted(set(''.join(drug_smiles['smile'].values)))
char_dict_drug = obj.create_dict(codes_drug)

encode_target = obj.integer_encoding(P, char_dict_target)
encode_drug = obj.integer_encoding(D, char_dict_drug)

# Pad / truncate every sequence at its end to a fixed length.
max_length_target = 1000
max_length_drug = 150
pad_target = pad_sequences(encode_target, maxlen=max_length_target, padding='post', truncating='post')
pad_drug = pad_sequences(encode_drug, maxlen=max_length_drug, padding='post', truncating='post')

# Label / integer encoding of the output variable (y) ...
le = LabelEncoder()
y = le.fit_transform(DTI_index['activity'].tolist())
# ... followed by one-hot encoding, to match the softmax output of the model.
y = to_categorical(y)

In [5]:
def getModel(target_vocab_size=21, drug_vocab_size=44, ctd_dim=273,
             embedding_dim=128, n_classes=3, dropout_rate=0.4,
             target_length=None, drug_length=None):
    """Build and compile the three-branch classifier.

    Branches:
      * target sequence -> Embedding -> Conv1D(32, 3) -> MaxPooling1D(2) -> Flatten
      * drug sequence   -> Embedding -> Conv1D(32, 3) -> MaxPooling1D(2) -> Flatten
      * CTD features    -> Dense(512)
    The branches are concatenated and passed through two Dense(512) layers,
    dropout, and a softmax output layer.

    Called with no arguments it builds the same network as before; the former
    hard-coded sizes are now keyword parameters.

    Args:
        target_vocab_size: Input dimension of the target embedding. Must be
            larger than the biggest integer code in the padded target
            sequences (presumably 20 amino acids plus padding value 0 --
            confirm against obj.create_dict).
        drug_vocab_size: Input dimension of the drug embedding. Must be larger
            than the biggest integer code in the padded drug sequences
            (TODO confirm 44 covers the SMILES alphabet found in the data).
        ctd_dim: Number of CTD feature columns fed to the third input.
        embedding_dim: Output dimension of both embeddings.
        n_classes: Number of units in the softmax output layer.
        dropout_rate: Dropout rate applied before the output layer.
        target_length: Length of the padded target sequences; defaults to the
            notebook-level ``max_length_target``.
        drug_length: Length of the padded drug sequences; defaults to the
            notebook-level ``max_length_drug``.

    Returns:
        A compiled Keras ``Model`` with inputs ``[target, drug, CTD]``, using
        Adam, categorical cross-entropy and the accuracy metric.
    """
    # Resolve sequence lengths at call time so the Input shape and the
    # Embedding input_length always agree with the padding configuration.
    if target_length is None:
        target_length = max_length_target
    if drug_length is None:
        drug_length = max_length_drug

    # Target (protein) sequence branch
    input_target = Input(shape=(target_length,))
    emb_target = Embedding(target_vocab_size, embedding_dim, input_length=target_length)(input_target)
    conv_target_1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(emb_target)
    pool_target_1 = MaxPooling1D(pool_size=2)(conv_target_1)
    flatten_target = Flatten()(pool_target_1)

    # Drug (SMILES) sequence branch
    input_drug = Input(shape=(drug_length,))
    emb_drug = Embedding(drug_vocab_size, embedding_dim, input_length=drug_length)(input_drug)
    conv_drug_1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(emb_drug)
    pool_drug_1 = MaxPooling1D(pool_size=2)(conv_drug_1)
    flatten_drug = Flatten()(pool_drug_1)

    # CTD descriptor branch
    input_CTD = Input(shape=(ctd_dim,))
    dense_1_CTD = Dense(512, activation='relu', kernel_initializer='glorot_normal')(input_CTD)

    concat = Concatenate()([flatten_target, flatten_drug, dense_1_CTD])

    # Fully connected head
    dense_1 = Dense(512, activation='relu', kernel_initializer='glorot_normal')(concat)
    dense_2 = Dense(512, activation='relu', kernel_initializer='glorot_normal')(dense_1)
    dense_2_dropout = Dropout(dropout_rate)(dense_2)

    x_output = Dense(n_classes, activation='softmax')(dense_2_dropout)

    # Build and compile model
    model = Model(inputs=[input_target, input_drug, input_CTD], outputs=x_output)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [8]:
from sklearn.model_selection import train_test_split

# Number of independent train/test repetitions.
# NOTE: each repetition is a fresh random 80/20 hold-out split seeded with the
# repetition index (repeated hold-out), not a partition into k disjoint folds.
CV = 5
result_CV = {fold: None for fold in range(CV)}

for i in range(CV):
    # Split all inputs and labels consistently into train/test.
    (train_target, test_target,
     train_drug, test_drug,
     CTD_train, CTD_test,
     y_train, y_test) = train_test_split(
        pad_target, pad_drug, CTD, y, test_size=0.2, random_state=i)

    # Fresh model, early stopping and best-only checkpoint for this split.
    model = getModel()
    es = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
    filepath = 'checkpoints/' + EXP + '_cp_' + str(i) + '.hdf5'
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    model_checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    # validation_split takes the monitoring set out of the training data,
    # so the test split is never seen during training.
    history = model.fit(
        [train_target, train_drug, CTD_train], y_train,
        epochs=500, batch_size=256,
        validation_split=0.2,
        callbacks=[es, model_checkpoint]
        )

    # Reload the checkpoint with the lowest val_loss rather than using the
    # weights from the last epoch.
    saved_model = tf.keras.models.load_model(filepath)

    # Evaluate the model on the test data using `evaluate`.
    # The model is compiled with metrics=['accuracy'], so res == [loss, accuracy].
    print("\nEvaluate on test data")
    res = saved_model.evaluate([test_target, test_drug, CTD_test], y_test, batch_size=256)

    # Calculate ROC and PR characteristics with the project helpers.
    roc_auc, fpr, tpr = obj.roc(saved_model, y_test, a = test_target, b = test_drug, c = CTD_test)
    precision, recall, average_precision = obj.aupr(saved_model, y_test, a = test_target, b = test_drug, c = CTD_test)
    prediction = saved_model.predict([test_target, test_drug, CTD_test])
    f1 = f1_score(np.argmax(y_test, axis=1), np.argmax(prediction, axis=1), average='macro')

    # NOTE: 'val_loss' / 'val_accuracy' hold the metrics measured on the TEST
    # split above; the key names are kept unchanged so existing readers of the
    # pickled results keep working.
    result_CV[i] = {
        'val_loss': res[0],
        'val_accuracy': res[1],
        'roc_auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'average_precision': average_precision,
    }

    # Dropping the Python references alone does not release Keras' global
    # state; clear it so memory does not grow with every repetition.
    del model
    del saved_model
    K.clear_session()
    gc.collect()

Epoch 1/500
Epoch 00001: val_loss improved from inf to 0.87332, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 2/500
Epoch 00002: val_loss improved from 0.87332 to 0.77278, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 3/500
Epoch 00003: val_loss improved from 0.77278 to 0.70209, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 4/500
Epoch 00004: val_loss improved from 0.70209 to 0.64429, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 5/500
Epoch 00005: val_loss improved from 0.64429 to 0.62707, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 6/500
Epoch 00006: val_loss improved from 0.62707 to 0.62039, saving model to checkpoints\CD101T10_RIVAL_CTD_cp_0.hdf5
Epoch 7/500
Epoch 00007: val_loss did not improve from 0.62039
Epoch 8/500
Epoch 00008: val_loss did not improve from 0.62039
Epoch 9/500
Epoch 00009: val_loss did not improve from 0.62039
Epoch 10/500
Epoch 00010: val_loss did not improve from 0.62039
E

In [9]:
import pickle
def save_obj(obj, name):
    """Serialize ``obj`` with pickle into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable Python object.
        name: Path of the output file without the ``.pkl`` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Use the newest protocol supported by the running interpreter.
        pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_obj(name):
    """Read back the object pickled in the file ``name + '.pkl'``.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        name: Path of the input file without the ``.pkl`` extension.

    Returns:
        The unpickled Python object.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        restored = pickle.Unpickler(handle).load()
    return restored

In [10]:
# Persist the per-split results dict; save_obj appends '.pkl' to EXP to form
# the output file name.
save_obj(result_CV, EXP)