In [1]:
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from tqdm import tqdm
from functions import MODEL

from collections import Counter
from prettytable import PrettyTable
from IPython.display import Image

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation, Concatenate, Layer
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, GlobalMaxPooling1D, LSTM
import keras.backend as K

obj = MODEL()

In [2]:
DTI_index=pd.read_csv('data/DTI_index.csv')[['target','drug','IC50','unit','activity','target_uniprot']]
target_seq=pd.read_csv('data/target_seq.csv')[['target_uniprot','target_chembl','seq']]
drug_smiles=pd.read_csv('data/drug_smiles.csv')[['drug','smile','seq_char_count']]

In [3]:
DTI_index.shape

(61624, 6)

In [3]:
# Split indics into train/test
train_indices, test_indices = obj.split(DTI_index, 8)

In [4]:
# Train and test data
train_target = DTI_index.loc[train_indices][['target_uniprot']]
train_drug = DTI_index.loc[train_indices][['drug']]
test_target = DTI_index.loc[test_indices][['target_uniprot']]
test_drug = DTI_index.loc[test_indices][['drug']]

# Labels
train_y = DTI_index.loc[train_indices][['activity']]
test_y = DTI_index.loc[test_indices][['activity']]

print(train_target.shape, train_drug.shape, test_target.shape, test_drug.shape, train_y.shape, test_y.shape)

(61427, 1) (61427, 1) (15357, 1) (15357, 1) (61427, 1) (15357, 1)


In [7]:
def addSeq():
    # Add sequence to corresponding target IDs
    seq_target = []
    for target in tqdm(train_target['target_uniprot']):
        try:
            seq_target.append(target_seq[target_seq['target_uniprot']==target]['seq'].values[0])
        except:
            print(target)
    train_target['seq'] = seq_target

    # Add smile strings to corresponding drug IDs
    seq_drug = []
    for drug in tqdm(train_drug['drug']):
        try:
            seq_drug.append(drug_smiles[drug_smiles['drug']==drug]['smile'].values[0])
        except:
            print(target)
    train_drug['seq'] = seq_drug

100%|██████████| 61427/61427 [00:35<00:00, 1729.82it/s]


In [8]:
# plt.subplot(1, 1, 1)
# obj.plot_seq_count(drug_smiles, 'Train')

# code_freq = obj.get_code_freq(target_seq['seq'], 'Train')
# plt.subplot(1, 1, 1)
# obj.plot_code_freq(code_freq, 'Train')

In [9]:
# Encode amino acides and smile characters
codes_target = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
char_dict_target = obj.create_dict(codes_target)

codes_drug = [char for char in ''.join(set(''.join(drug_smiles['smile'].values)))]
char_dict_drug = obj.create_dict(codes_drug)

print(char_dict_target)
print("Target Dict Length:", len(char_dict_target))

print(char_dict_drug)
print("Drug dict Length:", len(char_dict_drug))

{'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}
Target Dict Length: 20
{'[': 1, 'l': 2, 'I': 3, 'K': 4, 'O': 5, 'B': 6, '6': 7, 'F': 8, ')': 9, '7': 10, '3': 11, 'a': 12, 'P': 13, 'c': 14, ']': 15, '/': 16, '(': 17, 'S': 18, '1': 19, 'e': 20, 'i': 21, 'Z': 22, '8': 23, 'H': 24, '+': 25, 'r': 26, '4': 27, 'L': 28, '5': 29, 'N': 30, 'A': 31, '.': 32, '9': 33, 'C': 34, '@': 35, '2': 36, 'o': 37, '#': 38, 'n': 39, '\\': 40, '-': 41, 's': 42, '=': 43}
Drug dict Length: 43


In [10]:
train_encode_target = obj.integer_encoding(train_target, char_dict_target) 
train_encode_drug = obj.integer_encoding(train_drug, char_dict_drug) 

In [11]:
# padding sequences
max_length = 1000
train_pad_target = pad_sequences(train_encode_target, maxlen=max_length, padding='post', truncating='post')
train_pad_drug = pad_sequences(train_encode_drug, maxlen=max_length, padding='post', truncating='post')
train_pad_target.shape, train_pad_drug.shape

((61427, 1000), (61427, 1000))

In [12]:
# One hot encoding of sequences
train_ohe_target = to_categorical(train_pad_target)
train_ohe_drug = to_categorical(train_pad_drug)
train_ohe_target.shape, train_ohe_drug.shape

((61427, 1000, 21), (61427, 1000, 44))

In [13]:
# label/integer encoding output variable: (y)
le = LabelEncoder()
y_train_le = le.fit_transform(train_y['activity'].tolist())
y_train_le.shape

(61427,)

In [14]:
# One hot encoding of outputs
y_train = to_categorical(y_train_le)
y_train.shape

(61427, 3)

In [15]:
# Attention class
class attention(Layer):
    def __init__(self,**kwargs):
        super(attention,self).__init__(**kwargs)

    def build(self,input_shape):
        self.W=self.add_weight(name="att_weight",shape=(input_shape[-1],1),initializer="normal")
        self.b=self.add_weight(name="att_bias",shape=(input_shape[1],1),initializer="zeros")        
        super(attention, self).build(input_shape)

    def call(self,x):
        et=K.squeeze(K.tanh(K.dot(x,self.W)+self.b),axis=-1)
        at=K.softmax(et)
        at=K.expand_dims(at,axis=-1)
        output=x*at
        return K.sum(output,axis=1)

    def compute_output_shape(self,input_shape):
        return (input_shape[0],input_shape[-1])

    def get_config(self):
        return super(attention,self).get_config()

In [32]:
# Model Architecture
input_target = Input(shape=(1000,))
emb_target = Embedding(21, 128, input_length=max_length)(input_target) 
conv_target_1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(emb_target)
pool_target_1 = MaxPooling1D(pool_size=2)(conv_target_1)
att_in_target = Bidirectional(LSTM(32, kernel_regularizer=l2(0.01), return_sequences=True, recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))(pool_target_1)
#att_out_target = attention()(att_in_target)
flat_1_target = Flatten()(att_in_target)

# softmax classifier
#x_output_target = Dense(3, activation='softmax')(att_in_target)

input_drug = Input(shape=(1000,))
emb_drug = Embedding(44, 128, input_length=max_length)(input_drug) 
conv_drug_1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(emb_drug)
pool_drug_1 = MaxPooling1D(pool_size=2)(conv_drug_1)
att_in_drug = Bidirectional(LSTM(32, kernel_regularizer=l2(0.01), return_sequences=True, recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01)))(pool_drug_1)
#att_out_drug = attention()(att_in_drug)
flat_1_drug = Flatten()(att_in_drug)

concat = Concatenate()([flat_1_target,flat_1_drug])

dense_1 = Dense(1024, activation = 'relu',kernel_initializer='glorot_normal')(concat)
dense_2 = Dense(512, activation = 'relu',kernel_initializer='glorot_normal')(dense_1)

# softmax classifier
x_output = Dense(3, activation='softmax')(dense_2)

model1 = Model(inputs=[input_target, input_drug], outputs=x_output)
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model1.summary()

Model: "functional_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 1000, 128)    2688        input_11[0][0]                   
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 1000, 128)    5632        input_12[0][0]                   
______________________________________________________________________________________

In [55]:
from keras.utils import plot_model
plot_model(model1, to_file='model.png')

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [54]:
# Early Stopping
es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
histories = []

In [34]:
histories.append(model1.fit(
    [train_pad_target, train_pad_drug], y_train,
    epochs=100, batch_size=128,
    validation_split=0.2,
    callbacks=[es]
    ))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

ResourceExhaustedError:  OOM when allocating tensor with shape[64000,1024] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[node Adam/Adam/update_18/ResourceApplyAdam (defined at <ipython-input-34-c11ed5ffe4d8>:5) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_62068]

Function call stack:
train_function


In [35]:
# Plot model history
obj.plot_history(histories[0])

IndexError: list index out of range

import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
from tqdm import tqdm
from functions import MODEL

from collections import Counter
from prettytable import PrettyTable
from IPython.display import Image

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.regularizers import l2
from keras.constraints import max_norm
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation, Concatenate, Layer
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization
from keras.layers import Embedding, Bidirectional, GlobalMaxPooling1D, LSTM
import keras.backend as K

obj = MODEL()
# Import and process level 2 data
drug_smiles = p