## AM 216 Final Project
Lihong Zhang, Litao Yan, Ruoxi Yang

We were inspired by the drug-protein affinity project in the mini projects, and do the final project in exploring the representations and models of drugs and proteins. 

In [19]:
import numpy as np
import rdkit
from rdkit.Chem import Draw
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import MolFromSmiles

import tensorflow as tf
from keras import Sequential, Model
from keras.layers import Dense, concatenate, Dropout
from tensorflow.keras.models import Model
from keras import Sequential, Model
from keras.layers import Conv1D, MaxPooling1D, Flatten, Conv2D, MaxPooling2D

import pandas as pd
import matplotlib.pyplot as plt
import json
import pickle
from collections import OrderedDict
import networkx as nx

from deepchem.metrics import to_one_hot
from deepchem.feat.mol_graphs import ConvMol

import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
import deepchem as dc

Prepare train data and test data

In [4]:
LOCAL_KIBA_PATH = '../data/mini_project_data/data/kiba/'
LOCAL_DAVIS_PATH = '../data/GraphDTA_davis/'
G_PATH = './drive/MyDrive/Colab Notebooks/Drug Binding'
max_seq_len = 1000
# for converting protein sequence to categorical format/numerical format
seq_voc = "ABCDEFGHIKLMNOPQRSTUVWXYZ"
seq_dict = {v:i for i,v in enumerate(seq_voc)}
seq_dict_len = len(seq_dict)


In [5]:
np.concatenate((np.array([[1, 2], [3, 4]]).reshape(-1, 2, 1), np.array([[5, 6], [7, 8]]).reshape(-1, 2, 1)), axis=2).shape

(2, 2, 2)

In [51]:
# !! 13 2x2 matrices, nodes + adj together

def adj2matr(raw_a):
    len_a = len(raw_a)
    matr = np.zeros((len_a, len_a))
    for i in range(len_a):
        for j in raw_a[i]:
            matr[i, j] = 1
    return matr

# convmol to matrices
def conv2matr(convmol):
    nr, nc = max([convmol[i].get_atom_features().shape[0] for i in range(len(convmol))]), convmol[0].get_atom_features().shape[1]
    amax = max([len(convmol[i].get_adjacency_list()) for i in range(len(convmol))])
    ar, ac = amax, amax
    nodes = []
    adj = []
    for i in range(len(convmol)):
        temp_n = np.zeros((nr, nc))
        n = convmol[i].get_atom_features()
        temp_n[:n.shape[0],:n.shape[1]] = n
        nodes.append(temp_n)
        nodes.append(temp_n.reshape((temp_n.shape[0], temp_n.shape[1], 1)))
        temp_a = np.zeros((ar, ac))
        a = adj2matr(convmol[i].get_adjacency_list()) 
        temp_a[:a.shape[0],:a.shape[1]] = a
        adj.append(temp_a)
        adj.append(temp_a.reshape((temp_a.shape[0], temp_a.shape[1], 1)))
    # return [np.concatenate(nodes, axis = 2), np.concatenate(adj, axis = 2)]
    return [nodes, adj]



# The codes below in this cell are from section 10 of AM216 and mini project
def seq_to_cat(prot):  # prot: protein
    x = np.zeros(max_seq_len)
    for i, ch in enumerate(prot[:max_seq_len]): 
        x[i] = seq_dict[ch]
    return x  

def read_data(data_path):
    drugs_ = json.load(open(data_path + "ligands_can.txt"), object_pairs_hook=OrderedDict)
    
    smiles = np.array([Chem.MolToSmiles(Chem.MolFromSmiles(d),isomericSmiles=True) for d in drugs_.values()])
    featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
    convmol = featurizer.featurize(smiles)

    # inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice, np.array(multiConvMol.membership)]
    # for i in range(1, len(multiConvMol.get_deg_adjacency_lists())):
      # inputs.append(multiConvMol.get_deg_adjacency_lists()[i])
      
    # drugs = conv2matr(convmol)

    proteins_ = json.load(open(data_path + "proteins.txt"), object_pairs_hook=OrderedDict)
    proteins = np.array(list(proteins_.values()))
    affinity = np.array(pickle.load(open(data_path + "Y","rb"), encoding='latin1'))
    train_fold = json.load(open(data_path + "folds/train_fold_setting1.txt"))
    train_fold = [ee for e in train_fold for ee in e ]    
    test_fold = json.load(open(data_path + "folds/test_fold_setting1.txt"))

    # Prepare train/test data with fold indices
    rows, cols = np.where(np.isnan(affinity)==False) 
    convmol_tr = convmol[rows[train_fold]]    # (98545,)
    smiles_tr = smiles[rows[train_fold]] 
    proteins_tr = np.array([seq_to_cat(p) for p in proteins[cols[train_fold]]])   # (98545, 1000)
    affinity_tr = affinity[rows[train_fold], cols[train_fold]]  # (98545,)

    convmol_ts = convmol[rows[test_fold]] # (19709,)
    smiles_ts = smiles[rows[test_fold]] # (19709,)
    proteins_ts = np.array([seq_to_cat(p) for p in proteins[cols[test_fold]]]) # (19709, 1000)
    affinity_ts = affinity[rows[test_fold], cols[test_fold]]    # (19709,)
    '''
    print('Example of drug:{}'.format(drugs_tr[0]))
    print('Example of protein:{} ...'.format(proteins_tr[0][:10]))
    print('Example of affinity score:{}'.format(affinity_tr[0]))
    '''
    return convmol_tr, smiles_tr, proteins_tr, affinity_tr, convmol_ts, smiles_ts, proteins_ts, affinity_ts

def smiles_graph(path):
    drugs_ = json.load(open(path + 'ligands_can.txt'), object_pairs_hook=OrderedDict)
    # print('\nOriginal molecule:')
    mols = MolFromSmiles(smiles[0])
    # Draw.MolToImage(mols)
    featurizer=dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
    graph_data = featurizer.featurize(smiles)
    return smiles, graph_data

In [35]:
nr, nc = max([convmol_tr_kiba[i].get_atom_features().shape[0] for i in range(len(convmol_tr_kiba))]), convmol_tr_kiba[0].get_atom_features().shape[1]

Convert SMILES to Graphs

In [9]:
convmol_tr_kiba, smiles_tr_kiba, proteins_tr_kiba, affinity_tr_kiba, convmol_ts_kiba, smiles_ts_kiba, proteins_ts_kiba, affinity_ts_kiba = read_data(LOCAL_KIBA_PATH)
convmol_tr_davis, smiles_tr_davis, proteins_tr_davis, affinity_tr_davis, convmol_ts_davis, smiles_ts_davis, proteins_ts_davis, affinity_ts_davis = read_data(LOCAL_DAVIS_PATH)

In [31]:
len(convmol_tr_kiba[2].get_adjacency_list())

19

In [50]:
# get the nodes and adjacency matrix of drugs
nodes_tr_kiba, adj_tr_kiba = conv2matr(convmol_tr_kiba)
nodes_ts_kiba, adj_ts_kiba = conv2matr(convmol_ts_kiba)

nodes_tr_davis, adj_tr_davis = conv2matr(convmol_tr_davis)
nodes_ts_davis, adj_ts_davis = conv2matr(convmol_ts_davis)

AxisError: axis 2 is out of bounds for array of dimension 2

In [47]:
np.savetxt("nodes_tr_kiba.txt", nodes_tr_kiba)
np.savetxt("adj_tr_kiba.txt", adj_tr_kiba)
np.savetxt("nodes_ts_kiba.txt", nodes_ts_kiba)
np.savetxt("adj_ts_kiba.txt", adj_ts_kiba)

np.savetxt("nodes_tr_davis.txt", nodes_tr_davis)
np.savetxt("adj_tr_davis.txt", adj_tr_davis)
np.savetxt("nodes_ts_davis.txt", nodes_ts_davis)
np.savetxt("adj_ts_davis.txt", adj_ts_davis)



ValueError: Expected 1D or 2D array, got 3D array instead

In [48]:
print(nodes_ts_kiba.shape, nodes_ts_kiba[0].shape, adj_ts_kiba.shape, nodes_ts_davis.shape, adj_ts_davis.shape)

(268, 75, 19709) (75, 19709) (268, 268, 19709) (46, 75, 5010) (46, 46, 5010)


Prepare data for CNN

In [12]:
# tr_size, drug_size = drugs_ecfp_tr.shape[0], drugs_ecfp_tr.shape[1]
num_ts_davis = proteins_ts_davis.shape[0]

# num_train, num_drugs = drugs_ecfp_tr.shape
num_tr_davis, num_prot_davis = proteins_tr_davis.shape
# drugs_tr_reshape = drugs_ecfp_tr.reshape((num_train, num_drugs, 1))
proteins_tr_davis_reshape = proteins_tr_davis.reshape((num_tr_davis, num_prot_davis, 1))

# Testing data
# drug_ts_reshape = drugs_ecfp_ts.reshape((drugs_ecfp_ts.shape[0], drugs_ecfp_ts.shape[1], 1))
# proteins_ts_reshape = proteins_ts.reshape((proteins_ts.shape[0], proteins_ts.shape[1], 1))

In [21]:
print(affinity_tr_davis.shape, proteins_tr_davis_reshape.shape)

(25046,) (25046, 1000, 1)


Prepare data for GNN

In [None]:
train_dataset = dc.data.NumpyDataset([convmol_tr_davis, proteins_tr_davis_reshape], affinity_tr_davis)

In [20]:
import warnings
warnings.filterwarnings("ignore")

# This finds the number of tasks (ie things we are trying to predict) and then runs
# a regression model for each of those tasks (in our case just 1)
# n_tasks = len(tasks)
gnn = dc.models.GraphConvModel(1, mode='regression')
# gnn = Dense(16, activation= 'linear')(gnn.)
def sub_cnn(input_dim):
    cnn = Sequential() # Create sequential model
    cnn.add(Conv1D(16, 3, activation='relu', input_shape=(input_dim, 1)))
    cnn.add(MaxPooling1D(3))
    cnn.add(Flatten())
    cnn.add(Dropout(0.1))
    cnn.add(Dense(16, activation = 'linear'))
    return cnn

prot_cnn = sub_cnn(num_prot_davis)
cnn_concat = concatenate([gnn.output, prot_cnn.output])

cnn_concat = Dense(1024, activation='relu')(cnn_concat)
cnn_concat = Dropout(0.1)(cnn_concat)
cnn_concat = Dense(16, activation='relu')(cnn_concat)

cnn_concat = Dense(1, activation='linear')(cnn_concat)

cnn = Model(inputs=[gnn.input, prot_cnn.input], outputs=cnn_concat)
# Show model summary
cnn.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
cnn.summary()
# model.fit(train_dataset, nb_epoch=50)  # Starting Keras 2.0, nb_epoch argument has been renamed to epochs everywhere.

AttributeError: 'GraphConvModel' object has no attribute 'output'

In [None]:
# CNN for protein
Feat_input = Input(shape=(num_prot_davis,1))
Feat_layer = Conv1D(16, 3, activation='relu', input_shape=(input_dim, 1))(Feat_input)
Feat_layer = MaxPooling1D(3)(Feat_layer)
Feat_layer = Flatten()(Feat_layer)
Feat_layer = Dropout(0.1)(Feat_layer)
Feat_layer = Dense(16, activation = 'linear')(Feat_layer)


X_in = Input(shape=(1375, 3))
A_in = Input(tensor=sp_matrix_to_sp_tensor(adj_mat))

# GNN for 
graph_conv = GraphConvSkip(64, activation='relu',kernel_regularizer=l2(l2_reg),name='graph_input')([X_in, A_in])
graph_conv = Dropout(0.5)(graph_conv)

graph_conv = ChebConv(32, activation='relu', kernel_regularizer=l2(l2_reg)([graph_conv,A_in])

graph_conv = Dropout(0.5)(graph_conv)

graph_conv = GraphConvSkip(64, activation='relu', kernel_regularizer=l2(l2_reg)([graph_conv,A_in])
graph_conv = Dropout(0.5)(graph_conv)
graph_conv = ChebConv(32, activation='relu', kernel_regularizer=l2(l2_reg))([graph_conv, A_in])

flatten = Flatten()(graph_conv)

concatenated = concatenate([flatten, Feat_layer])

fc = Dense(512, activation='relu')(concatenated)
fc = Dense(256, activation='relu')(FC)
output = Dense(n_out, activation='softmax')(FC)

model = Model(inputs={'graph_input':[X_in, A_in], 'lstm_input':Feat_input}, outputs=output)

optimizer = RMSprop(lr=learning_rate)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['acc'])
model.summary()
history = model.fit({'graph_input': [X_train], 'lstm_input': x_train_feat }, y_train, batch_size=28, epochs=250,steps_per_epoch=10)

Build a 2D CNN based on graph DAVIS data




In [9]:


nodes = nodes_ts_davis[0]
adj = adj_ts_davis[0]

def cnn1d(input_dim):
    cnn = Sequential() # Create sequential model
    cnn.add(Conv1D(16, 3, activation='relu', input_shape=(input_dim, 1)))
    cnn.add(MaxPooling1D(3))
    cnn.add(Flatten())
    cnn.add(Dropout(0.1))
    cnn.add(Dense(16, activation = 'linear'))
    return cnn

def cnn2d_nodes(nodes):
    cnn = Sequential() # Create sequential model
    cnn.add(Conv2D(16, 3, activation='relu', input_shape=nodes.shape))
    cnn.add(MaxPooling2D(3))
    cnn.add(Flatten())
    cnn.add(Dropout(0.1))
    cnn.add(Dense(16, activation = 'linear'))
    return cnn

def cnn2d_adj(adj):
    cnn = Sequential() # Create sequential model
    cnn.add(Conv2D(16, 3, activation='relu', input_shape=adj.shape))
    cnn.add(MaxPooling2D(3))
    cnn.add(Flatten())
    cnn.add(Dropout(0.1))
    cnn.add(Dense(16, activation = 'linear'))
    return cnn

nodes_cnn = cnn2d_nodes(nodes)
adj_cnn = cnn2d_adj(adj)
prot_cnn = cnn1d(input_dim)
cnn_concat = concatenate([nodes_cnn.output, adj_cnn.output, prot_cnn.output])

final_concat = Model(inputs=[[nodes_cnn.input, adj_cnn.input], prot_cnn.input], outputs=cnn_concat)


final_concat = Dense(1024, activation='relu')(final_concat)
final_concat = Dropout(0.1)(cnn_concat)
final_concat = Dense(16, activation='relu')(final_concat)

final = Dense(1, activation='linear')(final_concat)


# Show model summary
final.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])
final.summary()

NameError: name 'num_drugs' is not defined

In [None]:
# Convert SMILES to graphs
smiles_kiba, convmol_kiba = smiles_graph(LOCAL_KIBA_PATH)
smiles_davis, convmol_davis = smiles_graph(LOCAL_DAVIS_PATH)

Convert graphs to 2D matrix of 1s and 0s

Train a CNN model on DAVIS Data

In [2]:
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import Adam
from keras import Sequential, Model
from keras.layers import Conv1D, MaxPooling1D, Flatten

Prepare graph data for CNN

In [None]:
num_train, num_drugs = drugs_ecfp_tr.shape
num_prot = proteins_tr.shape[1]
drugs_tr_reshape = drugs_ecfp_tr.reshape((num_train, num_drugs, 1))
proteins_tr_reshape = proteins_tr.reshape((num_train, num_prot, 1))

# Testing data
drug_ts_reshape = drugs_ecfp_ts.reshape((drugs_ecfp_ts.shape[0], drugs_ecfp_ts.shape[1], 1))
proteins_ts_reshape = proteins_ts.reshape((proteins_ts.shape[0], proteins_ts.shape[1], 1))