In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import math
import h5py
import pickle
import json
from tensorflow.keras.models import Model
from collections import OrderedDict
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import os
import xgboost as xgb
import random as rn
# Reproducibility / device setup: fix the hash seed, pick a GPU, seed every RNG in use,
# and install a single-threaded TF session as the Keras backend session.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so assigning
# it here may not affect hashing inside the already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # index of the GPU to use

# Seed NumPy and the stdlib `random` module (imported as `rn`).
np.random.seed(1)
rn.seed(1)

# One thread for intra-op and inter-op parallelism.
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
session_conf.gpu_options.allow_growth=True   	  # do not claim all GPU memory up front; grow on demand
from tensorflow.keras import backend as K
tf.set_random_seed(0)
# Build the session on the default graph with the config above and hand it to Keras.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

In [2]:
def cindex_score(y_true, y_pred):
    """Concordance-index style metric computed with TensorFlow ops.

    For every ordered pair selected by the true values (true[i] > true[j],
    restricted to the lower-triangular band), the pair scores 1.0 when the
    predictions are ordered the same way, 0.5 when the predictions are tied,
    and 0.0 otherwise. The result is the summed score divided by the number
    of selected pairs.

    Args:
        y_true: tensor of ground-truth values.
        y_pred: tensor of predicted values, broadcast-compatible with y_true.
            (assumes both are float tensors as supplied by Keras — TODO confirm)

    Returns:
        A scalar tensor: 0.0 when the summed score is zero (which also covers
        the case where no pair is selected and the ratio would be undefined),
        otherwise score / number_of_pairs.
    """
    # Pairwise differences of the predictions, built by broadcasting.
    pred_diff = tf.subtract(tf.expand_dims(y_pred, -1), y_pred)

    # tf.equal is used instead of `==`: with the TF 1.x API used in this file,
    # `tensor == 0.0` compares object identity and evaluates to a plain Python
    # False, so tied predictions would never receive their 0.5 credit.
    pair_score = (
        tf.cast(tf.equal(pred_diff, 0.0), tf.float32) * 0.5
        + tf.cast(pred_diff > 0.0, tf.float32)
    )

    # Mask of pairs whose true values are strictly ordered; keep only the
    # lower-triangular band so each pair is considered once.
    true_order = tf.subtract(tf.expand_dims(y_true, -1), y_true) > 0.0
    pair_mask = tf.matrix_band_part(tf.cast(true_order, tf.float32), -1, 0)

    concordant = tf.reduce_sum(tf.multiply(pair_score, pair_mask))
    total_pairs = tf.reduce_sum(pair_mask)

    # Select 0.0 instead of the ratio whenever the numerator is zero.
    return tf.where(tf.equal(concordant, 0), 0.0, concordant / total_pairs)


In [3]:
def prepare_interaction_pairs(XD, XD_STRU, XT, XT_STRU, Y, rows, cols):
    """Gather per-pair drug/target inputs and their affinity labels.

    For each position k in `rows`, the drug entries are taken at index
    rows[k], the target entries at index cols[k], and the label at
    Y[rows[k], cols[k]].

    Args:
        XD, XD_STRU: indexable collections of drug encodings / drug structure encodings.
        XT, XT_STRU: indexable collections of target encodings / target structure encodings.
        Y: 2-D indexable affinity matrix, addressed as Y[drug, target].
        rows: drug index for every pair.
        cols: target index for every pair (read at the same positions as `rows`).

    Returns:
        (drug_data, drug_stru_data, target_data, target_stru_data, affinity):
        four arrays produced by np.stack over the selected entries, and a plain
        Python list with the selected affinity values.
    """
    pair_count = len(rows)
    drug_idx = [rows[k] for k in range(pair_count)]
    target_idx = [cols[k] for k in range(pair_count)]

    drug_data = np.stack([XD[d] for d in drug_idx])
    drug_stru_data = np.stack([XD_STRU[d] for d in drug_idx])
    target_data = np.stack([XT[t] for t in target_idx])
    target_stru_data = np.stack([XT_STRU[t] for t in target_idx])

    # Labels stay a list (not an array), exactly one entry per pair.
    affinity = [Y[d, t] for d, t in zip(drug_idx, target_idx)]

    return drug_data, drug_stru_data, target_data, target_stru_data, affinity

In [4]:
def generate_h5File_test(XD, XD_STRU, XT, XT_STRU, Y):
    """Write the train/test affinity labels to JSON files.

    Loads the saved Keras model and derives a sub-model ending at the
    'DenseFeature' layer, reads the train/test index lists from disk, gathers
    the corresponding interaction pairs and dumps the labels to 'test_Y.txt'
    and 'train_Y.txt' in the current working directory.

    The feature export to 'DenseFeature.h5' is currently disabled (kept below
    as commented-out code), so the loaded model is not used for prediction;
    loading it still verifies that the model file and the layer exist.

    Args:
        XD, XD_STRU: drug encodings and drug structure encodings.
        XT, XT_STRU: target encodings and target structure encodings.
        Y: 2-D affinity matrix; NaN entries are treated as "no label".

    Returns:
        The string 'SAVE OK'.
    """
    # Coordinates of every labelled (non-NaN) drug/target pair.
    label_row_inds, label_col_inds = np.where(~np.isnan(Y))

    model = keras.models.load_model('3_1_model.h5', custom_objects={'cindex_score': cindex_score})
    # Only consumed by the disabled feature export below.
    new_model = Model(inputs=model.input, outputs=model.get_layer('DenseFeature').output)

    # Index lists refer to positions within the labelled pairs found above.
    with open("data/davis/folds/test_fold_setting1.txt") as fold_file:
        test_sets = json.load(fold_file)
    with open("train_fold_setting_full.txt") as fold_file:
        train_sets = json.load(fold_file)

    terows = label_row_inds[test_sets]
    tecols = label_col_inds[test_sets]
    test_drugs, test_drugs_stru, test_prots, test_prots_stru, test_Y = prepare_interaction_pairs(
        XD, XD_STRU, XT, XT_STRU, Y, terows, tecols)

    trrows = label_row_inds[train_sets]
    trcols = label_col_inds[train_sets]
    train_drugs, train_drugs_stru, train_prots, train_prots_stru, train_Y = prepare_interaction_pairs(
        XD, XD_STRU, XT, XT_STRU, Y, trrows, trcols)

    # Disabled: export of the 'DenseFeature' activations to an HDF5 file.
    #f = h5py.File("DenseFeature.h5","w")
    #NNFeature = new_model.predict([np.array(train_drugs),np.array(train_drugs_stru),np.array(train_prots),np.array(train_prots_stru)])
    #print(len(train_Y))
    #TeNNFeature = new_model.predict([np.array(test_drugs), np.array(test_drugs_stru), np.array(test_prots),np.array(test_prots_stru)])
    #print(NNFeature.shape, TeNNFeature.shape)
    #print(len(test_Y))
    #d1 = f.create_dataset("train_feature",(NNFeature.shape[0], 512),'f', np.array(NNFeature))
    #d2 = f.create_dataset("test_feature",(TeNNFeature.shape[0], 512),'f', np.array(TeNNFeature))
    #f.close()
    with open('test_Y.txt', 'w') as file_object:
        json.dump(test_Y, file_object)
    with open('train_Y.txt', 'w') as file_object:
        json.dump(train_Y, file_object)
    return 'SAVE OK'

In [5]:
# Character vocabulary applied to protein sequences by parse_data.
# Maps each letter to an integer index in 1..25; no entry uses index 0.
CHARPROTSET = { "A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6, 
				"F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12, 
				"O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18, 
				"U": 19, "T": 20, "W": 21, 
				"V": 22, "Y": 23, "X": 24, 
				"Z": 25 }

# Number of entries in CHARPROTSET.
CHARPROTLEN = 25

# Vocabulary applied to the protein structure strings by parse_data.
# presumably C/H/E are secondary-structure classes (coil/helix/strand) — TODO confirm
CHARPROTSTRUSET = {"C": 1,"H": 2,"E": 3}

# Number of entries in CHARPROTSTRUSET.
CHARPROTSTRULEN =3

# SMILES character vocabulary with indices 1..62 (no "/", "@" or "\\" entries).
# Not referenced by the code visible in this notebook.
CHARCANSMISET = { "#": 1, "%": 2, ")": 3, "(": 4, "+": 5, "-": 6, 
			 ".": 7, "1": 8, "0": 9, "3": 10, "2": 11, "5": 12, 
			 "4": 13, "7": 14, "6": 15, "9": 16, "8": 17, "=": 18, 
			 "A": 19, "C": 20, "B": 21, "E": 22, "D": 23, "G": 24,
			 "F": 25, "I": 26, "H": 27, "K": 28, "M": 29, "L": 30, 
			 "O": 31, "N": 32, "P": 33, "S": 34, "R": 35, "U": 36, 
			 "T": 37, "W": 38, "V": 39, "Y": 40, "[": 41, "Z": 42, 
			 "]": 43, "_": 44, "a": 45, "c": 46, "b": 47, "e": 48, 
			 "d": 49, "g": 50, "f": 51, "i": 52, "h": 53, "m": 54, 
			 "l": 55, "o": 56, "n": 57, "s": 58, "r": 59, "u": 60,
			 "t": 61, "y": 62}

# Number of entries in CHARCANSMISET.
CHARCANSMILEN = 62

# SMILES character vocabulary with indices 1..64; adds "/", "@" and "\\" but has no "_".
# This is the vocabulary parse_data uses for the ligand strings.
CHARISOSMISET = {"#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2, 
				"1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6, 
				"9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43, 
				"D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13, 
				"O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51, 
				"V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56, 
				"b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60, 
				"l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64}

# Number of entries in CHARISOSMISET.
CHARISOSMILEN = 64


## ######################## ##
#
#  Encoding Helpers
#
## ######################## ## 

#  Y = -(np.log10(Y/(math.pow(math.e,9))))

def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SMILES string into a (MAX_SMI_LEN, vocabulary size) matrix.

    Args:
        line: the string to encode; characters beyond MAX_SMI_LEN are ignored.
        MAX_SMI_LEN: number of rows in the result.
        smi_ch_ind: mapping from character to its 1-based index.

    Returns:
        A float array whose row k has a single 1 in column (index - 1) of the
        k-th character; rows past the end of `line` stay all-zero.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    encoded = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    clipped = line[:MAX_SMI_LEN]

    for position in range(len(clipped)):
        # Vocabulary indices start at 1, matrix columns at 0.
        column = smi_ch_ind[clipped[position]] - 1
        encoded[position, column] = 1

    return encoded

def one_hot_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a sequence string into a (MAX_SEQ_LEN, vocabulary size) matrix.

    Args:
        line: the string to encode; characters beyond MAX_SEQ_LEN are ignored.
        MAX_SEQ_LEN: number of rows in the result.
        smi_ch_ind: mapping from character to its 1-based index.

    Returns:
        A float array with a single 1 per encoded character, placed in column
        (index - 1); unused trailing rows remain zero.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    matrix = np.zeros((MAX_SEQ_LEN, len(smi_ch_ind)))

    # Column to switch on for each encoded position (1-based index -> 0-based column).
    columns = np.array(
        [smi_ch_ind[residue] - 1 for residue in line[:MAX_SEQ_LEN]],
        dtype=np.intp,
    )
    matrix[np.arange(len(columns)), columns] = 1

    return matrix

def one_hot_structure(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a structure string into a (MAX_SEQ_LEN, vocabulary size) matrix.

    Args:
        line: the string to encode; characters beyond MAX_SEQ_LEN are ignored.
        MAX_SEQ_LEN: number of rows in the result.
        smi_ch_ind: mapping from character to its 1-based index.

    Returns:
        A float array where row k marks the k-th character in column
        (index - 1); rows without a character stay zero.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    encoded = np.zeros((MAX_SEQ_LEN, len(smi_ch_ind)))

    # Iterating the matrix yields row views, so writes land in `encoded`.
    for row, symbol in zip(encoded, line[:MAX_SEQ_LEN]):
        row[smi_ch_ind[symbol] - 1] = 1

    return encoded

def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Integer-label encode a SMILES string into a vector of length MAX_SMI_LEN.

    Args:
        line: the string to encode; characters beyond MAX_SMI_LEN are ignored.
        MAX_SMI_LEN: length of the result.
        smi_ch_ind: mapping from character to its integer index.

    Returns:
        A float array holding the index of each character in order; positions
        past the end of `line` stay 0.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    labels = np.zeros(MAX_SMI_LEN)

    codes = [smi_ch_ind[symbol] for symbol in line[:MAX_SMI_LEN]]
    labels[:len(codes)] = codes

    return labels

def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Integer-label encode a sequence string into a vector of length MAX_SEQ_LEN.

    Args:
        line: the string to encode; characters beyond MAX_SEQ_LEN are ignored.
        MAX_SEQ_LEN: length of the result.
        smi_ch_ind: mapping from character to its integer index.

    Returns:
        A float array with one index per character; trailing positions that
        have no character remain 0.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    encoded = np.zeros(MAX_SEQ_LEN)
    clipped = line[:MAX_SEQ_LEN]

    for position in range(len(clipped)):
        encoded[position] = smi_ch_ind[clipped[position]]

    return encoded

def label_structure(line, MAX_SEQ_LEN, smi_ch_ind):
    """Integer-label encode a structure string into a vector of length MAX_SEQ_LEN.

    Args:
        line: the string to encode; characters beyond MAX_SEQ_LEN are ignored.
        MAX_SEQ_LEN: length of the result.
        smi_ch_ind: mapping from character to its integer index.

    Returns:
        A float array with the index of each character; positions beyond the
        end of `line` are left at 0.

    Raises:
        KeyError: if an encoded character is missing from `smi_ch_ind`.
    """
    vector = np.zeros(MAX_SEQ_LEN)

    position = 0
    for symbol in line[:MAX_SEQ_LEN]:
        vector[position] = smi_ch_ind[symbol]
        position += 1

    return vector

def pure_ligands_structure(line, MAX_SEQ_LEN):
    """Parse a comma-separated string of integers into a vector of length MAX_SEQ_LEN.

    Args:
        line: comma-separated integer fields, e.g. "1,0,1"; fields beyond
            MAX_SEQ_LEN are ignored.
        MAX_SEQ_LEN: length of the result.

    Returns:
        A float array with the parsed values in order; positions with no
        corresponding field remain 0.

    Raises:
        ValueError: if one of the used fields cannot be parsed by int().
    """
    vector = np.zeros(MAX_SEQ_LEN)
    fields = line.split(",")[:MAX_SEQ_LEN]

    for slot in range(len(fields)):
        vector[slot] = int(fields[slot])

    return vector


## ######################## ##
#
#  Dataset parsing (parse_data)
#
## ######################## ## 
# works for large dataset



def parse_data(is_log=True, with_label=True):
    """Load the dataset under 'data/davis/' and encode drugs and targets.

    Reads the ligand strings, protein sequences, protein structure strings and
    ligand fingerprints from JSON files (insertion order preserved), and the
    affinity matrix from the pickled file 'Y'.

    Args:
        is_log: when True, transform the affinities as -log10(Y / 10**9).
        with_label: when True, encode strings as integer label vectors
            (label_* helpers); otherwise as one-hot matrices (one_hot_* helpers).
            Ligand fingerprints are parsed by pure_ligands_structure either way.

    Returns:
        (XD, XD_STRU, XT, XT_STRU, Y): lists of encoded drugs, drug
        fingerprints, encoded targets and encoded target structures, followed
        by the (optionally transformed) affinity matrix.
    """
    fpath = 'data/davis/'
    print("Read %s start" % fpath)
    SMILEN = 85        # max ligand string length that is encoded
    SEQLEN = 1200      # max protein sequence / structure length that is encoded
    SMISTRULEN = 60    # max number of fingerprint fields that are kept
    # NOTE(review): the ligand file is named "ligands_can" but is encoded with the
    # isomeric vocabulary — confirm this pairing is intended.
    charsmiset = CHARISOSMISET
    charseqset = CHARPROTSET
    charseqstruset = CHARPROTSTRUSET

    def _load_json(name):
        # Read one JSON file from the dataset folder, closing the handle afterwards.
        with open(fpath + name) as handle:
            return json.load(handle, object_pairs_hook=OrderedDict)

    ligands = _load_json("ligands_can.txt")
    proteins = _load_json("proteins.txt")
    proteins_structure = _load_json("protein_structure.txt")
    ligands_structure = _load_json("morgan_fingerprint.txt")

    # SECURITY: pickle can execute arbitrary code while loading; only use a trusted 'Y' file.
    with open(fpath + "Y", "rb") as handle:
        Y = pickle.load(handle, encoding='latin1') ### TODO: read from raw
    if is_log:
        Y = -(np.log10(Y/(math.pow(10,9))))

    # Both encodings share the same traversal; only the encoder functions differ.
    if with_label:
        encode_smiles, encode_sequence, encode_structure = label_smiles, label_sequence, label_structure
    else:
        encode_smiles, encode_sequence, encode_structure = one_hot_smiles, one_hot_sequence, one_hot_structure

    XD = []
    XD_STRU = []
    for d in ligands.keys():
        XD.append(encode_smiles(ligands[d], SMILEN, charsmiset))
        XD_STRU.append(pure_ligands_structure(ligands_structure[d], SMISTRULEN))

    XT = []
    XT_STRU = []
    for t in proteins.keys():
        XT.append(encode_sequence(proteins[t], SEQLEN, charseqset))
        XT_STRU.append(encode_structure(proteins_structure[t], SEQLEN, charseqstruset))

    return XD, XD_STRU, XT, XT_STRU, Y

In [6]:
# Load and encode the dataset with integer label vectors (with_label=True); is_log keeps its default.
XD, XD_STRU, XT, XT_STRU, Y = parse_data(with_label=True)

Read data/davis/ start


In [7]:
# Write train_Y.txt / test_Y.txt; the returned status string is shown as the cell output.
generate_h5File_test(XD, XD_STRU, XT, XT_STRU, Y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


'SAVE OK'