In [8]:
import os
import pickle
import timeit

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score, precision_score, recall_score

In [9]:
class CompoundProteinInteractionPrediction(nn.Module):
    def __init__(self):
        super(CompoundProteinInteractionPrediction, self).__init__()
        self.embed_fingerprint = nn.Embedding(n_fingerprint, dim)
        self.embed_word = nn.Embedding(n_word, dim)

        # 定义每一层的图神经网络结构,全连接层，输入输出维度相同都是dim，并且创建layer_gnn个全连接层
        self.W_GNN = nn.ModuleList([nn.Linear(dim, dim) for _ in range(layer_gnn)])

        # 定义长短时循环神经网络，这里我想替换成transformer，但是方法还不成熟，之后试试
        # 输入的参数依次是：词向量的维度，隐藏层的维度，堆叠的层数
        self.Bi_LSTM = nn.LSTM(dim, 5, 1, dropout=0.2, bidirectional=True)

        # 添加注意力机制，这部分参数来自图神经网络
        self.W_attention = nn.Linear(dim, dim)

        # 最终输出的结果
        self.W_out = nn.ModuleList([nn.Linear(2 * dim, 2 * dim) for _ in range(layer_output)])

        # 最最终的结果
        self.W_interaction = nn.Linear(2 * dim, 2)

    def gnn(self, fingerprint_vector, adjacency, layer_num):
        # 这里的layer_num只是作为一个数字，进行循环使用的，使用的数字就是上面函数中定义的layer_gnn的数量
        for i in range(layer_num):
            index = torch.relu(self.W_GNN[i](fingerprint_vector))
            fingerprint_vector = fingerprint_vector + torch.matmul(adjacency, index)
        return torch.unsqueeze(torch.mean(fingerprint_vector, 0), 0)

    def attention_cnn(self, compound_vector, word_vectors, layer_num):
        # 这里的layer_num的意义同上，这个也是循环次数，循环次数是layer_cnn的层数
        word_vectors = torch.unsqueeze(word_vectors, 0)

        # 作者的源代码就是直接进行了BiLSTM，并没有进行前面的Bert训练
        bilstm, _ = self.Bi_LSTM(word_vectors)
        bilstm = torch.squeeze(bilstm, 0)

        h = torch.relu(self.W_attention(compound_vector))
        hs = torch.relu(self.W_attention(bilstm))
        weights = torch.tanh(F.linear(h, hs))

        ys = torch.t(weights) * hs

        return torch.unsqueeze(torch.mean(ys, 0), 0)

    def forward(self, inputs):
        fingerprints, adjacency, word = inputs

        """药物分子处理"""
        fingerprint_vector = self.embed_fingerprint(fingerprints)
        compound_vector = self.gnn(fingerprint_vector, adjacency, layer_gnn)

        """蛋白序列处理"""
        word_vectors = self.embed_word(word)
        protein_vector = self.attention_cnn(compound_vector, word_vectors, layer_cnn)

        """将向量结合并输出交互结果"""
        cat_vector = torch.cat((compound_vector, protein_vector), 1)
        # layer_out目的和之前相同，仅作为数字进行循环使用
        for j in range(layer_output):
            cat_vector = torch.relu(self.W_out[j](cat_vector))
        interaction = self.W_interaction(cat_vector)
        return interaction

    def __call__(self, data, train=True):
        inputs, correct_interaction = data[:-1], data[-1]
        prediction_interaction = self.forward(inputs)

        if train:
            loss = F.cross_entropy(prediction_interaction, correct_interaction)
            return loss
        else:
            correct_labels = correct_interaction.to('cpu').data.numpy()
            ys = F.softmax(prediction_interaction, 1).to('cpu').data.numpy()
            predicted_labels = list(map(lambda x: np.argmax(x), ys))
            predicted_scores = list(map(lambda x: x[1], ys))
            return correct_labels, predicted_labels, predicted_scores


class Trainer(object):
    def __init__(self, model):
        self.model = model
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=weight_decay)

    def train(self, dataset):
        np.random.shuffle(dataset)
        N = len(dataset)
        loss_total = 0
        for data in dataset:
            loss = self.model(data)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            loss_total += loss.to('cpu').data.numpy()
        return loss_total


class Tester(object):
    def __init__(self, model):
        self.model = model

    def test(self, dataset):
        N = len(dataset)
        T, Y, S = [], [], []
        for data in dataset:
            (correct_labels, predicted_labels, predicted_scores) = self.model(data, train=False)
            T.append(correct_labels)
            Y.append(predicted_labels)
            S.append(predicted_scores)
        AUC = roc_auc_score(T, S)
        precision = precision_score(T, Y)
        recall = recall_score(T, Y)
        return AUC, precision, recall

    def save_AUCs(self, AUCs, filename):
        with open(filename, 'a') as f:
            f.write('\t'.join(map(str, AUCs)) + '\n')

    def save_model(self, model, filename):
        torch.save(model.state_dict(), filename)


def load_tensor(file_name, dtype):
    return [dtype(d).to(device) for d in np.load(file_name + '.npy', allow_pickle=True)]


def load_pickle(file_name):
    with open(file_name, 'rb') as f:
        return pickle.load(f)


def shuffle_dataset(dataset, seed):
    np.random.seed(seed)
    np.random.shuffle(dataset)
    return dataset


def split_dataset(dataset, ratio):
    n = int(ratio * len(dataset))
    dataset_1, dataset_2 = dataset[:n], dataset[n:]
    return dataset_1, dataset_2

In [10]:
if __name__ == "__main__":

    """Hyperparameters."""
    (DATASET, radius, ngram, dim, layer_gnn, window, layer_cnn, layer_output,
     lr, lr_decay, decay_interval, weight_decay, iteration,
     setting) = ['human', 2, 3, 10, 3, 11, 3, 3, 1e-3, 0.5, 10, 1e-6, 100,
                 'human--radius2--ngram3--dim10--layer_gnn3--window11--layer_cnn3--layer_output3--lr1e-3--lr_decay0.5--decay_interval10--weight_decay1e-6--iteration100']
    (dim, layer_gnn, window, layer_cnn, layer_output, decay_interval,
     iteration) = map(int, [dim, layer_gnn, window, layer_cnn, layer_output,
                            decay_interval, iteration])
    lr, lr_decay, weight_decay = map(float, [lr, lr_decay, weight_decay])

    """CPU or GPU."""
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('The code uses GPU...')
    else:
        device = torch.device('cpu')
        print('The code uses CPU!!!')

    """Load preprocessed data."""
    #    dir_input = ('../dataset/' + DATASET + '/input/'
    #                 'radius' + str(radius) + '_ngram' + str(ngram) + '/')
    # dir_input = ('../dataset/' + DATASET + '/input/for_train13/')
    dir_input = os.getcwd() + '\\dataset\\' + DATASET
    compounds = load_tensor(dir_input + '\\compounds', torch.LongTensor)
    adjacencies = load_tensor(dir_input + '\\adjacencies', torch.FloatTensor)
    proteins = load_tensor(dir_input + '\\proteins', torch.LongTensor)
    interactions = load_tensor(dir_input + '\\interactions', torch.LongTensor)
    fingerprint_dict = load_pickle(dir_input + '\\fingerprint_dict.pickle')
    word_dict = load_pickle(dir_input + '\\word_dict.pickle')
    n_fingerprint = len(fingerprint_dict)
    n_word = len(word_dict)

    """Create a dataset and split it into train/dev/test."""
    dataset = list(zip(compounds, adjacencies, proteins, interactions))
    dataset = shuffle_dataset(dataset, 1234)
    dataset_train, dataset_ = split_dataset(dataset, 0.8)
    dataset_dev, dataset_test = split_dataset(dataset_, 0.5)

    """Set a model."""
    torch.manual_seed(1234)
    model = CompoundProteinInteractionPrediction().to(device)
    trainer = Trainer(model)
    tester = Tester(model)

    """Output files."""
    output_path = os.getcwd() + '\\new_model_output\\'
    if not os.path.exists(output_path + 'human\\'):
        os.mkdir(output_path + 'human\\')
    file_AUCs = output_path + DATASET + '\\AUCs--' + setting + '.txt'
    file_model = output_path + DATASET + '\\' + setting
    AUCs = ('Epoch\tTime(sec)\tLoss_train\tAUC_dev\t'
            'AUC_test\tPrecision_test\tRecall_test')
    with open(file_AUCs, 'w') as f:
        f.write(AUCs + '\n')

    """Start training."""
    print('Training...(BiLSTM)')
    print(AUCs)
    start = timeit.default_timer()

    for epoch in range(1, iteration):

        if epoch % decay_interval == 0:
            trainer.optimizer.param_groups[0]['lr'] *= lr_decay

        loss_train = trainer.train(dataset_train)
        AUC_dev = tester.test(dataset_dev)[0]
        AUC_test, precision_test, recall_test = tester.test(dataset_test)

        end = timeit.default_timer()
        time = end - start

        AUCs = [epoch, time, loss_train, AUC_dev,
                AUC_test, precision_test, recall_test]
        tester.save_AUCs(AUCs, file_AUCs)
        tester.save_model(model, file_model)

        print('\t'.join(map(str, AUCs)))

The code uses GPU...
Training...(BiLSTM)
Epoch	Time(sec)	Loss_train	AUC_dev	AUC_test	Precision_test	Recall_test
1	24.063715900000034	2183.6258826682606	0.9381303174932558	0.9406742001015744	0.9169139465875371	0.8631284916201117


KeyboardInterrupt: 