In [2]:
!pip install rdkit
!pip install dgl-cu110 -f https://data.dgl.ai/wheels/repo.html
!pip install dgllife

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl-cu110
  Downloading https://data.dgl.ai/wheels/dgl_cu110-0.9.1.post1-cp39-cp39-manylinux1_x86_64.whl (230.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.7/230.7 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dgl-cu110
Successfully installed dgl-cu110-0.9.1.post1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Colle

In [12]:
import matplotlib.pyplot as plt
import os
from rdkit import Chem
from rdkit.Chem import rdmolops, rdmolfiles
from rdkit import RDPaths
 
import dgl
from dgl.nn.pytorch import NNConv
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
from dgl.nn.pytorch import Set2Set
from dgllife.utils import atom_type_one_hot
from dgllife.utils import atom_degree_one_hot
from dgllife.utils import atom_formal_charge
from dgllife.utils import atom_num_radical_electrons
from dgllife.utils import atom_hybridization_one_hot
from dgllife.utils import atom_total_num_H_one_hot
from dgllife.utils import one_hot_encoding
from dgllife.utils import CanonicalAtomFeaturizer
from dgllife.utils import CanonicalBondFeaturizer
from dgllife.utils import ConcatFeaturizer
from dgllife.utils import BaseAtomFeaturizer
from dgllife.utils import BaseBondFeaturizer
from dgllife.utils import bond_type_one_hot,bond_is_conjugated_one_hot,bond_stereo_one_hot,bond_direction_one_hot,bond_is_in_ring_one_hot
from dgllife.utils import one_hot_encoding 
from dgl.data.utils import split_dataset
from dgllife.utils import smiles_to_bigraph
from functools import partial
from sklearn.metrics import roc_auc_score
import csv
import time
import math
from sklearn.metrics import accuracy_score,precision_score,recall_score
from gensim.models import Word2Vec
import sys
sys.path.insert(0,'/content/drive/MyDrive')
from utils.losses import CostSensitiveLoss, CostSensitiveRegularizedLoss

In [3]:
class mydataset:
  """In-memory dataset of (SMILES, protein sequence, label, molecular graph) samples.

  Every CSV row is read as ``smiles, protein, label``. A graph is built eagerly
  for each SMILES string with ``smiles_to_bigraph`` using the notebook-level
  ``atom_featurizer`` and ``bond_featurizer``, so those two names must exist
  before the dataset is instantiated.

  Parameters
  ----------
  is_train_set : bool
      Read the training CSV when True, the test CSV otherwise.
  """
  def __init__(self, is_train_set=True):
    filename = '/content/drive/MyDrive/formaltrainL_cleaned.csv' if is_train_set else '/content/drive/MyDrive/formaltestL_cleaned.csv'
    # newline="" is what the csv module recommends so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(filename, "rt", newline="") as f:
      rows = list(csv.reader(f))
    # The file is already closed here; graph construction can take a while.
    self.smiles = [row[0] for row in rows]
    self.graphs = [smiles_to_bigraph(smi, node_featurizer=atom_featurizer, edge_featurizer=bond_featurizer) for smi in self.smiles]
    self.protein = [row[1] for row in rows]
    self.len_smiles = len(self.smiles)
    self.len_protein = len(self.protein)
    # Labels are kept as the raw CSV text and parsed on access.
    self.label = [row[2] for row in rows]

  def __getitem__(self, index):
    """Return ``(smiles, protein, parsed_label, graph)`` for sample ``index``."""
    import ast
    # SECURITY: this used to be eval(), which executes arbitrary expressions
    # found in the CSV. literal_eval only accepts Python literals (e.g. "0",
    # "1", "1.0") and raises ValueError/SyntaxError on anything else.
    label = ast.literal_eval(self.label[index].strip())
    return self.smiles[index], self.protein[index], label, self.graphs[index]

  def __len__(self):
    """Number of samples (rows read from the CSV)."""
    return self.len_smiles

In [6]:
# Global run configuration and data loaders shared by the training/evaluation cells.
BATCH_SIZE = 16
N_EPOCHS = 10
# NOTE(review): N_CHARS is not referenced anywhere else in the visible notebook;
# presumably the ASCII vocabulary size (the model's embedding table also has
# 128 rows) — confirm or remove.
N_CHARS = 128
# Read by make_graph / get_graph_and_len / create_tensor to decide whether to move data to 'cuda:0'.
USE_GPU = True
# Building a dataset parses the CSV and featurises every molecule up front (see mydataset.__init__).
trainset = mydataset(is_train_set=True)
# NOTE(review): shuffle=False on the *training* loader means every epoch visits
# the samples in file order — confirm this is intended.
trainloader = dgl.dataloading.GraphDataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)
testset = mydataset(is_train_set=False)
testloader = dgl.dataloading.GraphDataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)
# Number of output classes; the decoder's final layer is hard-wired to 2 outputs as well.
n_labels=2

In [4]:
def sequence2list(name):
  """Return the Unicode code point of each character of ``name`` as a list of ints."""
  return list(map(ord, name))
def chirality(atom):
    """Three-element chirality feature for an RDKit-style atom.

    Parameters
    ----------
    atom : object exposing ``HasProp(name)`` and ``GetProp(name)``

    Returns
    -------
    list
        ``[is_R, is_S, chirality_possible]``. The first two entries are the
        one-hot encoding of the ``_CIPCode`` property over ``['R', 'S']`` (both
        False when the atom has no ``_CIPCode``); the last entry is the raw
        result of ``atom.HasProp('_ChiralityPossible')``.
    """
    possible = [atom.HasProp('_ChiralityPossible')]
    # Test for the property explicitly instead of wrapping the lookup in a bare
    # `except:`, which also swallowed unrelated errors (typos, interrupts, ...).
    if not atom.HasProp('_CIPCode'):
        return [False, False] + possible
    return one_hot_encoding(atom.GetProp('_CIPCode'), ['R', 'S']) + possible

# Node featurizer: concatenates, in this order, atom-type one-hot (17 listed
# elements plus an "unknown" slot), degree one-hot over 0..5, formal charge,
# radical-electron count, hybridization one-hot (with "unknown"), a constant
# [0] placeholder, total-H one-hot and the custom `chirality` feature.
# The result is stored under the node-data key 'hv'.
# NOTE(review): the model cells use node_in_feats=41 for these features;
# changing any entry below changes that width — confirm when editing.
atom_featurizer = BaseAtomFeaturizer(
                 {'hv': ConcatFeaturizer([
                  partial(atom_type_one_hot, allowable_set=[
                          'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br', 'Te', 'I', 'At','Na','K'],
                    encode_unknown=True),
                  partial(atom_degree_one_hot, allowable_set=list(range(6))),
                  atom_formal_charge, atom_num_radical_electrons,
                  partial(atom_hybridization_one_hot, encode_unknown=True),
                  lambda atom: [0], # A placeholder for aromatic information,
                    atom_total_num_H_one_hot, chirality
                 ],
                )})
# Edge featurizer: bond type, conjugation, stereo, direction and ring membership
# one-hots concatenated and stored under the edge-data key 'he'.
# NOTE(review): the model cells use edge_in_feats=17 for these features — confirm when editing.
bond_featurizer = BaseBondFeaturizer({"he":ConcatFeaturizer([bond_type_one_hot,bond_is_conjugated_one_hot,bond_stereo_one_hot,bond_direction_one_hot,bond_is_in_ring_one_hot])})

def make_graph(smiles):
  """Featurise SMILES strings and merge them into one batched DGL graph.

  Parameters
  ----------
  smiles : iterable of str
      SMILES strings; the iterable is consumed exactly once.

  Returns
  -------
  (bg, atom_num)
      ``bg`` is the batched graph (moved to ``'cuda:0'`` when ``USE_GPU`` is
      truthy) and ``atom_num`` is the per-molecule node count as a list of ints.
  """
  # The former unused `smiles_list` copy iterated `smiles` a first time, which
  # left nothing for the graph-building pass when a generator was passed in.
  graphs=[smiles_to_bigraph(smile,node_featurizer=atom_featurizer,edge_featurizer=bond_featurizer) for smile in smiles]
  bg=dgl.batch(graphs)
  atom_num=bg.batch_num_nodes().tolist()
  if USE_GPU:
    bg=bg.to('cuda:0')
  return bg,atom_num
def make_tensors(proteins,labels):
    """Encode protein strings into a zero-padded LongTensor and prepare labels.

    Parameters
    ----------
    proteins : sequence of str
        Protein sequences; each character is mapped to its code point.
    labels : tensor
        Batch labels; converted with ``.long()``.

    Returns
    -------
    (protein_tensor, lengths, label_tensor)
        ``protein_tensor`` is ``len(proteins) x max_len`` (right-padded with 0),
        ``lengths`` is the list of unpadded lengths, and both tensors have been
        passed through ``create_tensor``.
    """
    encoded = [sequence2list(p) for p in proteins]
    lengths = list(map(len, encoded))
    labels = labels.long()

    # BatchSize x SeqLen, zero wherever a sequence is shorter than the longest one
    padded = torch.zeros(len(encoded), max(lengths)).long()
    for row, codes in enumerate(encoded):
        padded[row, :len(codes)] = torch.LongTensor(codes)

    protein_batch = create_tensor(padded)
    label_batch = create_tensor(labels)
    return protein_batch, lengths, label_batch

def get_graph_and_len(graphs):
  """Return ``(graphs, node_counts)`` for an already batched graph.

  ``node_counts`` is ``graphs.batch_num_nodes()`` as a list; the graph is moved
  to ``'cuda:0'`` when ``USE_GPU == True`` and returned unchanged otherwise.
  """
  node_counts = graphs.batch_num_nodes().tolist()
  placed = graphs.to('cuda:0') if USE_GPU == True else graphs
  return placed, node_counts

def create_tensor(tensor):
  """Return ``tensor`` moved to ``cuda:0`` when ``USE_GPU`` is truthy, else as-is."""
  if not USE_GPU:
    return tensor
  return tensor.to(torch.device("cuda:0"))

In [5]:
class Encoder(nn.Module):
    """Protein feature extractor.

    A linear projection to ``hid_dim`` followed by ``n_layers`` gated (GLU)
    1-D convolutions, each wrapped in a residual connection scaled by
    ``sqrt(0.5)``, and a final LayerNorm.

    Parameters
    ----------
    protein_dim : int
        Size of the last dimension of the input.
    hid_dim : int
        Hidden/output width.
    n_layers : int
        Number of convolutional blocks (0 is allowed).
    kernel_size : int
        Convolution kernel size; must be odd so "same" padding is symmetric.
    dropout : float
        Dropout probability applied to the input of every convolution.
    """
    def __init__(self, protein_dim, hid_dim, n_layers,kernel_size , dropout):
        super().__init__()

        assert kernel_size % 2 == 1, "Kernel size must be odd (for now)"

        self.input_dim = protein_dim
        self.hid_dim = hid_dim
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        # Registered as a buffer so it follows .to(device) instead of being
        # pinned to cuda:0; non-persistent so state_dict keys (and therefore
        # previously saved checkpoints) are unaffected.
        self.register_buffer("scale", torch.sqrt(torch.FloatTensor([0.5])), persistent=False)
        self.convs = nn.ModuleList([nn.Conv1d(hid_dim, 2*hid_dim, kernel_size, padding=(kernel_size-1)//2) for _ in range(self.n_layers)])   # convolutional layers
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.input_dim, self.hid_dim)
        # `gn` is not used in forward(); it is kept so the module's parameter
        # set stays compatible with existing checkpoints.
        self.gn = nn.GroupNorm(8, hid_dim * 2)
        self.ln = nn.LayerNorm(hid_dim)

    def forward(self, protein):
        """Encode a batch of proteins.

        Parameters
        ----------
        protein : tensor [batch size, protein len, protein_dim]

        Returns
        -------
        tensor [batch size, protein len, hid_dim]
        """
        # [batch, len, protein_dim] -> [batch, hid_dim, len] (channels first for Conv1d)
        conv_input = self.fc(protein).permute(0, 2, 1)
        # With zero conv layers the projected input goes straight to LayerNorm.
        conved = conv_input
        for conv in self.convs:
            # [batch, 2*hid_dim, len]
            conved = conv(self.dropout(conv_input))
            # GLU halves the channel dimension again: [batch, hid_dim, len]
            conved = F.glu(conved, dim=1)
            # scaled residual connection
            conved = (conved + conv_input) * self.scale
            conv_input = conved

        # back to [batch, len, hid_dim] for LayerNorm over the feature axis
        conved = conved.permute(0, 2, 1)
        return self.ln(conved)

In [6]:
class MPNNGNN(nn.Module):
    """MPNN.
    MPNN is introduced in `Neural Message Passing for Quantum Chemistry
    <https://arxiv.org/abs/1704.01212>`__.
    This class performs message passing in MPNN and returns the updated node
    representations, zero-padded into one dense tensor per batch.
    Parameters
    ----------
    node_in_feats : int
        Size for the input node features.
    edge_in_feats : int
        Size for the input edge features.
    node_out_feats : int
        Size for the output node representations. Default to 34.
    edge_hidden_feats : int
        Size for the hidden edge representations. Default to 128.
    num_step_message_passing : int
        Number of message passing steps. Default to 6.
    """
    def __init__(self, node_in_feats, edge_in_feats, node_out_feats=34,
                 edge_hidden_feats=128, num_step_message_passing=6):
        super(MPNNGNN, self).__init__()

        self.project_node_feats = nn.Sequential(
            nn.Linear(node_in_feats, node_out_feats),
            nn.ReLU()
        )
        self.num_step_message_passing = num_step_message_passing
        # Maps each edge feature vector to a (node_out_feats x node_out_feats) matrix for NNConv.
        edge_network = nn.Sequential(
            nn.Linear(edge_in_feats, edge_hidden_feats),
            nn.ReLU(),
            nn.Linear(edge_hidden_feats, node_out_feats * node_out_feats)
        )
        self.gnn_layer = NNConv(
            in_feats=node_out_feats,
            out_feats=node_out_feats,
            edge_func=edge_network,
            aggregator_type='sum'
        )
        self.gru = nn.GRU(node_out_feats, node_out_feats)

    def reset_parameters(self):
        """Reinitialize model parameters."""
        self.project_node_feats[0].reset_parameters()
        self.gnn_layer.reset_parameters()
        for layer in self.gnn_layer.edge_func:
            if isinstance(layer, nn.Linear):
                layer.reset_parameters()
        self.gru.reset_parameters()

    def forward(self, g, node_feats, edge_feats,node_out_feats=None):
        """Performs message passing and returns padded per-molecule node features.
        Parameters
        ----------
        g : DGLGraph
            DGLGraph for a batch of graphs. It is not modified.
        node_feats : float32 tensor of shape (V, node_in_feats)
            Input node features. V for the number of nodes in the batch of graphs.
        edge_feats : float32 tensor of shape (E, edge_in_feats)
            Input edge features. E for the number of edges in the batch of graphs.
        node_out_feats : int, optional
            Ignored; accepted only for backward compatibility. The padded width
            is always the width the module was constructed with.
        Returns
        -------
        mol_embedding : float32 tensor of shape (B, max_nodes, node_out_feats)
            Node representations grouped by graph and zero-padded to the largest
            graph in the batch, on the same device as ``node_feats``.
        """
        node_feats = self.project_node_feats(node_feats) # (V, node_out_feats)
        hidden_feats = node_feats.unsqueeze(0)           # (1, V, node_out_feats)

        for _ in range(self.num_step_message_passing):
            node_feats = F.relu(self.gnn_layer(g, node_feats, edge_feats))
            node_feats, hidden_feats = self.gru(node_feats.unsqueeze(0), hidden_feats)
            node_feats = node_feats.squeeze(0)

        # Nodes of a batched graph are stored graph after graph, so splitting by
        # the per-graph node counts recovers each molecule's rows. This replaces
        # writing into g.ndata["hv"] (which clobbered the caller's input
        # features), unbatching, and copying row by row through a CPU buffer.
        nodes_per_graph = g.batch_num_nodes().tolist()
        per_graph = torch.split(node_feats, nodes_per_graph, dim=0)
        return torch.nn.utils.rnn.pad_sequence(per_graph, batch_first=True)

In [7]:
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention (used for both self- and cross-attention).

    Parameters
    ----------
    hid_dim : int
        Model width; must be divisible by ``n_heads``.
    n_heads : int
        Number of attention heads.
    dropout : float
        Dropout probability applied to the attention weights.
    """
    def __init__(self, hid_dim, n_heads, dropout):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_heads = n_heads

        assert hid_dim % n_heads == 0

        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)

        self.fc = nn.Linear(hid_dim, hid_dim)

        self.do = nn.Dropout(dropout)

        # sqrt(head_dim). A non-persistent buffer follows .to(device) (no
        # hard-coded cuda:0) and stays out of state_dict, so checkpoints saved
        # before this change still load.
        self.register_buffer("scale", torch.sqrt(torch.FloatTensor([hid_dim // n_heads])), persistent=False)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Parameters
        ----------
        query : tensor [batch size, len_Q, hid dim]
        key, value : tensor [batch size, len_K, hid dim]
        mask : tensor broadcastable to [batch size, n heads, len_Q, len_K], optional
            Positions where ``mask == 0`` get an energy of -1e10 before softmax.

        Returns
        -------
        tensor [batch size, len_Q, hid dim]
        """
        bsz = query.shape[0]
        head_dim = self.hid_dim // self.n_heads

        # project, then split into heads: [batch, n heads, len, head dim]
        Q = self.w_q(query).view(bsz, -1, self.n_heads, head_dim).permute(0, 2, 1, 3)
        K = self.w_k(key).view(bsz, -1, self.n_heads, head_dim).permute(0, 2, 1, 3)
        V = self.w_v(value).view(bsz, -1, self.n_heads, head_dim).permute(0, 2, 1, 3)

        # energy = [batch size, n heads, len_Q, len_K]
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = self.do(F.softmax(energy, dim=-1))

        # x = [batch size, n heads, len_Q, head dim]
        x = torch.matmul(attention, V)

        # merge heads back: [batch size, len_Q, hid dim]; the tensor stays on
        # whatever device the inputs live on.
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(bsz, -1, self.n_heads * head_dim)

        return self.fc(x)

In [8]:
class PositionwiseFeedforward(nn.Module):
    """Two-layer position-wise feed-forward block built from kernel-size-1 convolutions.

    Parameters
    ----------
    hid_dim : int
        Input and output width.
    pf_dim : int
        Width of the intermediate layer.
    dropout : float
        Dropout probability applied after the ReLU.
    """
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.hid_dim, self.pf_dim = hid_dim, pf_dim

        # 1x1 convolutions act independently on every sequence position
        self.fc_1 = nn.Conv1d(hid_dim, pf_dim, 1)
        self.fc_2 = nn.Conv1d(pf_dim, hid_dim, 1)

        self.do = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape [batch size, sent len, hid dim] to a tensor of the same shape."""
        # Conv1d wants channels first: [batch size, hid dim, sent len]
        channels_first = x.permute(0, 2, 1)

        # [batch size, pf dim, sent len]
        widened = self.fc_1(channels_first)
        activated = self.do(F.relu(widened))

        # [batch size, hid dim, sent len]
        narrowed = self.fc_2(activated)

        # restore [batch size, sent len, hid dim]
        return narrowed.permute(0, 2, 1)

In [9]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Each sub-layer is followed by dropout, a residual addition and the single
    LayerNorm instance ``self.ln`` that all three sub-layers share.
    """
    def __init__(self, hid_dim, n_heads, pf_dim , dropout):
        super().__init__()

        self.ln = nn.LayerNorm(hid_dim)
        self.sa = SelfAttention(hid_dim, n_heads, dropout)   # compound -> compound
        self.ea = SelfAttention(hid_dim, n_heads, dropout)   # compound -> protein
        self.pf = PositionwiseFeedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)

    def _add_and_norm(self, residual, sublayer_out):
        """Dropout the sub-layer output, add the residual and normalise."""
        return self.ln(residual + self.do(sublayer_out))

    def forward(self, trg, src, trg_mask=None, src_mask=None):
        """Run one decoder block.

        trg      : [batch_size, compound len, hid_dim]
        src      : [batch_size, protein len, hid_dim], the encoder output
        trg_mask : mask forwarded to the self-attention over ``trg``
        src_mask : mask forwarded to the cross-attention over ``src``
        """
        self_attended = self.sa(trg, trg, trg, trg_mask)
        trg = self._add_and_norm(trg, self_attended)

        cross_attended = self.ea(trg, src, src, src_mask)
        trg = self._add_and_norm(trg, cross_attended)

        fed_forward = self.pf(trg)
        return self._add_and_norm(trg, fed_forward)

In [10]:
class Decoder(nn.Module):
    """Compound feature extraction and interaction classification head.

    Projects atom features to ``hid_dim``, runs ``n_layers`` decoder blocks that
    attend to the encoded protein, pools the atoms with norm-based softmax
    weights and maps the pooled vector to 2 logits.

    Parameters
    ----------
    atom_dim : int
        Width of the incoming atom features.
    hid_dim : int
        Model width.
    n_layers : int
        Number of ``DecoderLayer`` blocks.
    n_heads : int
        Attention heads per block.
    pf_dim : int
        Hidden width of the position-wise feed-forward blocks.
    dropout : float
        Dropout probability.
    """
    def __init__(self, atom_dim, hid_dim, n_layers, n_heads, pf_dim, dropout):
        super().__init__()
        # `ln`, `do` and `gn` are not used in forward(); they are kept so the
        # module's parameters stay compatible with existing checkpoints.
        self.ln = nn.LayerNorm(hid_dim)
        self.output_dim = atom_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.dropout = dropout
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim , dropout) for _ in range(n_layers)])
        self.ft = nn.Linear(atom_dim, hid_dim)
        self.do = nn.Dropout(dropout)
        self.fc_1 = nn.Linear(hid_dim, 256)
        self.fc_2 = nn.Linear(256, 2)
        self.gn = nn.GroupNorm(8, 256)

    def forward(self, trg, src, trg_mask=None,src_mask=None):
        """Return class logits of shape [batch size, 2].

        trg : [batch_size, compound len, atom_dim]
        src : [batch_size, protein len, hid_dim], the encoder output
        trg_mask, src_mask : forwarded unchanged to every decoder layer
        """
        # trg = [batch size, compound len, hid dim]
        trg = self.ft(trg)

        for layer in self.layers:
            trg = layer(trg, src,trg_mask,src_mask)

        # Use the L2 norm of each atom vector to decide how much it contributes.
        # NOTE: the softmax runs over every position, padded ones included,
        # exactly as before; changing that would alter trained-model outputs.
        atom_weights = F.softmax(torch.norm(trg, dim=2), dim=1)   # [batch size, compound len]

        # Weighted sum over atoms -> [batch size, hid dim]. Vectorised
        # replacement for the nested Python loops; it also stays on trg's
        # device instead of assuming cuda:0.
        pooled = (trg * atom_weights.unsqueeze(-1)).sum(dim=1)

        label = F.relu(self.fc_1(pooled))
        label = self.fc_2(label)
        return label

In [11]:
class TransformerClsassifier(nn.Module):
    """Compound-protein interaction classifier.

    An MPNN embeds the compound graph, a character embedding plus convolutional
    ``Encoder`` embeds the protein, and a transformer ``Decoder`` combines both
    into 2 class logits.

    Parameters
    ----------
    protein_dim : int
        Width of the protein character embedding (input width of the encoder).
    hid_dim, n_layers, n_heads, kernel_size, pf_dim, dropout
        Forwarded to ``Encoder`` / ``Decoder``.
    atom_dim : int
        Input width expected by the decoder; must equal ``node_out_feats``
        because the MPNN output is fed straight into the decoder.
    node_in_feats, edge_in_feats : int
        Widths of the graph's ``'hv'`` node and ``'he'`` edge features.
    node_out_feats, edge_hidden_feats, num_step_message_passing
        Forwarded to ``MPNNGNN`` (previously accepted but silently replaced by
        the hard-coded values 34 / 128 / 6).
    """
    def __init__(self,protein_dim, hid_dim, atom_dim, n_layers, n_heads, kernel_size, pf_dim, dropout, node_in_feats, edge_in_feats, node_out_feats=34,
                 edge_hidden_feats=128, num_step_message_passing=6,):
        super().__init__()

        self.node_out_feats = node_out_feats
        self.encoder = Encoder(protein_dim, hid_dim, n_layers, kernel_size, dropout)
        self.decoder = Decoder(atom_dim, hid_dim, n_layers, n_heads, pf_dim, dropout)
        # Only used by gcn(), which forward() does not call; kept because the
        # parameter is part of saved checkpoints.
        self.weight = nn.Parameter(torch.empty(atom_dim, atom_dim))
        self.init_weight()
        self.mpnn = MPNNGNN(node_in_feats, edge_in_feats, node_out_feats=node_out_feats,
                            edge_hidden_feats=edge_hidden_feats,
                            num_step_message_passing=num_step_message_passing)
        # 128 rows: one per 7-bit ASCII code produced by ord() on the sequence characters.
        # The width follows protein_dim (formerly a literal 100) so it always matches the encoder input.
        self.embedding = torch.nn.Embedding(128, protein_dim)

    def init_weight(self):
        """Uniformly initialise ``self.weight`` in ``[-1/sqrt(n), 1/sqrt(n)]``, n = number of columns."""
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)

    def gcn(self, input, adj):
        """Single graph-convolution step ``adj @ (input @ weight)``.

        input : [batch, num_node, atom_dim]
        adj   : [batch, num_node, num_node]
        Returns a tensor of shape [batch, num_node, atom_dim].
        """
        support = torch.matmul(input, self.weight)
        output = torch.bmm(adj, support)
        return output

    def make_masks(self, atom_num, protein_num, compound_max_len, protein_max_len, device=None):
        """Build 0/1 float padding masks for the compound and the protein.

        Parameters
        ----------
        atom_num, protein_num : list of int
            True (unpadded) length of every sample.
        compound_max_len, protein_max_len : int
            Padded lengths.
        device : torch.device, optional
            Where to create the masks; defaults to the device of the model
            parameters (formerly always cuda:0).

        Returns
        -------
        (compound_mask, protein_mask)
            Shapes [N, 1, compound_max_len, 1] and [N, 1, 1, protein_max_len].
        """
        if device is None:
            device = self.weight.device
        atom_counts = torch.as_tensor(atom_num, device=device).unsqueeze(1)
        protein_counts = torch.as_tensor(protein_num, device=device).unsqueeze(1)
        # position index < true length  <=>  real (non-padded) position
        compound_mask = (torch.arange(compound_max_len, device=device).unsqueeze(0) < atom_counts).float()
        protein_mask = (torch.arange(protein_max_len, device=device).unsqueeze(0) < protein_counts).float()
        # NOTE(review): the compound mask is laid out along the query axis
        # ([N, 1, L, 1]) while the protein mask is along the key axis — kept
        # as-is because trained checkpoints depend on it; confirm intent.
        compound_mask = compound_mask.unsqueeze(1).unsqueeze(3)
        protein_mask = protein_mask.unsqueeze(1).unsqueeze(2)
        return compound_mask, protein_mask

    def forward(self, g,protein,atom_num,protein_num):
        """Return logits of shape [batch size, 2].

        g           : batched compound graph with ``ndata['hv']`` / ``edata['he']``
        protein     : LongTensor [batch, protein len] of character codes
        atom_num    : list with the number of atoms of each compound
        protein_num : list with the unpadded length of each protein
        """
        # compound = [batch, max atoms, node_out_feats]
        compound = self.mpnn(g, g.ndata["hv"], g.edata["he"], node_out_feats=self.node_out_feats)
        compound_max_len = compound.shape[1]
        # protein = [batch, protein len, protein_dim]
        protein = self.embedding(protein)
        protein_max_len = protein.shape[1]
        compound_mask, protein_mask = self.make_masks(atom_num, protein_num, compound_max_len, protein_max_len, device=protein.device)

        # enc_src = [batch size, protein len, hid dim]
        enc_src = self.encoder(protein)

        # out = [batch size, 2]
        out = self.decoder(compound, enc_src, compound_mask, protein_mask)
        return out

In [14]:
# Scratch/debug cell: re-creates the classifier's forward pass piece by piece on
# the first 64 training rows to check tensor devices and shapes.
device=torch.device("cuda:0")
# Hyperparameters (same names as the arguments of the model classes).
protein_dim = 100
atom_dim = 34
hid_dim = 64
n_layers = 2
n_heads = 8
pf_dim = 256
dropout = 0.1
kernel_size = 9
node_in_feats=41
node_out_feats=34
edge_in_feats=17
edge_hidden_feats=128
num_step_message_passing=6
filename = '/content/drive/MyDrive/formaltrainL_cleaned.csv'
with open(filename,"rt") as f:
  reader=csv.reader(f)
  rows=list(reader)
  # CSV columns: 0 = SMILES, 1 = protein sequence, 2 = label
  smiles=[row[0] for row in rows]
  protein=[row[1] for row in rows]
  # NOTE(review): eval() executes whatever text is in the label column; only
  # safe for trusted files — consider ast.literal_eval or int().
  labels=[eval(row[2]) for row in rows]
# Keep only the first 64 samples for this check.
smiles=smiles[:64]
protein=protein[:64]
labels=torch.LongTensor(labels[:64])
protein_inputs,protein_seq_lengths,target = make_tensors(protein,labels)
g,atom_num=make_graph(smiles)
mpnn=MPNNGNN(node_in_feats=node_in_feats,
      node_out_feats=node_out_feats,
      edge_in_feats=edge_in_feats,
      edge_hidden_feats=edge_hidden_feats,
      num_step_message_passing=num_step_message_passing)
mpnn=mpnn.to(device)
# Padded per-molecule atom embeddings; the recorded output shows torch.Size([64, 92, 34]).
compound=mpnn(g,g.ndata["hv"],g.edata["he"])
compound_max_len = compound.shape[1]
embedding = torch.nn.Embedding(128,100).to(device)
# `protein` is rebound here from the list of strings to the embedded tensor
# (recorded output: torch.Size([64, 1775, 100])).
protein = embedding(protein_inputs)
protein_max_len = protein.shape[1]

def make_masks(atom_num, protein_num, compound_max_len, protein_max_len):
    """Build 0/1 padding masks for a batch and move them to the global ``device``.

    Row ``i`` of each mask holds ones for the first ``atom_num[i]`` /
    ``protein_num[i]`` positions and zeros afterwards. The compound mask is
    returned with shape [N, 1, compound_max_len, 1], the protein mask with
    shape [N, 1, 1, protein_max_len].
    """
    batch_size = len(atom_num)  # one row per sample
    flat_compound = torch.zeros((batch_size, compound_max_len))
    flat_protein = torch.zeros((batch_size, protein_max_len))
    for row in range(batch_size):
        flat_compound[row, :atom_num[row]] = 1
        flat_protein[row, :protein_num[row]] = 1
    # indexing with None inserts the same singleton axes as unsqueeze(1) followed by unsqueeze(3) / unsqueeze(2)
    compound_mask = flat_compound[:, None, :, None].to(device)
    protein_mask = flat_protein[:, None, None, :].to(device)
    return compound_mask, protein_mask

# Continue the step-by-step forward pass: masks, encoder, one hand-built decoder
# layer, norm-weighted pooling and the classification head.
compound_mask, protein_mask = make_masks(atom_num, protein_seq_lengths, compound_max_len, protein_max_len)

encoder = Encoder(protein_dim, hid_dim, n_layers, kernel_size, dropout).to(device)
enc_src = encoder(protein)

# Same projection as Decoder.ft: atom_dim -> hid_dim.
ft = nn.Linear(atom_dim, hid_dim).to(device)

trg = ft(compound)
#layer=DecoderLayer(hid_dim, n_heads, pf_dim , dropout).to(device)
#trg = layer(trg, enc_src,compound_mask, protein_mask)
# Device checks; the recorded output shows cuda:0 for all of them.
print(enc_src.device)
print(compound.device)
print(trg.device)
print(compound_mask.device)
print(protein_mask.device)
# The pieces of one DecoderLayer, instantiated separately so each step can be inspected.
ln = nn.LayerNorm(hid_dim).to(device)
ea = SelfAttention(hid_dim, n_heads, dropout).to(device)
sa = SelfAttention(hid_dim, n_heads, dropout).to(device)
do = nn.Dropout(dropout).to(device)
pf = PositionwiseFeedforward(hid_dim, pf_dim, dropout).to(device)
# Same three residual + LayerNorm steps as DecoderLayer.forward.
trg = ln(trg + do(sa(trg, trg, trg, compound_mask)))
trg = ln(trg + do(ea(trg, enc_src, enc_src, protein_mask)))
trg = ln(trg + do(pf(trg)))

fc_1 = nn.Linear(hid_dim, 256).to(device)
fc_2 = nn.Linear(256, 2).to(device)

# Per-atom L2 norm, softmax-normalised over the atoms of each molecule.
norm = torch.norm(trg, dim=2)
       
norm = F.softmax(norm, dim=1)
print(norm.device)      
# NOTE(review): `sum` shadows the Python builtin for the rest of the kernel
# session — consider renaming.
sum = torch.zeros((trg.shape[0], hid_dim)).to(device)
# Norm-weighted sum of the atom vectors, one molecule (i) and one atom (j) at a time.
for i in range(norm.shape[0]):
  for j in range(norm.shape[1]):
                v = trg[i, j, ]
                v = v * norm[i, j]
                sum[i, ] += v
        # sum = [batch size,hid_dim]
label = F.relu(fc_1(sum))
label = fc_2(label)



# Shape/device summary of the intermediate tensors.
print(compound.shape,compound_max_len,protein.shape,protein_max_len,sep="\n")
print(compound_mask.shape,protein_mask.shape)
print(enc_src.shape)
print(label.device)

  assert input.numel() == input.storage().size(), "Cannot convert view " \


cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
cuda:0
torch.Size([64, 92, 34])
92
torch.Size([64, 1775, 100])
1775
torch.Size([64, 1, 92, 1]) torch.Size([64, 1, 1, 1775])
torch.Size([64, 1775, 64])
cuda:0


In [15]:
def time_since(since):
  """Format the wall-clock time elapsed since ``since`` (a ``time.time()`` value) as ``'<m>m <s>s'``."""
  elapsed = time.time() - since
  minutes = math.floor(elapsed / 60)
  seconds = elapsed - minutes * 60
  return '%dm %ds' % (minutes, seconds)

def trainModel():
  """Run one training epoch over ``trainloader`` and return the summed batch losses.

  Relies on the notebook globals ``classifier``, ``criterion``, ``optimizer``,
  ``trainloader``, ``trainset``, ``start`` and ``epoch``. Progress is printed
  every 10 batches.
  """
  # Make sure dropout is active even if the model was left in eval mode by an
  # earlier cell (e.g. after an evaluation or a checkpoint load).
  classifier.train()
  total_loss = 0
  seen = 0  # samples processed so far (exact, also when a batch is not full)
  for i, (smiles,protein,labels,graphs) in enumerate(trainloader, 1):
    protein_inputs,protein_seq_lengths,target = make_tensors(protein,labels)
    g,atom_num=get_graph_and_len(graphs)
    output = classifier(g,protein_inputs,atom_num,protein_seq_lengths)
    loss = criterion(output, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    seen += len(protein_inputs)
    if i % 10 == 0:
      print(f'[{time_since(start)}] Epoch {epoch} ', end='')
      print(f'[{seen}/{len(trainset)}] ', end='')
      # Summed batch losses divided by the number of samples seen (same
      # quantity as before whenever every batch is full).
      print(f'loss={total_loss / seen}')
  return total_loss

In [16]:
def testModel():
  """Evaluate ``classifier`` on ``testloader`` and return the accuracy.

  Prints accuracy, precision and recall computed once over the whole test set.
  Relies on the notebook globals ``classifier``, ``testloader`` and ``testset``.

  Returns
  -------
  float
      ``correct / len(testset)``.
  """
  correct = 0
  total = len(testset)
  all_preds = []
  all_targets = []
  print("evaluating trained model ...")
  # Evaluate with dropout disabled, then restore whatever mode the model was in
  # so a following training epoch is unaffected.
  was_training = classifier.training
  classifier.eval()
  try:
    with torch.no_grad():
      for smiles,protein,labels,graphs in testloader:
        protein_inputs,protein_seq_lengths,target = make_tensors(protein,labels)
        g,atom_num=get_graph_and_len(graphs)
        output = classifier(g,protein_inputs,atom_num,protein_seq_lengths)
        predresult = torch.max(output,1)[1]
        correct += predresult.eq(target.view_as(predresult)).sum().item()
        all_preds.extend(predresult.tolist())
        all_targets.extend(target.tolist())
  finally:
    classifier.train(was_training)
  # scikit-learn metrics take (y_true, y_pred). The arguments used to be passed
  # the other way round, which swaps precision and recall, and the metrics were
  # averaged per batch, which is not the dataset-level value and triggered
  # ill-defined-metric warnings for batches without positives.
  accuracy=accuracy_score(all_targets,all_preds)
  precision=precision_score(all_targets,all_preds)
  recall=recall_score(all_targets,all_preds)
  percent = '%.2f' % (100 * correct / total)
  print(f'Test set: Accuracy {correct}/{total} {percent}%')
  print("accuracy:",accuracy)
  print("precision:",precision)
  print("recall:",recall)
  return correct / total

In [None]:
# Training driver: hyperparameters, model/optimizer construction and the epoch loop.
protein_dim = 100
atom_dim = 34
hid_dim = 64
n_layers = 3
n_heads = 8
pf_dim = 256
dropout = 0.1
kernel_size = 9
node_in_feats=41
node_out_feats=34
edge_in_feats=17
edge_hidden_feats=128
num_step_message_passing=6
if __name__ == '__main__':
  if USE_GPU:
    device=torch.device("cuda:0")
  else:
    device=torch.device("cpu")
  # node_out_feats / edge_hidden_feats / num_step_message_passing are not passed
  # here, so the classifier's own defaults apply.
  classifier = TransformerClsassifier(protein_dim, hid_dim, atom_dim, n_layers, n_heads, kernel_size, pf_dim, dropout, node_in_feats, edge_in_feats)
  classifier.to(device)
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)
  # Settings for the alternative cost-sensitive losses below (currently disabled).
  base_loss = 'ce'
  lambd = 10
  #criterion = CostSensitiveRegularizedLoss(n_classes=2, base_loss=base_loss, lambd=lambd)
  #criterion = CostSensitiveLoss(2)
  # `start` and `epoch` are read as globals by trainModel() for its progress output.
  start = time.time()
  print("Training for %d epochs..." % N_EPOCHS)
  acc_list = []
  for epoch in range(1, N_EPOCHS + 1):
  # Train cycle
    trainModel()
    # testModel() returns the test accuracy for this epoch.
    acc = testModel()
    acc_list.append(acc)

Training for 10 epochs...


  assert input.numel() == input.storage().size(), "Cannot convert view " \


[0m 4s] Epoch 1 [160/131468] loss=0.026921712001785635
[0m 9s] Epoch 1 [320/131468] loss=0.02664096022490412
[0m 13s] Epoch 1 [480/131468] loss=0.022228752643180392
[0m 16s] Epoch 1 [640/131468] loss=0.02238082526018843
[0m 19s] Epoch 1 [800/131468] loss=0.021279379706829787
[0m 22s] Epoch 1 [960/131468] loss=0.020702884804146986
[0m 25s] Epoch 1 [1120/131468] loss=0.020806267677939364
[0m 28s] Epoch 1 [1280/131468] loss=0.020614285464398564
[0m 32s] Epoch 1 [1440/131468] loss=0.02100287632395824
[0m 34s] Epoch 1 [1600/131468] loss=0.020850423322990537
[0m 38s] Epoch 1 [1760/131468] loss=0.020735065812583674
[0m 41s] Epoch 1 [1920/131468] loss=0.02079524550354108
[0m 44s] Epoch 1 [2080/131468] loss=0.020695036467021474
[0m 47s] Epoch 1 [2240/131468] loss=0.020287898695096375
[0m 50s] Epoch 1 [2400/131468] loss=0.020110088993484775
[0m 54s] Epoch 1 [2560/131468] loss=0.020326282185851596
[0m 57s] Epoch 1 [2720/131468] loss=0.020479111398077187
[0m 59s] Epoch 1 [2880/131468] loss=0.02062

  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cannot convert view " \
  _warn_prf(average, modifier, msg_start, len(result))
  assert input.numel() == input.storage().size(), "Cann

In [None]:
# Persist only the trained weights (state_dict); reloading requires rebuilding
# TransformerClsassifier with the same hyperparameters first.
torch.save(classifier.state_dict(),"/content/drive/MyDrive/my_model/transformer_ES_L_nlayer_3.pth")

In [17]:
# Rebuild the 2-layer architecture and load its saved weights for inference.
protein_dim = 100
atom_dim = 34
hid_dim = 64
n_layers = 2
n_heads = 8
pf_dim = 256
dropout = 0.1
kernel_size = 9
node_in_feats=41
node_out_feats=34
edge_in_feats=17
edge_hidden_feats=128
num_step_message_passing=6
model1 = TransformerClsassifier(protein_dim, hid_dim, atom_dim, n_layers, n_heads, kernel_size, pf_dim, dropout, node_in_feats, edge_in_feats).to(device)
# map_location makes the load work regardless of the device the checkpoint was
# saved from (consistent with how the model2 checkpoint is loaded).
model1.load_state_dict(torch.load("/content/drive/MyDrive/my_model/transformer_ES.pth",map_location=device))
# eval() switches off dropout and returns the module, so the cell displays its structure.
model1.eval()

TransformerClsassifier(
  (encoder): Encoder(
    (convs): ModuleList(
      (0-1): 2 x Conv1d(64, 128, kernel_size=(9,), stride=(1,), padding=(4,))
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (fc): Linear(in_features=100, out_features=64, bias=True)
    (gn): GroupNorm(8, 128, eps=1e-05, affine=True)
    (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Decoder(
    (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-1): 2 x DecoderLayer(
        (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (sa): SelfAttention(
          (w_q): Linear(in_features=64, out_features=64, bias=True)
          (w_k): Linear(in_features=64, out_features=64, bias=True)
          (w_v): Linear(in_features=64, out_features=64, bias=True)
          (fc): Linear(in_features=64, out_features=64, bias=True)
          (do): Dropout(p=0.1, inplace=False)
        )
        (ea): SelfAttention(
          (w_q): Linear(i

In [13]:
# Rebuild the 3-layer architecture and load its saved weights for inference.
protein_dim = 100
atom_dim = 34
hid_dim = 64
n_layers = 3
n_heads = 8
pf_dim = 256
dropout = 0.1
kernel_size = 9
node_in_feats=41
node_out_feats=34
edge_in_feats=17
edge_hidden_feats=128
num_step_message_passing=6
device=torch.device("cuda:0")
model2 = TransformerClsassifier(protein_dim, hid_dim, atom_dim, n_layers, n_heads, kernel_size, pf_dim, dropout, node_in_feats, edge_in_feats).to(device)
# map_location places the checkpoint tensors on `device` while loading.
model2.load_state_dict(torch.load("/content/drive/MyDrive/my_model/transformer_ES_M.pth",map_location=device))
# eval() switches off dropout and returns the module, so the cell displays its structure.
model2.eval()

TransformerClsassifier(
  (encoder): Encoder(
    (convs): ModuleList(
      (0-2): 3 x Conv1d(64, 128, kernel_size=(9,), stride=(1,), padding=(4,))
    )
    (dropout): Dropout(p=0.1, inplace=False)
    (fc): Linear(in_features=100, out_features=64, bias=True)
    (gn): GroupNorm(8, 128, eps=1e-05, affine=True)
    (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Decoder(
    (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0-2): 3 x DecoderLayer(
        (ln): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (sa): SelfAttention(
          (w_q): Linear(in_features=64, out_features=64, bias=True)
          (w_k): Linear(in_features=64, out_features=64, bias=True)
          (w_v): Linear(in_features=64, out_features=64, bias=True)
          (fc): Linear(in_features=64, out_features=64, bias=True)
          (do): Dropout(p=0.1, inplace=False)
        )
        (ea): SelfAttention(
          (w_q): Linear(i

In [15]:
USE_GPU=True
def make_tensors(proteins):
    """Inference-time encoder for protein strings (no labels).

    Note that this definition replaces the two-argument ``make_tensors`` defined
    earlier in the notebook once this cell has been executed.

    Parameters
    ----------
    proteins : sequence of str
        Protein sequences; each character is mapped to its code point.

    Returns
    -------
    (protein_tensor, lengths)
        A ``len(proteins) x max_len`` LongTensor right-padded with zeros (passed
        through ``create_tensor``) and the list of unpadded lengths.
    """
    encoded = [sequence2list(p) for p in proteins]
    lengths = list(map(len, encoded))

    # BatchSize x SeqLen, zero wherever a sequence is shorter than the longest one
    padded = torch.zeros(len(encoded), max(lengths)).long()
    for row, codes in enumerate(encoded):
        padded[row, :len(codes)] = torch.LongTensor(codes)

    protein_batch = create_tensor(padded)
    return protein_batch, lengths

# Score every (protein, compound) pair listed in donglab.csv with model2 and
# collect the identifiers of the rows predicted as class 1.
# Row layout as used below: column 0 = protein sequence, column 1 = identifier
# reported for hits, column 2 = SMILES.
# The context manager closes the file even if parsing fails.
with open("/content/drive/MyDrive/donglab.csv","r") as f1:
  predata=[line.replace("\n","").split(",") for line in f1]
print(len(predata))
check=[]
smiles=[i[2] for i in predata]
proteins=[i[0] for i in predata]
graphs=[smiles_to_bigraph(smile,node_featurizer=atom_featurizer,edge_featurizer=bond_featurizer) for smile in smiles]
bg=dgl.batch(graphs)
g,atom_num=get_graph_and_len(bg)
proteins_inputs,protein_seq_lengths=make_tensors(proteins)
# Pure inference: skip autograd bookkeeping so no graph is kept for the whole batch.
with torch.no_grad():
  out=model2(g,proteins_inputs,atom_num,protein_seq_lengths)
# Predicted class = index of the larger of the two logits.
result_list=torch.max(out,1)[1].tolist()
print(result_list)
print(len(result_list))
hitidlist=[predata[i][1] for i in range(len(result_list)) if result_list[i]==1]
print(hitidlist)
print(len(hitidlist))

90


  assert input.numel() == input.storage().size(), "Cannot convert view " \


[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
90
['O85697', 'Q9K498', 'Q82IV2', 'P29980', 'Q8YN84', 'Q8YMU4', 'A9FDB7', 'A9FZ85', 'A9ERX9', 'A0A0H5B5M5', 'A0A0A0UXK8', 'A0A0A0UXK3', 'Q9AJE5', 'Q5YNS8', 'Q5YUA6', 'O34374', 'Q825I8', 'Q82IY3', 'Q8YN07', 'Q2L6S8', 'Q2L6S5', 'Q59523', 'B5GLM9', 'B5GRC6', 'B5GS25', 'B5GTJ1', 'B5GTJ4', 'B5GW40', 'B5GW47', 'B5GZC3', 'B5H128', 'D5SJG9', 'D5SJH2', 'D5SK11', 'D5SL67', 'E2Q1P0', 'I0UZW1', 'I0V8F6', 'I0V8F7', 'I0V8F8', 'I0V8G0', 'I0V8G7', 'I0V8H0', 'A0A1I5LW45', 'A0A1I5LW58', 'A0A1I5LW72', 'A0A1I5LY42', 'A0A1H5RFX8', 'A0A1H5RGT5', 'C7Q393', 'A0A0H5NIJ4', 'A0A1S9NWT3', 'Q82L47', 'A4FEJ0', 'A4FFH4', 'A4FJG1', 'A0A291SKP4', 'A0A291SNN8', 'A0A291SRD7', 'A0A291SSX6', 'A0A1H4PBG0', 'A0A1H4QIF3', 'A0A1H4W3Y7', 'C9K1X6', 'C9K1X7', 'A0