In [22]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
# import sys
import json
import operator
import numpy as np

In [39]:
path_root = ''
path_to_data = path_root + 'data/'

d = 20 # dimensionality of amino acid embeddings
n_units = 100 # RNN layer dimensionality
drop_rate = 0.3 # dropout
input_size = (4888, 989, 20)

padding_idx = 0
oov_idx = 1
batch_size = 32
nb_epochs = 10
my_patience = 2 # for early stopping strategy
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device :", device)

Device : cuda


In [40]:
# Load files for ohe
graph_indicator = np.loadtxt("graph_indicator.txt", dtype=np.int64)
nodes = np.loadtxt("node_attributes.txt", delimiter=",")

In [41]:
amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

def create_dicts(sequence=amino_acids):
    """
    Create the dicts for the sequence embedding
    """
    word_to_index = dict(zip(sequence, range(1,21)))
    # invert mapping
    index_to_word =  {v : k for k, v in word_to_index.items()}
    return word_to_index, index_to_word

word_to_index, index_to_word = create_dicts()

In [42]:
# Read sequences
sequences = list()
with open('sequences.txt', 'r') as f:
    for line in f:
        sequences.append(line[:-1])

# Split data into training and test sets
sequences_train = list()
sequences_test = list()
train_ohe = list()
test_ohe = list()
proteins_test = list()
y_train = list()
with open('graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        ohe_vec = torch.Tensor([node[3:23] for node in nodes[np.where(graph_indicator==i)]])
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            sequences_test.append(sequences[i])
            test_ohe.append(ohe_vec)
            
        else:
            sequences_train.append(sequences[i])
            y_train.append(int(t[1][:-1]))
            train_ohe.append(ohe_vec)



train_ohe = pad_sequence(train_ohe).permute(1, 0, 2).long()
test_ohe = pad_sequence(test_ohe).permute(1, 0, 2).long()
pad_ = (0, 0, 0, 79)
test_ohe = F.pad(test_ohe, pad_, "constant", 0)
y_train = F.one_hot(torch.Tensor(y_train).long())

In [43]:
# Desired max length
max_len = 50

# 100 seqs of variable length (< max_len)
seq_lens = torch.randint(low=10,high=44,size=(100,))
seqs = [torch.rand(n) for n in seq_lens]
print(seqs[0].size(), len(seqs))
# pad first seq to desired length
seqs[0] = nn.ConstantPad1d((0, max_len - seqs[0].shape[0]), 0)(seqs[0])

# pad all seqs to desired length
seqs = pad_sequence(seqs)
print(seqs.size())

torch.Size([36]) 100
torch.Size([50, 100])


In [44]:
import numpy
import torch
from torch.utils.data import DataLoader, Dataset


class Dataset_(Dataset):
    def __init__(self, x, y):
        self.documents = x
        self.labels = y

    def __len__(self):
        return len(self.documents)

    def __getitem__(self, index):
        document = self.documents[index]
        label = self.labels[index] 
        sample = {
            "document": torch.tensor(document),
            "label": torch.tensor(label),
            }
        return sample


def get_loader(x, y, batch_size=32):
    dataset = Dataset_(x, y)
    data_loader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            pin_memory=True,
                            drop_last=True,
                            )
    return data_loader

In [45]:
class ProtCNN(nn.Module):    
    def __init__(self, index_to_word, embed_dim, dropout=0.5):
        super(ProtCNN, self).__init__()
#         self.embedding = nn.Embedding(num_embeddings=len(index_to_word)+2,
#                                           embedding_dim=d)
        self.bn1 = nn.BatchNorm1d(128)
        self.bn2 = nn.BatchNorm1d(128)
        self.dropout = nn.Dropout(dropout)
        self.conv1 = nn.Conv1d(in_channels=989, out_channels=128, kernel_size=1, stride=1, dilation=1, padding='same')
        self.conv2 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=1, stride=1, dilation=2, padding='same')
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=3, stride=1, dilation=3, padding='same')
        self.maxpool1d = nn.MaxPool1d(kernel_size=3, stride=None, padding=0, dilation=1, return_indices=False, ceil_mode=False)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, 18)
        
        
    def residual_block(self, x_in):
        """
        _data: input
        _filters: convolution filters
        _d_rate: dilation rate
        """

        shortcut = x_in
        
        x = self.bn1(x_in)
        x = self.relu(x)
        x = self.conv2(x)

        #bottleneck convolution
        x = self.bn2(x)
        x = self.relu(x)
        out = self.conv3(x)

        #skip connection
        out += shortcut
        return x
    
    def forward(self, x_in):
#       x = self.embedding(x_in)

        x = self.conv1(x_in)
        x = self.residual_block(x)
        print(x.size())
        #x = residual_block(x)
        x = self.maxpool1d(x)
        print(x.size())
        out = self.dropout(x)
       
        # softmax classifier
        out =  F.log_softmax(out, dim=1)
        
        return out

In [46]:
a = torch.ones(2, 4)
b = torch.ones(4, 4)
c = torch.ones(7, 4)
print(a, b, c)
print(pad_sequence([a, b, b]))

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]]) tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]) tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])
tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[0., 0., 0., 0.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[0., 0., 0., 0.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])


In [47]:
from tqdm import tqdm
model = ProtCNN(index_to_word, n_units).to(device)
model = model.double()
lr = 0.001  # learning rate
criterion = nn.CrossEntropyLoss()# fill the gap, use Binary cross entropy from torch.nn: https://pytorch.org/docs/stable/nn.html#loss-functions
optimizer = torch.optim.Adam(model.parameters(), lr=lr) #fill me

def train(x_train=train_ohe,
          y_train=y_train,
          x_test=test_ohe,
          word_dict=index_to_word,
          batch_size=batch_size):
  
    train_data = get_loader(x_train, y_train, batch_size)

    best_loss = np.inf
    p = 0 # patience

    for epoch in range(1, nb_epochs + 1): 
        losses = []
        accuracies = []
        with tqdm(train_data, unit="batch") as tepoch:
            for idx, data in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")
                model.train()
                optimizer.zero_grad()
                input = data['document'].to(device, dtype=torch.double)
                label = data['label'].to(device)
                label = label.double()
                output = model.forward(input)[0]
                print(output.size(), label.size())
                loss = criterion(output, label) # fill the gap # compute the loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5) # prevent exploding gradient 
                optimizer.step()

                losses.append(loss.item())
#                 accuracy = torch.sum(torch.round(output) == label).item() / batch_size
#                 accuracies.append(accuracy)
#                 tepoch.set_postfix(loss=sum(losses)/len(losses), accuracy=100. * sum(accuracies)/len(accuracies))

#         train_acc = evaluate_accuracy(train_data, False)
        #test_acc = evaluate_accuracy(test_data, False)
        print("===> Epoch {} Complete: Avg. Loss: {:.4f}"
              .format(epoch, sum(losses)/len(losses)))
        train_loss = sum(losses)/len(losses)
        if train_loss <= best_loss:
            best_loss = train_loss
            print("Train Loss improved, saving model...")
            torch.save(model.state_dict(), './best_model.pt')
            p = 0
#         else:
#             p += 1
#             if p==my_patience:
#                 print("Validation accuracy did not improve for {} epochs, stopping training...".format(my_patience))
#     print("Loading best checkpoint...")    
#     model.load_state_dict(torch.load('./best_model.pt'))
#     model.eval()
    print('done.')

train()

  "document": torch.tensor(document),
  "label": torch.tensor(label),
Epoch 1:   0%|                                                                               | 0/152 [00:00<?, ?batch/s]

torch.Size([32, 128, 20])
torch.Size([32, 128, 6])
torch.Size([128, 6]) torch.Size([32, 18])





ValueError: Expected input batch_size (128) to match target batch_size (32).