In [43]:
import torch
import numpy as np
from torch.utils.data import TensorDataset, DataLoader

In [44]:
with np.load('/content/drive/My Drive/DL data/RNN/rnn-challenge-data.npz') as fs:
    data_x = fs['data_x']
    data_y = fs['data_y']
    valid_x = fs['val_x']
    valid_y = fs['val_y']
    test_x = fs['test_x']

for i in [data_x, data_y, valid_x, valid_y, test_x]:
    print(i.shape)

(400,)
(400,)
(100,)
(100,)
(250,)


In [45]:
print(data_x[:5])
print(data_y[:5])

['CTAGCTGAGCTACTGAGCTACAGTTGACTGACCAGTCAGTGCTAGCTACTGACAGTCTGACAGTTGACCTGACTGATGACCAGTCTAGCAGTGCTACTAGCTAGGCTACAGTCAGTTGACCAGTCTGACAGTCAGTCTGACTGACAGTCAGTCTAGGCTATGACCTGACTGATGACCTGACTGACTGACAGTCTGACTGATGACGCTATGACCTGACTAGCTAGCAGTTGACTGACCTGACAGTGCTACTAGCAGTTGACCAGTGCTACAGTCTGATGACTGACCTGACAGTCTAGGCTACAGTTGACCTGACAGTCAGTGCTACTGACAGTCTAGTGACCAGTCAGTCAGTTGACCTGACTAGCAGTTGACGCTATGACCAGTCTGACAGTGCTACTAG'
 'TGACGCTAGCTAGCTAAGTCACGTTGACAGTCACGTACGTAGTCACGTAGTCGCTAGCTAAGTCACGTGCTACAGTGCTATGACAGTCGCTAACGTTGACACGTACGTTGACCAGTGCTAGCTAGCTAGCTAACGTCAGTCAGTGCTATGACGCTAGCTATGACTGACGCTAACGTTGACTGACAGTCACGTAGTCAGTCACGTAGTCAGTCGCTAGCTAGCTAAGTCACGTCAGTACGTGCTAACGTTGACAGTCCAGTCAGTAGTCACGTACGTGCTAAGTCGCTAAGTCAGTCAGTCACGTACGTGCTAAGTCACGTCAGTCAGTAGTCAGTCACGTACGTACGTCAGTACGTAGTCCAGTGCTAGCTAACGTTGACGCTAACGTACGTAGTCACGT'
 'AGTCACGTAGTCAGTCGCTAGCTAACGTACGTCAGTGCTACAGTCAGTCAGTGCTAGCTAGCTAAGTCACGTTGACGCTACAGTCAGTCAGTACGTCAGTAGTCACGTTGACCAGTCAGTAGTCAGTCACGTTGACGCTAACGTAGTCAGTCACGTAGTCAGTCCAGTCAGTGCTAACGTAGTCGCTATG

In [46]:
vocab = ['A', 'C', 'G', 'T']   

def one_hot(vocab, seq):       # one hot encode train_x, val_x and test_x
    eye = np.eye(4)
    return np.array([eye[vocab.index(i)] for i in seq])

In [47]:
train_x = np.array([one_hot(vocab, i) for i in data_x])
val_x = np.array([one_hot(vocab, i) for i in valid_x])
test_x = np.array([one_hot(vocab, i) for i in test_x])
print(train_x.shape)
print(val_x.shape)
print(test_x.shape)
train_x = torch.Tensor(train_x)
val_x = torch.Tensor(val_x)
train_y = torch.Tensor(data_y)
val_y = torch.Tensor(valid_y)
test_x = torch.Tensor(test_x)

(400, 400, 4)
(100, 1200, 4)
(250, 2000, 4)


In [48]:
batch_size = 50
val_batch_size = 100
vocab = ["A", "C", "G", "T"]
n_classes = 5
n_hidden = 256

class LSTM(torch.nn.Module):
    def __init__(self, vocab, n_classes, n_hidden=256, n_layers=2):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.vocab = vocab
        self.rnn = torch.nn.LSTM(input_size=len(vocab), hidden_size=n_hidden, num_layers=n_layers, batch_first=True)
        self.fc = torch.nn.Linear(n_hidden, n_classes)
    
    def forward(self, x, hidden):
        out, hidden = self.rnn(x, hidden)
        return self.fc(out), hidden

    def init_hidden(self, batch_size):
        return torch.zeros(2, self.n_layers, batch_size, self.n_hidden)

net = LSTM(vocab=vocab, n_classes=n_classes)

In [49]:
train = TensorDataset(train_x, train_y)
trainloader = DataLoader(train, batch_size=batch_size, shuffle=True)

val = TensorDataset(val_x,val_y)
valloader = DataLoader(val, batch_size=val_batch_size, shuffle=False)

In [50]:
optim = torch.optim.Adam(net.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()

In [51]:
def stretch_flat(tensor, times):           # used for labels in loss calculation
    assert len(tensor.shape) == 1
    return tensor.unsqueeze(0).repeat(times,1).T.flatten()

In [52]:
epochs = 5

for epoch in range(epochs):
    for i, data in enumerate(trainloader):
        h = net.init_hidden(batch_size)
        h = tuple([each.data for each in h])
        inputs, labels = data
        net.zero_grad()
        pred, h = net(inputs, h)
        loss = loss_fn(pred.view(-1, n_classes), stretch_flat(labels, times=400).long())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), 5)
        optim.step()

    print(f'iteration:{epoch}\tCross Entropy Loss:{loss.item()}')


iteration:0	Cross Entropy Loss:1.6081641912460327
iteration:1	Cross Entropy Loss:1.6095649003982544
iteration:2	Cross Entropy Loss:1.6092818975448608
iteration:3	Cross Entropy Loss:1.6106510162353516
iteration:4	Cross Entropy Loss:1.6108566522598267


In [53]:
train_acc = 0
train_accuracy = []

with torch.no_grad():
    for data in trainloader:
        h = net.init_hidden(batch_size)
        h = tuple([each.data for each in h])
        inputs, labels = data
        output, h = net(inputs, h)
        output = torch.nn.LogSoftmax(dim=0)(output)
        #print(output[0])
        sums = output.sum(axis=1)
        #print(sums.shape)
        pred = sums.max(axis=1).indices
        train_acc += len((pred == labels).nonzero())
    train_accuracy.append(train_acc / len(data_x))

In [54]:
print(train_accuracy[-1])

0.905


In [55]:
val_acc = 0
val_accuracy = []

with torch.no_grad():
    for data in valloader:
        h = net.init_hidden(val_batch_size)
        h = tuple([each.data for each in h])
        inputs, labels = data
        output, h = net(inputs, h)
        output = torch.nn.LogSoftmax(dim=0)(output)
        sums = output.sum(axis=1)
        pred = sums.max(axis=1).indices
        val_acc += len((pred == labels).nonzero())
    val_accuracy.append(val_acc / len(val_x))

In [56]:
print(val_accuracy[-1])

0.95
