## prepare datasets

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

In [342]:
# retrieve sequences and species as label
from Bio import SeqIO

infile = 'mature.fa'
# infile = 'hairpin.fa'

def iterate_data(infile:str):
    with open(infile, 'r') as f:
        parser = SeqIO.parse(f, 'fasta')
        for rec in parser:
            seq = str(rec.seq)
            specie = ' '.join(rec.description.split(' ')[2:4])
            yield seq, specie

data_iter = iterate_data(infile)
seq, specie = next(data_iter)
print(seq, specie)

UGAGGUAGUAGGUUGUAUAGUU Caenorhabditis elegans


In [None]:
train_data = []
for seq, specie in iterate_data(infile):
    train_data.append((specie, ''.join(seq)))
print(train_data[0])


In [368]:
class MyEmbedding:
    def __init__(self, data:list):
        self.data = data
        self.train_dataset, self.valid_dataset = None, None
    
    def split(self):
        num_train = round(len(self.data)*.8)
        num_valid = len(self.data) - num_train
        self.train_dataset, self.valid_dataset = random_split(self.data, [num_train, num_valid])
        print(self.train_dataset[0])
        print(len(self.train_dataset), type(self.train_dataset))
        return self.train_dataset, self.valid_dataset

torch.manual_seed(1)
me = MyEmbedding(train_data)
me.split()

([21, 8, 7, 2, 18, 8, 5, 7, 8, 3, 10, 2, 10, 0], [4, 3, 3, 3, 3, 4, 3, 4, 5, 4, 4, 2, 5, 2, 3, 2, 5, 2, 4, 5, 4, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
39108 <class 'torch.utils.data.dataset.Subset'>


(<torch.utils.data.dataset.Subset at 0x75d48c937040>,
 <torch.utils.data.dataset.Subset at 0x75d48c92a880>)

In [344]:
train_dataset = []
for seq, specie in iterate_data(infile):
    train_dataset.append((specie, ''.join(seq)))
print(train_dataset[0])

torch.manual_seed(1)
# args:dataset: list type
num_train = round(len(train_dataset)*.8)
num_valid = len(train_dataset) - num_train
train_dataset, valid_dataset = random_split(train_dataset, [num_train, num_valid])
print(train_dataset[0])
print(len(train_dataset), type(train_dataset))

('Caenorhabditis elegans', 'UGAGGUAGUAGGUUGUAUAGUU')
('Melibe leonina', 'AGGGGAGACAAUCUGUCUACAUG')
39108 <class 'torch.utils.data.dataset.Subset'>


In [345]:
## Step 2 tokenization: unique tokens (words)
from collections import Counter

# slice sentence by word.
def tokenizer(text:str):
    return list(text)
#     step, res = 6, []
#     for i in range(0, len(text)-step+1):
#         res.append(text[i:i+step])
#     return res

# count tokens
token_counts = Counter()
for label, line in train_dataset:
    tokens = tokenizer(line)
    # words in list type
    token_counts.update(tokens)

print('A sentence converted to tokens:', line, tokens)
print('Vocab-size:', len(token_counts))

A sentence converted to tokens: UGUGUGUUCCGCUUCUUCUUU ['U', 'G', 'U', 'G', 'U', 'G', 'U', 'U', 'C', 'C', 'G', 'C', 'U', 'U', 'C', 'U', 'U', 'C', 'U', 'U', 'U']
Vocab-size: 4


In [346]:
# count tokens of output
label_token_counts = Counter()
for label, line in train_dataset:
    # words in list type
    label_token_counts.update([label,])

print('Vocab-size of labels:', len(label_token_counts))

Vocab-size of labels: 262


In [347]:
# sort token couts
from collections import OrderedDict

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
# descending sort counts
ordered_dict = OrderedDict(sorted_by_freq_tuples)
print(ordered_dict)
counts = list(ordered_dict.values())
print('counts:', counts)

OrderedDict([('U', 243537), ('G', 218676), ('A', 205596), ('C', 186588)])
counts: [243537, 218676, 205596, 186588]


In [348]:
label_sorted_by_freq_tuples = sorted(label_token_counts.items(), key=lambda x: x[1], reverse=True)
# descending sort counts
label_ordered_dict = OrderedDict(label_sorted_by_freq_tuples)
# print(label_ordered_dict)
label_counts = list(label_ordered_dict.values())
print('counts:', label_counts)

counts: [2098, 1579, 989, 904, 817, 721, 627, 606, 597, 597, 564, 543, 520, 519, 504, 500, 492, 473, 467, 466, 463, 458, 440, 439, 416, 392, 392, 390, 375, 375, 369, 368, 356, 355, 347, 341, 337, 328, 328, 326, 321, 312, 304, 304, 301, 295, 293, 285, 281, 271, 270, 260, 250, 242, 240, 238, 227, 220, 209, 204, 204, 202, 199, 180, 179, 178, 177, 176, 175, 170, 167, 157, 154, 154, 152, 151, 151, 148, 147, 147, 145, 144, 143, 143, 142, 138, 135, 135, 133, 133, 131, 127, 125, 125, 123, 122, 120, 120, 118, 117, 116, 113, 110, 109, 108, 104, 103, 102, 102, 100, 100, 97, 96, 95, 93, 90, 88, 83, 81, 81, 80, 78, 78, 76, 74, 74, 73, 71, 71, 70, 67, 66, 66, 66, 64, 63, 61, 60, 57, 57, 57, 56, 56, 56, 56, 55, 55, 54, 54, 53, 51, 51, 50, 46, 43, 43, 41, 40, 40, 39, 38, 38, 38, 35, 35, 34, 33, 33, 32, 30, 30, 30, 30, 29, 28, 26, 25, 25, 25, 24, 24, 24, 22, 21, 21, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6,

In [349]:
## Step 3 encoding: encoding each unique token into integers
from torchtext.vocab import vocab

# convert count value to index value (ranking)
input_vocab = vocab(ordered_dict)
input_vocab.insert_token("<pad>", 0)
input_vocab.insert_token("<unk>", 1)
# default token is "<unk>"
input_vocab.set_default_index(1)
print(input_vocab)

Vocab()


In [350]:
# convert count value to index value (ranking)
label_vocab = vocab(label_ordered_dict)
# print(label_ordered_dict)
label_vocab.insert_token("<pad>", 0)
label_vocab.insert_token("<unk>", 1)
# default token is "<unk>"
label_vocab.set_default_index(1)
for s in top_species+ ['<pad>', '<unk>']:
    print(s, label_vocab[s])

Homo sapiens 2
Mus musculus 3
Gallus gallus 4
Monodelphis domestica 5
<pad> 0
<unk> 1


In [351]:
## Step 3-A: define the functions for transformation

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# feed a tokens vector representing one sentence 
text_pipeline = lambda x: [input_vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: label_vocab[x] if x in top_species else 1

# # format label: label = 1, 0
# import torchtext
# from torchtext import __version__ as torchtext_version
# from pkg_resources import parse_version
# if parse_version(torchtext.__version__) > parse_version("0.10"):
#     label_pipeline = lambda x: 1. if x == 2 else 0.         # 1 ~ negative, 2 ~ positive review
# else:
#     label_pipeline = lambda x: 1. if x == 'Homo sapiens' else 0.

In [352]:
## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.float32)
    lengths = torch.tensor(lengths)
    # padding is appended to the end of token vector of each sentence
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
#     print(label_list)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [353]:
train_dataset[0]

('Melibe leonina', 'AGGGGAGACAAUCUGUCUACAUG')

In [354]:
# study dataloader: load dataset into dataloader with collate function
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
print(type(dataloader))

##Observe data dimensions: Take a small batch
text_batch, label_batch, length_batch = next(iter(dataloader))
# one batch = one sentence (218 words)
print('input:', text_batch.shape)
print('output:', label_batch)
print('batch size=', length_batch)

print(text_batch)

<class 'torch.utils.data.dataloader.DataLoader'>
input: torch.Size([4, 23])
output: tensor([1., 1., 1., 1.], device='cuda:0')
batch size= tensor([23, 21, 21, 21], device='cuda:0')
tensor([[4, 3, 3, 3, 3, 4, 3, 4, 5, 4, 4, 2, 5, 2, 3, 2, 5, 2, 4, 5, 4, 2, 3],
        [2, 3, 5, 5, 2, 3, 3, 5, 2, 5, 5, 5, 2, 3, 2, 4, 2, 3, 5, 5, 4, 0, 0],
        [3, 4, 4, 3, 2, 3, 2, 3, 5, 2, 3, 2, 3, 3, 2, 3, 2, 3, 2, 5, 2, 0, 0],
        [2, 4, 2, 2, 5, 3, 4, 3, 5, 5, 4, 4, 2, 4, 4, 3, 2, 2, 5, 3, 3, 0, 0]],
       device='cuda:0')


In [355]:
## Step 4: batching the datasets. shuffle data for each epoch
batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [356]:
class RNN(nn.Module):
    def __init__(self, input_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_dim, padding_idx=0) 
        # model: long-short term memory
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        # Packs a Tensor containing padded sequences of variable length.
        lengths = lengths.cpu().numpy()
        out = nn.utils.rnn.pack_padded_sequence(out, lengths, batch_first=True, enforce_sorted=False)
        # 
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(input_vocab)
print(embedding_dim)
embed_dim = 24
rnn_hidden_size = 32
fc_hidden_size = 32

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)

35


In [357]:
def train(model, dataloader, loss_fn, optimizer):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(model, dataloader, loss_fn, optimizer):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [358]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(model, train_dl, loss_fn, optimizer)
    acc_valid, loss_valid = evaluate(model, valid_dl, loss_fn, optimizer)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
 

Epoch 0 accuracy: 0.8576 val_accuracy: 0.8530
Epoch 1 accuracy: 0.8576 val_accuracy: 0.8530
Epoch 2 accuracy: 0.8576 val_accuracy: 0.8530
Epoch 3 accuracy: 0.8576 val_accuracy: 0.8530
Epoch 4 accuracy: 0.8576 val_accuracy: 0.8530


In [335]:
print(model.embedding)
model.embedding.weight

Embedding(4098, 24, padding_idx=0)


Parameter containing:
tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.1017e+00, -1.7594e-01, -2.2456e+00,  ..., -1.0373e+00,
          1.5748e+00, -6.2985e-01],
        [-9.1478e-01,  5.4887e-01,  4.5846e-02,  ...,  1.1136e-03,
          1.6815e+00, -1.6467e-02],
        ...,
        [ 6.7938e-01,  8.2085e-02,  2.1721e-01,  ...,  2.1112e-01,
          1.5892e+00,  2.2728e-01],
        [ 7.3429e-01, -5.5941e-01, -3.9502e-01,  ..., -4.6106e-02,
          9.6702e-01,  1.0417e+00],
        [-3.5593e-01,  2.0741e+00, -1.5004e-01,  ..., -7.1639e-01,
          1.3424e+00,  5.3942e-01]], device='cuda:0', requires_grad=True)

In [336]:
model.to

<bound method Module.to of RNN(
  (embedding): Embedding(4098, 24, padding_idx=0)
  (rnn): LSTM(24, 32, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=32, out_features=32, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)>

## pre-trained RNN

In [340]:
# dataset
import random

pre_train_dataset = []
for seq, specie in iterate_data(infile):
    pre_train_dataset.append((specie, seq))
    random.shuffle(list(seq))
    pre_train_dataset.append(('pseudo', ''.join(seq)))
print(pre_train_dataset[0])

torch.manual_seed(1)
# args:dataset: list type
num_train = round(len(pre_train_dataset)*.8)
num_valid = len(pre_train_dataset) - num_train
pre_train_dataset, pre_valid_dataset = random_split(pre_train_dataset, [num_train, num_valid])
print(pre_train_dataset[0])
print(len(pre_train_dataset), type(pre_train_dataset))

('Caenorhabditis elegans', 'UGAGGUAGUAGGUUGUAUAGUU')
('pseudo', 'CGGGGCAGCUCAGUACAAGACG')
78216 <class 'torch.utils.data.dataset.Subset'>


In [359]:
## Step 2 tokenization: unique tokens (words)
from collections import Counter

# slice sentence by word.
def tokenizer(text:str):
    return list(text)

# count tokens
pre_token_counts = Counter()
for label, line in pre_train_dataset:
    tokens = tokenizer(line)
    # words in list type
    pre_token_counts.update(tokens)

print('A sentence converted to tokens:', line, tokens)
print('Vocab-size:', len(pre_token_counts))

A sentence converted to tokens: GGUGGAUAUUCCUUCUAUGUUU ['G', 'G', 'U', 'G', 'G', 'A', 'U', 'A', 'U', 'U', 'C', 'C', 'U', 'U', 'C', 'U', 'A', 'U', 'G', 'U', 'U', 'U']
Vocab-size: 4


In [360]:
# count tokens of output
pre_label_token_counts = Counter()
for label, line in pre_train_dataset:
    # words in list type
    pre_label_token_counts.update([label,])

print('Vocab-size of labels:', len(pre_label_token_counts))

Vocab-size of labels: 261


In [361]:
# sort token couts
from collections import OrderedDict

pre_sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
# descending sort counts
pre_ordered_dict = OrderedDict(sorted_by_freq_tuples)
print(pre_ordered_dict)
pre_counts = list(pre_ordered_dict.values())
print('counts:', pre_counts)

OrderedDict([('U', 243537), ('G', 218676), ('A', 205596), ('C', 186588)])
counts: [243537, 218676, 205596, 186588]


In [364]:
pre_label_sorted_by_freq_tuples = sorted(pre_label_token_counts.items(), key=lambda x: x[1], reverse=True)
# descending sort counts
pre_label_ordered_dict = OrderedDict(pre_label_sorted_by_freq_tuples)
# print(label_ordered_dict)
pre_label_counts = list(pre_label_ordered_dict.values())
print('counts:', pre_label_counts)

counts: [39039, 2120, 1598, 1006, 914, 836, 737, 621, 612, 599, 585, 540, 540, 524, 523, 520, 502, 490, 480, 472, 468, 466, 465, 458, 443, 426, 415, 402, 382, 377, 376, 370, 365, 363, 359, 351, 348, 341, 334, 327, 315, 311, 311, 309, 298, 292, 290, 279, 274, 266, 265, 261, 260, 257, 248, 242, 239, 232, 226, 209, 206, 205, 201, 193, 192, 186, 177, 177, 174, 173, 171, 166, 161, 159, 153, 151, 147, 147, 146, 146, 144, 143, 140, 140, 140, 139, 136, 134, 133, 133, 133, 129, 129, 128, 126, 126, 125, 123, 121, 119, 118, 117, 117, 113, 111, 111, 106, 100, 98, 96, 96, 96, 95, 95, 95, 95, 93, 93, 88, 83, 83, 82, 80, 78, 76, 75, 73, 71, 69, 66, 64, 64, 63, 61, 61, 60, 60, 59, 59, 58, 58, 57, 56, 56, 55, 55, 53, 52, 52, 51, 51, 50, 50, 48, 47, 45, 42, 42, 42, 42, 41, 41, 39, 39, 37, 36, 35, 33, 31, 30, 29, 28, 28, 28, 26, 25, 24, 24, 24, 24, 24, 23, 23, 21, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 8, 7, 7, 7, 7, 7, 7, 6, 5, 

In [365]:
## Step 3 encoding: encoding each unique token into integers
from torchtext.vocab import vocab

# convert count value to index value (ranking)
pre_input_vocab = vocab(pre_ordered_dict)
pre_input_vocab.insert_token("<pad>", 0)
pre_input_vocab.insert_token("<unk>", 1)
# default token is "<unk>"
pre_input_vocab.set_default_index(1)
print(pre_input_vocab)

Vocab()


In [None]:
# convert count value to index value (ranking)
pre_label_vocab = vocab(pre_label_ordered_dict)
# print(label_ordered_dict)
label_vocab.insert_token("<pad>", 0)
label_vocab.insert_token("<unk>", 1)
# default token is "<unk>"
label_vocab.set_default_index(1)
for s in top_species+ ['<pad>', '<unk>']:
    print(s, label_vocab[s])

In [308]:
class RNN1(nn.Module):
    def __init__(self, input_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_dim, padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()

    def forward(self, text, lengths):
        out = self.embedding(text)
        # Packs a Tensor containing padded sequences of variable length.
        lengths = lengths.cpu().numpy()
        out = nn.utils.rnn.pack_padded_sequence(out, lengths, batch_first=True, enforce_sorted=False)
        # 
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        return out
         
vocab_size = len(input_vocab)
print(embedding_dim)
embed_dim = 24
rnn_hidden_size = 32
fc_hidden_size = 32

torch.manual_seed(1)
model1 = RNN1(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model1 = model1.to(device)

2355


In [309]:
class RNN2(nn.Module):
    def __init__(self, model1, input_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = model1.embedding
        # model: long-short term memory
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, num_layers=2, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        # Packs a Tensor containing padded sequences of variable length.
        lengths = lengths.cpu().numpy()
        out = nn.utils.rnn.pack_padded_sequence(out, lengths, batch_first=True, enforce_sorted=False)
        # 
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
        
model2 = RNN2(model1, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model2 = model2.to(device)

2355


In [314]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    train(model1, train_dl, loss_fn, optimizer)
    acc_train, loss_train = train(model2, train_dl, loss_fn, optimizer)
    acc_valid, loss_valid = evaluate(model2, valid_dl, loss_fn, optimizer)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
 

Epoch 0 accuracy: 0.8683 val_accuracy: 0.8665
Epoch 1 accuracy: 0.8683 val_accuracy: 0.8665
Epoch 2 accuracy: 0.8683 val_accuracy: 0.8665
Epoch 3 accuracy: 0.8683 val_accuracy: 0.8665
Epoch 4 accuracy: 0.8683 val_accuracy: 0.8665


In [312]:
print(model1.embedding)
model1.embedding.weight

Embedding(4928, 24, padding_idx=0)


Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.1017, -0.1759, -2.2456,  ..., -1.0373,  1.5748, -0.6298],
        [-0.9274,  0.5451,  0.0663,  ..., -0.0075,  1.6734,  0.0103],
        ...,
        [-2.5470,  0.4472,  1.2326,  ..., -1.1731, -0.8936,  0.6546],
        [-1.1430, -1.4456, -1.2113,  ..., -1.2380, -0.1032,  1.8463],
        [-1.4777, -0.0540, -1.4073,  ..., -1.6092,  0.5272,  1.4486]],
       device='cuda:0', requires_grad=True)

In [313]:
print(model2.embedding)
model2.embedding.weight

Embedding(4928, 24, padding_idx=0)


Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.1017, -0.1759, -2.2456,  ..., -1.0373,  1.5748, -0.6298],
        [-0.9274,  0.5451,  0.0663,  ..., -0.0075,  1.6734,  0.0103],
        ...,
        [-2.5470,  0.4472,  1.2326,  ..., -1.1731, -0.8936,  0.6546],
        [-1.1430, -1.4456, -1.2113,  ..., -1.2380, -0.1032,  1.8463],
        [-1.4777, -0.0540, -1.4073,  ..., -1.6092,  0.5272,  1.4486]],
       device='cuda:0', requires_grad=True)

In [22]:
from typing import Iterable
from collections import Counter, OrderedDict
from torchtext.vocab import vocab

class VocabSeq:
    def __init__(self):
        self.input =  Counter()
        self.input_vocab = None
        self.label =  Counter()
        self.label_vocab = None
    
    def vocab(self, data_iter:Iterable):
        # tokenizer
        for seq_nt, labels in data_iter:
            self.input.update(seq_nt)
            self.label.update(labels)
        # encode
        self.encode_input()
        self.encode_output()
        return self.input_vocab, self.label_vocab
    
    def encode_input(self):
        ordered_input = sorted(self.input.items(), key=lambda x: x[1], reverse=True)
        ordered_input = OrderedDict(ordered_input)
        self.input_vocab = vocab(ordered_input)
        self.input_vocab.insert_token("<pad>", 0)
        self.input_vocab.insert_token("<unk>", 1)
        self.input_vocab.set_default_index(1)
    
    def encode_output(self):
        ordered_label = sorted(self.label.items(), key=lambda x: x[1], reverse=True)
        ordered_label = OrderedDict(ordered_label)
        self.label_vocab = vocab(ordered_label)
# 
coder = VocabSeq()
data_iter = iterate_data(infile)
input_vocab, label_vocab = coder.vocab(data_iter)

In [23]:
# split dataset
from torch.utils.data.dataset import random_split
data_iter = iterate_data(infile)
train_data = []
for item in data_iter:
    # input
    input_vector = [input_vocab[i] for i in item[0]]
    input_vector += [0] * (embedding_dim - len(input_vector))
    # label
    label_vector = [label_vocab[i] for i in item[1]]
    #     
    train_data.append((label_vector, input_vector))
    
num_train = round(len(train_data)*.8)
num_valid = len(train_data) - num_train
train_dataset, valid_dataset = random_split(train_data, [num_train, num_valid])

In [27]:
print(len(train_dataset), type(train_dataset))
print(train_dataset[0], len(train_dataset[0]))

39108 <class 'torch.utils.data.dataset.Subset'>
([27, 10, 3, 13, 14, 8, 7, 8, 1, 5, 15, 0, 12, 18, 2, 0, 8], [2, 4, 4, 4, 3, 5, 2, 4, 3, 4, 2, 2, 4, 5, 5, 4, 4, 4, 3, 5, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) 2


In [25]:
## define the functions for transformation

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(_label)
        processed_text = torch.tensor(_text, dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    lengths = torch.tensor(lengths)
    # padding is appended to the end of token vector of each sentence
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list.to(device), label_list.to(device), lengths.to(device)

In [29]:
from torch.utils.data import DataLoader
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)
print(type(dataloader))

##Observe data dimensions: Take a small batch
text_batch, label_batch, length_batch = next(iter(dataloader))
print('input:', text_batch.shape)
print('output:', label_batch)
print('batch size=', length_batch)

print(text_batch)

<class 'torch.utils.data.dataloader.DataLoader'>


ValueError: expected sequence of length 17 at dim 1 (got 19)

In [28]:
## Step 4: batching the datasets. shuffle data for each epoch
batch_size = 32

train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [14]:
class RNN(nn.Module):
    def __init__(self, input_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, embed_dim, padding_idx=0) 
        # model: long-short term memory
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        # Packs a Tensor containing padded sequences of variable length.
        lengths = lengths.cpu().numpy()
        out = nn.utils.rnn.pack_padded_sequence(out, lengths, batch_first=True, enforce_sorted=False)
        # 
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out
         
vocab_size = len(input_vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(device)
    

In [15]:
def train(dataloader, loss_fn, optimizer):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader, loss_fn, optimizer):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [16]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 

torch.manual_seed(1)
 
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl, loss_fn, optimizer)
    acc_valid, loss_valid = evaluate(valid_dl, loss_fn, optimizer)
    print(f'Epoch {epoch} accuracy: {acc_train:.4f} val_accuracy: {acc_valid:.4f}')
 

ValueError: Using a target size (torch.Size([32, 2])) that is different to the input size (torch.Size([32])) is deprecated. Please ensure they have the same size.

In [None]:

# from torch.utils.data import Dataset

# class MirnaDataset(Dataset):
#     def __init__(self, data, labels):
#         self.data = data
#         self.labels = labels
    
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, idx):
#         sample = self.data[idx]
#         label = self.labels[idx]
#         return sample, label
# mirna_dataset = MirnaDataset(train_data, labels)
# print(mirna_dataset)