In [25]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Seed PyTorch's global RNG. Note that the training cell further down calls
# torch.manual_seed(RANDOM_SEED) again, replacing this seed from that point on.
torch.manual_seed(1)

# Reload all changed modules, so edits to the local helper modules imported
# below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
from my_data import MyData
from my_collate import MyCollate
from my_embedding import MyEmbedding
from my_process import MyProcess
from my_parser import MyParser
from model_bert import ModelBert

In [27]:
# Prepare the dataset: build the full collection of samples through MyData.
md = MyData()
train_data = md.get_full_data()
# Quick sanity check of the size and of one sample. Later cells read each
# sample as an indexable pair: item[0] is a source/kind tag and item[1] is the
# sequence text.
print('Total number of data:', len(train_data))
print('Observe data:', train_data[0])

Number of origin seq: 48882
Number of trim seq: 683174
Number of shuffle seq: 48882
Number of 5srrnadb.fasta: 11415
Number of gtrnadb.fasta: 236835
Number of pirbase.fasta: 219278
Total number of data: 1146682
Observe data: ('Caenorhabditis elegans', 'TGAGGTAGTAGGTTGTATAGTT')


In [28]:
me = MyEmbedding(train_data)
train_dataset, valid_dataset = me.split()

# Kinds (field 0 of a sample) that make up the negative class.
_NEGATIVE_KINDS = ('shuffle', 'random', 'other')


def _binary_label(sample):
    """Return 0 when the sample's kind tag is a negative kind, otherwise 1."""
    if sample[0] in _NEGATIVE_KINDS:
        return 0
    return 1


# Field 1 of every sample is the sequence text fed to the tokenizer.
train_texts = [sample[1] for sample in train_dataset]
train_labels = [_binary_label(sample) for sample in train_dataset]

valid_texts = [sample[1] for sample in valid_dataset]
valid_labels = [_binary_label(sample) for sample in valid_dataset]

# Spot-check the first label of each split.
print(train_labels[0], valid_labels[0])

('other', 'TNGGCAGCGTGGTTCCTGTTGGTGAGCTCT')
917346 <class 'torch.utils.data.dataset.Subset'>
0 1


In [29]:
from transformers import DistilBertTokenizerFast
# Load the fast tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
# NOTE(review): the texts tokenized in the next cell are nucleotide sequences,
# while this vocabulary comes from a general pretrained checkpoint — confirm
# that the resulting sub-word split is what is intended.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
# Bare expression so the notebook shows the tokenizer's repr as the cell output.
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [30]:
# Tokenize each split in a single call. `train_texts` / `valid_texts` are
# already plain lists, so they are passed directly instead of being copied
# with list(...) first.
# padding=True pads every sequence to the longest one within the same call, so
# the two splits are padded independently of each other; truncation=True caps
# sequences at the tokenizer's maximum length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True)


In [31]:
# Display the encoding of the first training text as a quick sanity check.
train_encodings[0]

Encoding(num_tokens=57, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [32]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    `encodings` must expose `.items()` yielding (field name, per-example
    sequence) pairs; `labels` is an indexable sequence with one entry per
    example. The dataset length is the number of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position `idx`, as a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # The target is stored under 'labels' alongside the encoding fields.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the tokenized splits so a DataLoader can batch them.
# NOTE: this rebinds `train_dataset` / `valid_dataset`, which until now held the
# raw splits returned by `me.split()`. Re-running the cell that builds the
# texts and labels after this one would iterate these wrapped datasets instead.
train_dataset = MyDataset(train_encodings, train_labels)
valid_dataset = MyDataset(valid_encodings, valid_labels)

In [33]:
# Mini-batch size shared by both loaders; defined once so the two cannot drift apart.
BATCH_SIZE = 16

# Training batches are reshuffled on every pass; validation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [34]:
# Pull a single batch from the training loader and display it as a sanity check.
next(iter(train_loader))

{'input_ids': tensor([[  101,  1056, 13512,  6593, 18195,  9468, 18195,  4017, 18195,  6593,
          18195,  2278,   102,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101, 22975, 19629, 13512,  9468,  2102, 18195,  5946, 13871,  2102,
           3654,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0],
         [  101, 11937, 11057,  8490,  3540,  8490,  3540, 11057,  

In [35]:
from transformers import DistilBertForSequenceClassification

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the pretrained checkpoint with a sequence-classification head on top.
# The head's weights are not part of the checkpoint (see the warning printed
# below this cell), so the model has to be fine-tuned before use.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

# Adam over all model parameters with a fixed learning rate of 5e-5.
optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    Args:
        model: callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under the ``'logits'`` key.
        data_loader: iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor equal to ``100 * correct / total``. When
        ``data_loader`` yields no examples the accuracy is undefined and a NaN
        tensor is returned (previously this case crashed on ``int.float()``).

    Note:
        The model's train/eval mode is left untouched; callers that want
        evaluation-mode behaviour must call ``model.eval()`` beforehand.
    """
    correct_pred, num_examples = 0, 0

    # Gradients are never needed for scoring.
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs['logits']

            # The predicted class is the index of the highest score per example.
            predicted_labels = torch.argmax(logits, 1)
            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    if num_examples == 0:
        # Nothing was scored: report "undefined" instead of failing or faking 0%.
        return torch.tensor(float('nan'), device=device)

    return correct_pred.float() / num_examples * 100

In [38]:
import time
start_time = time.time()

# Reproducibility and run configuration.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
# Computed the same way as the `device` the model was moved to earlier, so
# batches and model end up on the same device.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 3
LOG_EVERY = 250  # print the current batch loss once every LOG_EVERY batches


for epoch in range(NUM_EPOCHS):

    model.train()

    for batch_idx, batch in enumerate(train_loader):

        ### Prepare data
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        ### Forward
        # Passing `labels` makes the model return the loss with its outputs.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']

        ### Backward
        optim.zero_grad()
        loss.backward()
        optim.step()

        ### Logging
        if batch_idx % LOG_EVERY == 0:
            print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
                  f'Batch {batch_idx:04d}/{len(train_loader):04d} | '
                  f'Loss: {loss.item():.4f}')

    # Score both splits in eval mode. compute_accuracy disables gradient
    # tracking itself, so no extra no-grad context is needed here.
    model.eval()
    train_acc = compute_accuracy(model, train_loader, DEVICE)
    valid_acc = compute_accuracy(model, valid_loader, DEVICE)
    print(f'Training accuracy: '
          f'{train_acc:.2f}%'
          f'\nValid accuracy: '
          f'{valid_acc:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
# TODO: report test accuracy once a `test_loader` exists; no such loader is
# defined in this notebook, and referencing it raises NameError.

Epoch: 0001/0003 | Batch 0000/114669 | Loss: 0.6973
Epoch: 0001/0003 | Batch 0250/114669 | Loss: 0.0566
Epoch: 0001/0003 | Batch 0500/114669 | Loss: 0.7737
Epoch: 0001/0003 | Batch 0750/114669 | Loss: 0.0361
Epoch: 0001/0003 | Batch 1000/114669 | Loss: 0.0278
Epoch: 0001/0003 | Batch 1250/114669 | Loss: 0.0405
Epoch: 0001/0003 | Batch 1500/114669 | Loss: 0.3690
Epoch: 0001/0003 | Batch 1750/114669 | Loss: 0.0416
Epoch: 0001/0003 | Batch 2000/114669 | Loss: 0.0383
Epoch: 0001/0003 | Batch 2250/114669 | Loss: 0.0205
Epoch: 0001/0003 | Batch 2500/114669 | Loss: 0.0491
Epoch: 0001/0003 | Batch 2750/114669 | Loss: 0.4311
Epoch: 0001/0003 | Batch 3000/114669 | Loss: 0.0934
Epoch: 0001/0003 | Batch 3250/114669 | Loss: 0.3605
Epoch: 0001/0003 | Batch 3500/114669 | Loss: 0.6571
Epoch: 0001/0003 | Batch 3750/114669 | Loss: 0.0362
Epoch: 0001/0003 | Batch 4000/114669 | Loss: 0.0430
Epoch: 0001/0003 | Batch 4250/114669 | Loss: 0.0526
Epoch: 0001/0003 | Batch 4500/114669 | Loss: 0.8403
Epoch: 0001/

NameError: name 'test_loader' is not defined