In [5]:
import datetime

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
data_dir = 'drive/My Drive/'
train_lang = 'en'

In [8]:
class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):
	#open file
        with open(data_dir + train_lang + '.train', 'r') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]
	    #init vocabs of tokens for encoding {<str> token: <int> id}
        self.target_vocab = {'<pad>': 0} # {p: 1, a: 2, r: 3, pu: 4}
        self.word_vocab = {'<pad>': 0} # {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        self.char_vocab = {'<pad>': 0} # {c: 1, a: 2, t: 3, ' ': 4, s: 5}
	    
        # Cat sat on mat. -> [1, 2, 3, 4, 5]
        # p    a  r  p pu -> [1, 2, 3, 1, 4]
        # chars  -> [1, 2, 3, 4, 5, 2, 3, 4]

	    #init encoded sequences lists (processed data)
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        # n=1 because first value is padding
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], # [1, 2, 3, 4, 6] len=5
        }

In [9]:
dataset = DatasetSeq(data_dir)

In [10]:
def collate_fn(batch):
    data = []
    target = []
    for item in batch:
        data.append(torch.as_tensor(item['data']))
        target.append(torch.as_tensor(item['target']))
    data = pad_sequence(data, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)

    return {'data': data, 'target': target}

In [11]:
#hyper params
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
#TODO try to use other model parameters
emb_dim = 256
hidden = 256
n_epochs = 10
batch_size = 64
cuda_device = 0
batch_size = 100
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

GRU

In [12]:
class GRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
    
    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb) # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [13]:
model = GRU(vocab_size, emb_dim, hidden, n_classes).to(device)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [14]:
start = datetime.datetime.now()

for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
    GRU_train_time = datetime.datetime.now() - start
   
    torch.save(model.state_dict(), f'./rnn_chkpt_{epoch}.pth')
print (f'Время тренировки GRU = {GRU_train_time}')

epoch: 0, step: 0, loss: 3.101123809814453
epoch: 0, step: 100, loss: 0.12137915194034576
epoch: 0, step: 200, loss: 0.1846797913312912
epoch: 1, step: 0, loss: 0.1391931176185608
epoch: 1, step: 100, loss: 0.12494346499443054
epoch: 1, step: 200, loss: 0.14689116179943085
epoch: 2, step: 0, loss: 0.12273569405078888
epoch: 2, step: 100, loss: 0.13538816571235657
epoch: 2, step: 200, loss: 0.13304944336414337
epoch: 3, step: 0, loss: 0.08025465905666351
epoch: 3, step: 100, loss: 0.11756011098623276
epoch: 3, step: 200, loss: 0.08388648182153702
epoch: 4, step: 0, loss: 0.03305644914507866
epoch: 4, step: 100, loss: 0.06252439320087433
epoch: 4, step: 200, loss: 0.06028808280825615
epoch: 5, step: 0, loss: 0.05325816944241524
epoch: 5, step: 100, loss: 0.04236844927072525
epoch: 5, step: 200, loss: 0.056960202753543854
epoch: 6, step: 0, loss: 0.05091463401913643
epoch: 6, step: 100, loss: 0.04334740340709686
epoch: 6, step: 200, loss: 0.03664012253284454
epoch: 7, step: 0, loss: 0.039

In [29]:
phrase = 'I am doing my homework right now'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    GRU_inference_time = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print (f'Время инференса GRU = {GRU_inference_time}')

['PRON', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADV', 'ADV']
Время инференса GRU = 0:00:00.002574


LSTM

In [25]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
    
    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb) # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [26]:
model = LSTM(vocab_size, emb_dim, hidden, n_classes).to(device)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [27]:
start = datetime.datetime.now()

for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
    LSTM_train_time = datetime.datetime.now() - start
   
    torch.save(model.state_dict(), f'./rnn_chkpt_{epoch}.pth')
print (f'Время тренировки LSTM = {LSTM_train_time}')

epoch: 0, step: 0, loss: 2.9216668605804443
epoch: 0, step: 100, loss: 0.26620611548423767
epoch: 0, step: 200, loss: 0.24024461209774017
epoch: 1, step: 0, loss: 0.19296017289161682
epoch: 1, step: 100, loss: 0.17230208218097687
epoch: 1, step: 200, loss: 0.0988345742225647
epoch: 2, step: 0, loss: 0.20711670815944672
epoch: 2, step: 100, loss: 0.14550676941871643
epoch: 2, step: 200, loss: 0.10573887079954147
epoch: 3, step: 0, loss: 0.09940575063228607
epoch: 3, step: 100, loss: 0.09124055504798889
epoch: 3, step: 200, loss: 0.0925726443529129
epoch: 4, step: 0, loss: 0.06963233649730682
epoch: 4, step: 100, loss: 0.08488699048757553
epoch: 4, step: 200, loss: 0.05616848170757294
epoch: 5, step: 0, loss: 0.06587868183851242
epoch: 5, step: 100, loss: 0.05797233805060387
epoch: 5, step: 200, loss: 0.05371752753853798
epoch: 6, step: 0, loss: 0.057022180408239365
epoch: 6, step: 100, loss: 0.05939161032438278
epoch: 6, step: 200, loss: 0.04490207880735397
epoch: 7, step: 0, loss: 0.03

In [28]:
phrase = 'I am doing my homework right now'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    LSTM_inference_time = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print (f'Время инференса LSTM = {GRU_inference_time}')

['PRON', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADV', 'ADV']
Время инференса LSTM = 0:00:00.002180


RNN

In [31]:
class RNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
    
    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb) # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [32]:
model = RNN(vocab_size, emb_dim, hidden, n_classes).to(device)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [33]:
start = datetime.datetime.now()

for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
    RNN_train_time = datetime.datetime.now() - start
   
    torch.save(model.state_dict(), f'./rnn_chkpt_{epoch}.pth')
print (f'Время тренировки LSTM = {RNN_train_time}')

epoch: 0, step: 0, loss: 2.785332441329956
epoch: 0, step: 100, loss: 0.19553081691265106
epoch: 0, step: 200, loss: 0.1667640507221222
epoch: 1, step: 0, loss: 0.1786234974861145
epoch: 1, step: 100, loss: 0.11910043656826019
epoch: 1, step: 200, loss: 0.14456583559513092
epoch: 2, step: 0, loss: 0.115811787545681
epoch: 2, step: 100, loss: 0.1400361955165863
epoch: 2, step: 200, loss: 0.12958604097366333
epoch: 3, step: 0, loss: 0.09612379968166351
epoch: 3, step: 100, loss: 0.06782341003417969
epoch: 3, step: 200, loss: 0.11805868148803711
epoch: 4, step: 0, loss: 0.07453262060880661
epoch: 4, step: 100, loss: 0.08563405275344849
epoch: 4, step: 200, loss: 0.06428519636392593
epoch: 5, step: 0, loss: 0.06816691905260086
epoch: 5, step: 100, loss: 0.09515038132667542
epoch: 5, step: 200, loss: 0.07228513062000275
epoch: 6, step: 0, loss: 0.04631855711340904
epoch: 6, step: 100, loss: 0.043122440576553345
epoch: 6, step: 200, loss: 0.06853370368480682
epoch: 7, step: 0, loss: 0.036460

In [34]:
phrase = 'I am doing my homework right now'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start = datetime.datetime.now()
with torch.no_grad():
    model.eval()
    predict = model(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    RNN_inference_time = datetime.datetime.now() - start

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print (f'Время инференса RNN = {RNN_inference_time}')

['PRON', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADV', 'ADV']
Время инференса RNN = 0:00:00.002222


RNN with char input

In [17]:
def collate_fn(input_data):
    data = []
    chars = []
    targets = []
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(len(input_data))] for _ in range(max_len)]
    for j in range(len(input_data)):
        for i in range(max_len):
            if len(chars[j]) > i:
                chars_seq[i][j] = torch.as_tensor(chars[j][i])
    for j in range(max_len):
        chars_seq[j] = pad_sequence(chars_seq[j], batch_first=True, padding_value=0)
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {
        'data': data, # B x T
        'chars': chars_seq, # List[tensor];   tensor B x word_len; len(chars_seq) = n_words =  T
        'target': targets}

In [18]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.char_emb(x)
        _, out = self.rnn(emb) # 1 x B x Hid

        return out.squeeze().unsqueeze(1) # B x 1 x Hid

In [19]:
class CharGRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes, 
                 char_vocab, char_emb, char_hidden):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim + char_hidden, hidden_dim, batch_first=True)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.char_rnn = CharRNN(char_vocab, char_emb, char_hidden)
    
    def forward(self, x, chars):
        emb = self.word_emb(x) # B x T x Emb_dim
        char_feat = [self.char_rnn(c.to(x.device)) for c in chars] 
        char_feat = torch.cat(char_feat, dim=1) # B x T x Hid_char
        emb = torch.cat((emb, char_feat), dim=-1)
        hidden, _ = self.rnn(emb) # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [20]:
#hyper params
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
#TODO try to use other model parameters
emb_dim = 256
hidden = 256
char_hid = 64
char_emb = 32
n_epochs = 10
batch_size = 64
cuda_device = 0
batch_size = 100
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

In [21]:
model = CharGRU(vocab_size, emb_dim, hidden, n_classes, n_chars, char_emb, char_hid).to(device)
model.train()
optim = torch.optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [23]:
start = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model(batch['data'].to(device), batch['chars'])
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')

    CharGRU_train_time = datetime.datetime.now() - start
   
    torch.save(model.state_dict(), f'./rnn_chkpt_{epoch}.pth')
print (f'Время тренировки GRU на буквах = {CharGRU_train_time}' )

epoch: 0, step: 0, loss: 0.10503656417131424
epoch: 0, step: 100, loss: 0.07406461983919144
epoch: 0, step: 200, loss: 0.07979340106248856
epoch: 1, step: 0, loss: 0.05045900121331215
epoch: 1, step: 100, loss: 0.07383377104997635
epoch: 1, step: 200, loss: 0.043683238327503204
epoch: 2, step: 0, loss: 0.06219891086220741
epoch: 2, step: 100, loss: 0.06995214521884918
epoch: 2, step: 200, loss: 0.09226678311824799
epoch: 3, step: 0, loss: 0.04853523150086403
epoch: 3, step: 100, loss: 0.041543953120708466
epoch: 3, step: 200, loss: 0.046959348022937775
epoch: 4, step: 0, loss: 0.01868255063891411
epoch: 4, step: 100, loss: 0.038568202406167984
epoch: 4, step: 200, loss: 0.04018765687942505
epoch: 5, step: 0, loss: 0.030659547075629234
epoch: 5, step: 100, loss: 0.04228643327951431
epoch: 5, step: 200, loss: 0.019917916506528854
epoch: 6, step: 0, loss: 0.0328444167971611
epoch: 6, step: 100, loss: 0.03597190976142883
epoch: 6, step: 200, loss: 0.04313971474766731
epoch: 7, step: 0, los