In [50]:
import torch
import torch.nn as nn
from torch.optim import SGD 
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [73]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [74]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [75]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [76]:
#word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [77]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [62]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [70]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [71]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

73.06047058105469
Clip gradient :  7.437175003988109
58.754878997802734
Clip gradient :  6.7159120324765595
45.875450134277344
Clip gradient :  8.084153846317244
42.543846130371094
Clip gradient :  21.162578326745187
31.38055992126465
Clip gradient :  5.894581662237843
25.658594131469727
Clip gradient :  5.735804175426364
22.810142517089844
Clip gradient :  5.3126005054326315
20.45781898498535
Clip gradient :  6.864212647914329
17.558513641357422
Clip gradient :  8.02025901609971
17.57586669921875
Clip gradient :  9.743617080932442


# Тестирование

In [72]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 oasdasdasdasdasdasdasdasda
Original:	 ololoasdasddqweqw123456789


AssertionError: 

# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [180]:
#тестовое слово
#word = 'ololoasdasddqweqw123456789'
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [181]:
class LSTM(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(LSTM, self).__init__()  
        self.x2hidden_input = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden_input = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2hidden_forget = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden_forget = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2hidden_output = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden_output = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2cand_cell_state = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.cand_cell_state = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        

        self.sigmoid = nn.Sigmoid()
        self.activation  = nn.Tanh()
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        
        candidate = self.activation(self.x2cand_cell_state(x) + self.cand_cell_state(prev_hidden))
        input_gate = self.sigmoid(self.x2hidden_output(x) + self.hidden_input(prev_hidden))
        forget_gate = self.sigmoid(self.x2hidden_forget(x) + self.hidden_forget(prev_hidden))
        output_gate = self.sigmoid(self.x2hidden_output(x) + self.hidden_output(prev_hidden))
        
        cell_state = forget_gate*prev_hidden + input_gate*candidate
        hidden = output_gate*self.activation(cell_state)
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

In [188]:
ds = WordDataSet(word=word)
rnn = LSTM(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 800
optim     = SGD(rnn.parameters(), lr = 0.0035, momentum=0.9)

In [189]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden_input.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

72.8020248413086
Clip gradient :  3.5050267784786002
72.12062072753906
Clip gradient :  3.243286563539369
71.06831359863281
Clip gradient :  2.7817593259763482
70.08607482910156
Clip gradient :  2.2690146762808205
69.29735565185547
Clip gradient :  1.7985476193503322
68.69398498535156
Clip gradient :  1.4677829324651221
68.20228576660156
Clip gradient :  1.373507118804837
67.71556091308594
Clip gradient :  1.524025675979223
67.138427734375
Clip gradient :  1.8150381609253683
66.42091369628906
Clip gradient :  2.1356243233334915
65.55620574951172
Clip gradient :  2.422937258518237
64.56068420410156
Clip gradient :  2.658914121781265
63.455894470214844
Clip gradient :  2.8499409435834053
62.25980758666992
Clip gradient :  3.0069306059569136
60.987300872802734
Clip gradient :  3.1309170176923415
59.654930114746094
Clip gradient :  3.218194544090391
58.2811393737793
Clip gradient :  3.27142256625065
56.88232421875
Clip gradient :  3.300041300229083
55.4701042175293
Clip gradient :  3.31363

In [190]:
rnn.eval()
hh = torch.zeros(rnn.hidden_input.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


In [306]:
ds = WordDataSet(word=word)
rnn = LSTM(in_size=ds.vec_size, hidden_size=13, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = SGD(rnn.parameters(), lr = 0.0035, momentum=0.9)

In [307]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden_input.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

71.98316192626953
Clip gradient :  3.727144909833264
71.2453384399414
Clip gradient :  3.403254159518163
70.13980102539062
Clip gradient :  2.842741248359161
69.15157318115234
Clip gradient :  2.2304626126031395
68.40304565429688
Clip gradient :  1.7081155563420243
67.8493423461914
Clip gradient :  1.4599906199666144
67.35659790039062
Clip gradient :  1.510708692693682
66.80661010742188
Clip gradient :  1.7129873487323994
66.14325714111328
Clip gradient :  1.9958630798225947
65.33881378173828
Clip gradient :  2.3270818854257675
64.37190246582031
Clip gradient :  2.7023517440101648
63.21278762817383
Clip gradient :  3.125429528645227
61.82717514038086
Clip gradient :  3.578986768352163
60.19051742553711
Clip gradient :  4.0361508741756875
58.298744201660156
Clip gradient :  4.442337809950821
56.182865142822266
Clip gradient :  4.747804037796753
53.897342681884766
Clip gradient :  4.966897478544474
51.4775505065918
Clip gradient :  5.163303438324523
48.949684143066406
Clip gradient :  5.

In [308]:
rnn.eval()
hh = torch.zeros(rnn.hidden_input.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [290]:
#Написать реализацию GRU и обучить предсказывать слово
class GRU(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(GRU, self).__init__()  
        self.x2update = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.update = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2reset = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.reset = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        
        self.x2hidden_out = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden_out = nn.Linear(in_features=hidden_size, out_features=hidden_size)   

        self.sigmoid = nn.Sigmoid()
        self.activation  = nn.Tanh()
        
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        
        update_gate = self.sigmoid(self.x2update(x) + self.update(prev_hidden))
        reset_gate = self.sigmoid(self.x2reset(x) + self.reset(prev_hidden))
        hidden_output = self.activation(self.x2hidden_out(x) + self.hidden_out(reset_gate*prev_hidden))

        hidden = (1-update_gate)*hidden_output + update_gate*prev_hidden
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

In [291]:
ds = WordDataSet(word=word)
rnn = GRU(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 2000
optim     = SGD(rnn.parameters(), lr = 0.055, momentum=0.8)

In [292]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden_out.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

71.87786102294922
Clip gradient :  5.188306800588145
66.8559799194336
Clip gradient :  2.3696214817023638
59.3044319152832
Clip gradient :  3.9175002259330007
45.96525955200195
Clip gradient :  3.9976920765850466
34.796142578125
Clip gradient :  2.6504255873884772
28.036130905151367
Clip gradient :  1.815157995049056
23.365081787109375
Clip gradient :  1.479896514426469
19.481264114379883
Clip gradient :  1.3148412708741959
16.372228622436523
Clip gradient :  1.249680026927593
14.247832298278809
Clip gradient :  1.859410242095364
13.171234130859375
Clip gradient :  6.359223858177733
13.93499755859375
Clip gradient :  14.66719131366566
13.289627075195312
Clip gradient :  11.064466598866426
13.131752014160156
Clip gradient :  10.336749869044029
12.711416244506836
Clip gradient :  8.969250076095255
12.726506233215332
Clip gradient :  10.8622171662519
12.077552795410156
Clip gradient :  7.379678292885951
11.842845916748047
Clip gradient :  5.032360325973545
11.098884582519531
Clip gradient

1.8747239112854004
Clip gradient :  0.837718973237058
1.8741135597229004
Clip gradient :  1.5661424772895927
1.860508918762207
Clip gradient :  0.9843192916705618
1.8396868705749512
Clip gradient :  0.9062164748161158
1.8240880966186523
Clip gradient :  0.3383635385555431
1.8148870468139648
Clip gradient :  0.5255805001039033
1.9219202995300293
Clip gradient :  4.917594403450506
15.296342849731445
Clip gradient :  16.37532452107007
11.35098648071289
Clip gradient :  22.604804377127415
6.064920425415039
Clip gradient :  26.177884248183464
2.2441534996032715
Clip gradient :  4.931797524986466
2.0749826431274414
Clip gradient :  2.772492253330128
1.91550874710083
Clip gradient :  1.1039791231374494
1.8414382934570312
Clip gradient :  0.3983959595148351
1.8157544136047363
Clip gradient :  0.11649638560944876
1.8000574111938477
Clip gradient :  0.0784632101285742
1.7878165245056152
Clip gradient :  0.06295103557053104
1.7774934768676758
Clip gradient :  0.058432695912305405
1.76842975616455

In [293]:
rnn.eval()
hh = torch.zeros(rnn.hidden_out.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


In [300]:
ds = WordDataSet(word=word)
rnn = GRU(in_size=ds.vec_size, hidden_size=13, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 200
optim     = SGD(rnn.parameters(), lr = 0.055, momentum=0.8)

In [301]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden_out.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

71.39786529541016
Clip gradient :  4.987064269980409
64.61773681640625
Clip gradient :  3.62434131760271
50.63269805908203
Clip gradient :  6.13333817603911
31.830459594726562
Clip gradient :  4.831189811514762
18.17308807373047
Clip gradient :  3.6051484314701923
8.588116645812988
Clip gradient :  2.425961084059931
3.3659610748291016
Clip gradient :  1.2547837448155956
1.1851539611816406
Clip gradient :  0.5287296474615661
0.6112232208251953
Clip gradient :  0.2834386216981405
0.4169178009033203
Clip gradient :  0.1969429686538978
0.3203458786010742
Clip gradient :  0.15348125933504336
0.26098155975341797
Clip gradient :  0.12651812753757427
0.22021865844726562
Clip gradient :  0.10784004374298634
0.19034099578857422
Clip gradient :  0.09402846249622998
0.1674633026123047
Clip gradient :  0.08335494893390355
0.14938068389892578
Clip gradient :  0.07484124154386727
0.1347370147705078
Clip gradient :  0.06788754412814359
0.1226348876953125
Clip gradient :  0.06209801719526665
0.11247539

In [302]:
rnn.eval()
hh = torch.zeros(rnn.hidden_out.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
