In [318]:
import torch
import torch.nn as nn
from torch.optim import SGD, Adam
import numpy as np

# Упражнение, для реализации "Ванильной" RNN
* Попробуем обучить сеть восстанавливать слово hello по первой букве. т.е. построим charecter-level модель

In [319]:
a = torch.ones((3,3))*3
b = torch.ones((3,3))*5

In [320]:
a @ b

tensor([[45., 45., 45.],
        [45., 45., 45.],
        [45., 45., 45.]])

In [321]:
a * b

tensor([[15., 15., 15.],
        [15., 15., 15.],
        [15., 15., 15.]])

In [322]:
# word = 'ololoasdasddqweqw123456789'
word = 'hello'

## Датасет. 
Позволяет:
* Закодировать символ при помощи one-hot
* Делать итератор по слову, которыей возвращает текущий символ и следующий как таргет

In [323]:
class WordDataSet:
    
    def __init__(self, word):
        self.chars2idx = {}
        self.indexs  = []
        for c in word: 
            if c not in self.chars2idx:
                self.chars2idx[c] = len(self.chars2idx)
                
            self.indexs.append(self.chars2idx[c])
            
        self.vec_size = len(self.chars2idx)
        self.seq_len  = len(word)
        
    def get_one_hot(self, idx):
        x = torch.zeros(self.vec_size)
        x[idx] = 1
        return x
    
    def __iter__(self):
        return zip(self.indexs[:-1], self.indexs[1:])
    
    def __len__(self):
        return self.seq_len
    
    def get_char_by_id(self, id):
        for c, i in self.chars2idx.items():
            if id == i: return c
        return None

## Реализация базовой RNN
<br/>
Скрытый элемент
$$ h_t= tanh⁡ (W_{ℎℎ} h_{t−1}+W_{xh} x_t) $$
Выход сети

$$ y_t = W_{hy} h_t $$

In [324]:
class VanillaRNN(nn.Module):
    
    def __init__(self, in_size=5, hidden_size=3, out_size=5):
        super(VanillaRNN, self).__init__()        
        self.x2hidden    = nn.Linear(in_features=in_size, out_features=hidden_size)
        self.hidden      = nn.Linear(in_features=hidden_size, out_features=hidden_size)
        self.activation  = nn.Tanh()
        self.outweight   = nn.Linear(in_features=hidden_size, out_features=out_size)
    
    def forward(self, x, prev_hidden):
        hidden = self.activation(self.x2hidden(x) + self.hidden(prev_hidden))
#         Версия без активации - может происходить gradient exploding
#         hidden = self.x2hidden(x) + self.hidden(prev_hidden)
        output = self.outweight(hidden)
        return output, hidden

## Инициализация переменных 

In [325]:
ds = WordDataSet(word=word)
rnn = VanillaRNN(in_size=ds.vec_size, hidden_size=3, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 100
optim     = SGD(rnn.parameters(), lr = 0.1, momentum=0.9)

# Обучение

In [326]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(rnn.hidden.in_features)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = rnn(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1)
            
#     print("Params : ")
#     num_params = 0
#     for item in rnn.parameters():
#         num_params += 1
#         print(item.grad)
#     print("NumParams :", num_params)
#     print("Optimize")
    
    optim.step()

6.03316068649292
Clip gradient :  2.626285924888852
2.3404970169067383
Clip gradient :  0.7095759849218488
1.9489357471466064
Clip gradient :  0.5791866418988671
1.83073091506958
Clip gradient :  0.35723572477479315
1.4334876537322998
Clip gradient :  0.28853635514587117
1.1885817050933838
Clip gradient :  3.077614004052836
0.6803288459777832
Clip gradient :  3.465572836200221
0.3744478225708008
Clip gradient :  0.33284903186139936
0.294940710067749
Clip gradient :  1.5264993843834198
0.2093346118927002
Clip gradient :  0.6694714282480253


# Тестирование

In [327]:
rnn.eval()
hh = torch.zeros(rnn.hidden.in_features)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = rnn(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 hello
Original:	 hello


# ДЗ
Реализовать LSTM и GRU модули, обучить их предсказывать тестовое слово
Сохранить ноутбук с предсказанием и пройденным assert и прислать на почту a.murashev@corp.mail.ru
c темой:


[МФТИ\_2019\_1] ДЗ №8 ФИО

In [356]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

## Реализовать LSTM

In [357]:
#Написать реализацию LSTM и обучить предсказывать слово
# Peephole LSTM implementation
class LSTM(nn.Module):
    def __init__(self, in_size=5, out_size=5, bias=True):
        super(LSTM, self).__init__()   
  
        self.out_size = out_size
        self.in_size = in_size
        self.hidden_size = in_size
        
        self.sigm_activation_f = nn.Sigmoid()
        self.sigm_activation_i = nn.Sigmoid()
        self.sigm_activation_o = nn.Sigmoid()
        
        self.tanh_activation_c = nn.Tanh()
        self.tanh_activation_h = nn.Tanh()
        
        self.f_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.f_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias )
        
        self.i_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.i_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        
        self.o_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.o_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        
        self.c_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size)
        
        self.out = nn.Linear(in_features=self.hidden_size, out_features=out_size)
        
    def forward(self, x, prev_c):
        f = self.sigm_activation_f(self.f_x2h(x) + self.f_hidden(prev_c))
        i = self.sigm_activation_i(self.i_x2h(x) + self.i_hidden(prev_c))
        o = self.sigm_activation_o(self.o_x2h(x) + self.o_hidden(prev_c))
        
        c = f * prev_c + i * self.tanh_activation_c(self.c_x2h(x))
        h = o * c
        y = self.out(h)
        
        return y, c
        

In [358]:
torch.manual_seed(42)
ds = WordDataSet(word=word)
lstm = LSTM(in_size=ds.vec_size, out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = Adam(lstm.parameters(), lr = 0.1)

In [359]:
CLIP_GRAD = True
for epoch in range(e_cnt):
    cc = torch.zeros(lstm.in_size)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, cc = lstm(x, cc)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(lstm.parameters(), max_norm=1)
            
    optim.step()

70.8570785522461
Clip gradient :  3.0038536452630624
7.854757785797119
Clip gradient :  11.249658075653167
2.4592647552490234
Clip gradient :  7.6202734546458295
0.10059642791748047
Clip gradient :  0.2255499947763327
0.014684677124023438
Clip gradient :  0.07922575121566285
0.005322456359863281
Clip gradient :  0.015420438977321628
0.0023984909057617188
Clip gradient :  0.005125523382784418
0.0016279220581054688
Clip gradient :  0.004057266860241546
0.0012645721435546875
Clip gradient :  0.0020207037678093864
0.0010967254638671875
Clip gradient :  0.001753066475971767
0.0009679794311523438
Clip gradient :  0.0013396732972110307
0.0008792877197265625
Clip gradient :  0.0012730166181135157
0.000804901123046875
Clip gradient :  0.0011036007590247
0.0007429122924804688
Clip gradient :  0.0009928215483807153
0.0006971359252929688
Clip gradient :  0.0009316020256877275
0.00064849853515625
Clip gradient :  0.0008776979038272071
0.0006113052368164062
Clip gradient :  0.0008236966150692002
0.0

In [360]:
lstm.eval()
cc = torch.zeros(lstm.in_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, cc = lstm(x, cc)
    
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789


## Реализовать GRU

In [333]:
#тестовое слово
word = 'ololoasdasddqweqw123456789'

In [334]:
#Написать реализацию GRU и обучить предсказывать слово
class GRU(nn.Module):
    def __init__(self, in_size=5,  out_size=5, bias=True):
        super(GRU, self).__init__()  
        
        self.hidden_size = in_size
        self.in_size = in_size
        self.out_size = out_size
        
        self.sigm_z = nn.Sigmoid()
        self.sigm_r = nn.Sigmoid()
        self.tan_h = nn.Tanh()
        
        self.z_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.z_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        
        self.r_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.r_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        
        self.h_x2h = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        self.h_hidden = nn.Linear(in_features=in_size, out_features=self.hidden_size, bias=bias)
        
        self.out = nn.Linear(in_features=self.hidden_size, out_features=out_size)
        
    def forward(self, x, prev_h):
        z = self.sigm_z(self.z_x2h(x) + self.z_hidden(prev_h))
        r = self.sigm_r(self.r_x2h(x) + self.r_hidden(prev_h))

        h = z * prev_h + (1. - z) * self.tan_h(self.h_x2h(x) + self.h_hidden(r * prev_h))
        y = self.out(h)
        
        return y, h

In [335]:
torch.manual_seed(42)
ds = WordDataSet(word=word)
gru = GRU(in_size=ds.vec_size,  out_size=ds.vec_size)
criterion = nn.CrossEntropyLoss()
e_cnt     = 500
optim     = SGD(gru.parameters(), lr = 0.1, momentum=0.9)

In [336]:
CLIP_GRAD = True

for epoch in range(e_cnt):
    hh = torch.zeros(gru.in_size).unsqueeze(0)
    loss = 0
    optim.zero_grad()
    for sample, next_sample in ds:
        x = ds.get_one_hot(sample).unsqueeze(0)
        target =  torch.LongTensor([next_sample])

        y, hh = gru(x, hh)
        
        loss += criterion(y, target)
     

    loss.backward()
    
    if epoch % 10 == 0:
        print (loss.data.item())
        if CLIP_GRAD: print("Clip gradient : ", torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=5))
    else: 
        if CLIP_GRAD: torch.nn.utils.clip_grad_norm_(gru.parameters(), max_norm=1)
            

    optim.step()

71.14236450195312
Clip gradient :  5.055303000771596
56.5948371887207
Clip gradient :  9.271184235423982
28.55231475830078
Clip gradient :  11.662204908206084
14.145013809204102
Clip gradient :  9.473009859417527
6.945394039154053
Clip gradient :  6.867463626457366
2.210543632507324
Clip gradient :  3.2971812540620915
0.3948345184326172
Clip gradient :  0.7976592332775361
0.10955142974853516
Clip gradient :  0.10481674712130083
0.052211761474609375
Clip gradient :  0.06145053952224002
0.032196044921875
Clip gradient :  0.025888050208740264
0.02506542205810547
Clip gradient :  0.020045744843670797
0.021053314208984375
Clip gradient :  0.014902723855270165
0.01864147186279297
Clip gradient :  0.012741841281383092
0.016928672790527344
Clip gradient :  0.01155601729181755
0.015563011169433594
Clip gradient :  0.01054119207097949
0.014435768127441406
Clip gradient :  0.009750464301017461
0.013478279113769531
Clip gradient :  0.009100215227387125
0.012643814086914062
Clip gradient :  0.00853

In [337]:
gru.eval()
hh = torch.zeros(gru.in_size)
id = 0
softmax  = nn.Softmax(dim=1)
predword = ds.get_char_by_id(id)
for c in enumerate(word[:-1]):
    x = ds.get_one_hot(id).unsqueeze(0)
    y, hh = gru(x, hh)
    y = softmax(y)
    m, id = torch.max(y, 1)
    id = id.data[0]
    predword += ds.get_char_by_id(id)
print ('Prediction:\t' , predword)
print("Original:\t", word)
assert(predword == word)

Prediction:	 ololoasdasddqweqw123456789
Original:	 ololoasdasddqweqw123456789
