## SimpleRNN, LSTM in PyTorch

RNN

In [7]:
import numpy as np
import torch.nn as nn
import torch

In [3]:
# A single batch holding one 4-step sequence of 5-dimensional feature vectors.
train_X = np.array(
    [
        [
            [0.1, 4.2, 1.5, 1.1, 2.8],
            [1.0, 3.1, 2.5, 0.7, 1.1],
            [0.3, 2.1, 1.5, 2.1, 0.1],
            [2.2, 1.4, 0.5, 0.9, 1.1],
        ]
    ],
    dtype=np.float32,
)
print(train_X.shape)

(1, 4, 5)


In [14]:
# RNN mapping 5 input features to 3 hidden units; batch_first=True means the
# input is laid out as (batch, seq_len, features).
rnn = nn.RNN(input_size=5, hidden_size=3, batch_first=True)
# NOTE: despite its name, `hidden_state` holds the module's full return value,
# i.e. the tuple (output for every time step, final hidden state).
hidden_state = rnn(torch.Tensor(train_X))

In [16]:
# Print the shape of each element of the tuple returned by the RNN.
for x in hidden_state:
    print(x.shape)

torch.Size([1, 4, 3])
torch.Size([1, 1, 3])


In [17]:
# Display the raw tuple; the second tensor (final hidden state) equals the
# last time step of the first tensor.
hidden_state

(tensor([[[-0.6037,  0.3148,  0.9215],
          [-0.2451,  0.4842,  0.1278],
          [-0.4799, -0.7831,  0.0037],
          [ 0.1668, -0.8375,  0.9610]]], grad_fn=<TransposeBackward1>),
 tensor([[[ 0.1668, -0.8375,  0.9610]]], grad_fn=<StackBackward>))

LSTM

In [22]:
# Bidirectional LSTM: each direction uses 3 hidden units, so the per-step
# output concatenates both directions (last dimension 2 * 3 = 6).
lstm = nn.LSTM(input_size=5, hidden_size=3, batch_first=True, bidirectional=True)
# h_n / c_n are the final hidden and cell states, one entry per direction.
output, (h_n, c_n) = lstm(torch.Tensor(train_X))

In [23]:
# Inspect the per-step outputs and the final states. h_n[0] matches the
# forward half of the last step of `output`; h_n[1] matches the backward half
# of the first step.
print(output)
print(h_n)
print(c_n)

tensor([[[ 0.4927, -0.5936, -0.1566, -0.0501,  0.3646,  0.2888],
         [ 0.6602, -0.6393, -0.3721, -0.1072,  0.0677,  0.1797],
         [ 0.4946, -0.6110, -0.4594, -0.0567,  0.1437,  0.2343],
         [ 0.6278, -0.3790, -0.4976, -0.0691, -0.0278,  0.3694]]],
       grad_fn=<TransposeBackward0>)
tensor([[[ 0.6278, -0.3790, -0.4976]],

        [[-0.0501,  0.3646,  0.2888]]], grad_fn=<StackBackward>)
tensor([[[ 0.9288, -0.6268, -1.2566]],

        [[-1.0342,  0.9499,  0.6844]]], grad_fn=<StackBackward>)


## RNN을 이용한 텍스트 생성

### 데이터 전처리

In [98]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [99]:
# Toy corpus of three sentences. Each line ends with an explicit "\n" escape
# followed by the literal line break, so sentences are separated by two
# newlines and splitting on '\n' also yields empty strings.
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [100]:
# Fit a word-level tokenizer on the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Word indices start at 1, so add one slot for index 0 (used as padding below).
vocab_size = len(tokenizer.word_index) + 1

In [101]:
# Display the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [102]:
# Expand every line into its growing prefixes (first 2 tokens, first 3, ...);
# each prefix becomes one training sample.
sequences = []
for line in text.split('\n'):
    token_ids = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

print(f'학습에 사용할 샘플의 개수: {len(sequences)}')

학습에 사용할 샘플의 개수: 11


In [103]:
# Display the variable-length prefix samples before padding.
sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 1],
 [6, 1, 7],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

In [104]:
# Left-pad every sample with zeros so that all rows share the length of the
# longest sample.
max_len = max(map(len, sequences))

sequences = pad_sequences(sequences, padding='pre', maxlen=max_len)
sequences

array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]], dtype=int32)

In [105]:
# The last column is the word to predict (label); all preceding columns form
# the input context.
X, y = sequences[:, :-1], sequences[:, -1]
print(X), print(y)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 3  1  4  5  1  7  1  9 10  1 11]


(None, None)

In [106]:
# Convert the integer labels into one-hot vectors of length vocab_size.
y = to_categorical(y, num_classes = vocab_size)
# Display the encoded labels together with their shape.
y, y.shape

(array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32),
 (11, 12))

### 모델 설계

In [207]:
import torch.nn as nn
import torch

class text_RNN(nn.Module):
    """Next-word predictor: embedding -> RNN -> linear projection.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            output logit vector.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim,
                          hidden_size=hidden_dim,
                          batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        # Not applied in forward(), which returns raw logits. Kept as an
        # attribute for callers that want probabilities; the explicit dim
        # avoids the deprecated implicit-dimension behaviour of nn.Softmax().
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inp):
        """Return vocabulary logits computed from the final hidden state.

        Args:
            inp: Integer token-id tensor, either a single sequence of shape
                (seq_len,) or a batch of shape (batch, seq_len).

        Returns:
            Tensor of shape (1, batch, vocab_size); batch is 1 for a single
            sequence. The leading axis comes from the RNN's final hidden
            state, which is indexed by layer.
        """
        embedded = self.embedding(inp)
        if inp.dim() < 2:
            # A lone sequence gets a batch axis so the RNN sees
            # (batch=1, seq_len, embedding_dim).
            embedded = torch.unsqueeze(embedded, 0)
        _, last_hidden = self.rnn(embedded)
        return self.linear(last_hidden)

In [208]:
# Hyper-parameters.
lr = 0.001
hidden_dim = 32
embedding_dim = 10

# Model sized to the tokenizer vocabulary.
model = text_RNN(vocab_size, embedding_dim, hidden_dim)

# Cross-entropy loss, optimised with Adam.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [262]:
# torch.as_tensor passes existing tensors through unchanged, so re-running
# this cell no longer emits the "copy construct from a tensor" UserWarning
# that torch.tensor(y) produces when y is already a tensor.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y)
from sklearn.metrics import accuracy_score

for epoch in range(1, 201):
    # The trackers are re-created every epoch, so after the loop train_losses
    # holds the per-sample losses of the final epoch only.
    train_losses, valid_losses = [], []
    train_accs, valid_accs = [], []

    model.train()
    # One optimizer step per (sequence, label) pair, i.e. batch size 1.
    for inp, label in zip(X, y):
        inp = inp.long()  # the embedding lookup needs integer token ids
        optimizer.zero_grad()
        output = model(inp)
        # nn.CrossEntropyLoss takes class indices as targets, so the one-hot
        # label is converted back to its index. output[0] drops the leading
        # axis of the model output, leaving (1, vocab_size) logits.
        target = label.argmax().unsqueeze(0)
        loss = criterion(output[0], target)

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

  X, y = torch.Tensor(X), torch.tensor(y)


In [263]:
# Display the recorded per-sample losses (the list is reset at the start of
# every epoch, so these come from the last epoch).
train_losses

[0.017601946368813515,
 0.003255189396440983,
 0.019564863294363022,
 0.007057973183691502,
 0.01085318997502327,
 0.015461812727153301,
 0.0036493625957518816,
 0.011755832470953465,
 0.00931016355752945,
 0.007985097356140614,
 0.00857043731957674]