# SimpleRNN, LSTM in PyTorch

RNN

In [7]:
import numpy as np
import torch.nn as nn
import torch

In [3]:
# Toy input: a batch of one sequence with 4 time steps of 5 features each.
train_X = np.array(
    [
        [
            [0.1, 4.2, 1.5, 1.1, 2.8],
            [1.0, 3.1, 2.5, 0.7, 1.1],
            [0.3, 2.1, 1.5, 2.1, 0.1],
            [2.2, 1.4, 0.5, 0.9, 1.1],
        ]
    ],
    dtype=np.float32,
)
print(train_X.shape)

(1, 4, 5)


In [14]:
# Single-layer RNN mapping 5 input features to 3 hidden units. With
# batch_first=True the input is laid out as (batch, seq_len, features).
rnn = nn.RNN(input_size=5, hidden_size=3, batch_first=True)
# NOTE: despite its name, `hidden_state` holds the full return value of the
# call: a tuple of (per-step outputs, final hidden state).
hidden_state = rnn(torch.Tensor(train_X))

In [16]:
for x in hidden_state:
    print(x.shape)

torch.Size([1, 4, 3])
torch.Size([1, 1, 3])


In [17]:
hidden_state

(tensor([[[-0.6037,  0.3148,  0.9215],
          [-0.2451,  0.4842,  0.1278],
          [-0.4799, -0.7831,  0.0037],
          [ 0.1668, -0.8375,  0.9610]]], grad_fn=<TransposeBackward1>),
 tensor([[[ 0.1668, -0.8375,  0.9610]]], grad_fn=<StackBackward>))

LSTM

In [22]:
# Bidirectional LSTM: every time step of `output` concatenates the forward and
# backward hidden states, so its last dimension is 2 * hidden_size = 6.
lstm = nn.LSTM(input_size=5, hidden_size=3, batch_first=True, bidirectional=True)
# h_n and c_n are the final hidden and cell states, one entry per direction.
output, (h_n, c_n) = lstm(torch.Tensor(train_X))

In [23]:
print(output)
print(h_n)
print(c_n)

tensor([[[ 0.4927, -0.5936, -0.1566, -0.0501,  0.3646,  0.2888],
         [ 0.6602, -0.6393, -0.3721, -0.1072,  0.0677,  0.1797],
         [ 0.4946, -0.6110, -0.4594, -0.0567,  0.1437,  0.2343],
         [ 0.6278, -0.3790, -0.4976, -0.0691, -0.0278,  0.3694]]],
       grad_fn=<TransposeBackward0>)
tensor([[[ 0.6278, -0.3790, -0.4976]],

        [[-0.0501,  0.3646,  0.2888]]], grad_fn=<StackBackward>)
tensor([[[ 0.9288, -0.6268, -1.2566]],

        [[-1.0342,  0.9499,  0.6844]]], grad_fn=<StackBackward>)


# RNN을 이용한 텍스트 생성

* TensorFlow로 되어 있는 코드를 PyTorch로 전환

## RNN 모델 이용

### 데이터 전처리

In [445]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [446]:
text = """경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다\n"""

In [447]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
vocab_size = len(tokenizer.word_index) + 1

In [448]:
tokenizer.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [449]:
# Build the training samples: for every line, emit each prefix of its
# token-id sequence that contains at least two tokens (context + next word).
sequences = []
for line in text.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print(f'학습에 사용할 샘플의 개수: {len(sequences)}')

학습에 사용할 샘플의 개수: 11


In [450]:
sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 1],
 [6, 1, 7],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

In [451]:
# Left-pad every sequence with zeros so that all of them share the length of
# the longest one.
max_len = max(map(len, sequences))

sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
sequences

array([[ 0,  0,  0,  0,  2,  3],
       [ 0,  0,  0,  2,  3,  1],
       [ 0,  0,  2,  3,  1,  4],
       [ 0,  2,  3,  1,  4,  5],
       [ 0,  0,  0,  0,  6,  1],
       [ 0,  0,  0,  6,  1,  7],
       [ 0,  0,  0,  0,  8,  1],
       [ 0,  0,  0,  8,  1,  9],
       [ 0,  0,  8,  1,  9, 10],
       [ 0,  8,  1,  9, 10,  1],
       [ 8,  1,  9, 10,  1, 11]], dtype=int32)

In [452]:
# Every token except the last one is the model input; the last token of each
# row is the label to predict.
X, y = sequences[:, :-1], sequences[:, -1]
print(X), print(y)

[[ 0  0  0  0  2]
 [ 0  0  0  2  3]
 [ 0  0  2  3  1]
 [ 0  2  3  1  4]
 [ 0  0  0  0  6]
 [ 0  0  0  6  1]
 [ 0  0  0  0  8]
 [ 0  0  0  8  1]
 [ 0  0  8  1  9]
 [ 0  8  1  9 10]
 [ 8  1  9 10  1]]
[ 3  1  4  5  1  7  1  9 10  1 11]


(None, None)

In [453]:
# Convert the integer labels into one-hot vectors of length vocab_size.
y = to_categorical(y, num_classes = vocab_size)
y, y.shape

(array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32),
 (11, 12))

### 모델 설계

In [461]:
import torch.nn as nn
import torch

class text_RNN(nn.Module):
    """Next-word predictor: embedding -> single-layer RNN -> linear projection.

    ``forward`` returns raw logits without applying softmax, because
    ``nn.CrossEntropyLoss`` already applies log-softmax internally.

    Args:
        vocab_size: Number of distinct token ids the embedding must cover.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the RNN hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(input_size=embedding_dim,
                          hidden_size=hidden_dim,
                          batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        # Not used by forward(); available for callers that want
        # probabilities. The explicit dim avoids the deprecated
        # implicit-dimension behaviour and normalises over the vocabulary axis.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inp):
        """Return next-token logits for a batch of token-id sequences.

        Args:
            inp: LongTensor of token ids with shape (batch, seq_len).

        Returns:
            Logits of shape (1, batch, vocab_size), computed from the RNN's
            final hidden state (the leading axis is the layer axis).
        """
        embedded = self.embedding(inp)
        # nn.RNN returns (per-step outputs, final hidden state); only the
        # final hidden state is used to predict the next token.
        _, last_hidden = self.rnn(embedded)
        return self.linear(last_hidden)

In [462]:
# Hyperparameters for the toy next-word RNN.
embedding_dim = 10
hidden_dim = 32
lr = 0.01

model = text_RNN(vocab_size=vocab_size,
                 embedding_dim=embedding_dim,
                 hidden_dim=hidden_dim)

# CrossEntropyLoss consumes the model's raw logits (it applies softmax itself).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [463]:
from sklearn.metrics import accuracy_score

# Convert the training data to tensors once, instead of rebuilding a tensor
# from a list of NumPy arrays for every sample in every epoch (which is slow).
inputs = torch.as_tensor(X, dtype=torch.long)
# y is one-hot encoded; CrossEntropyLoss takes class indices, so recover them
# a single time up front rather than on every step.
targets = torch.as_tensor(y).argmax(dim=1)

for epoch in range(1, 201):
    train_losses = []

    model.train()
    # Per-sample updates: one optimizer step per (context, next word) pair.
    for inp, target in zip(inputs, targets):
        optimizer.zero_grad()
        # The model returns logits with a leading layer axis; [0] drops it so
        # the loss sees a (batch=1, vocab) tensor.
        logits = model(inp.unsqueeze(0))[0]
        loss = criterion(logits, target.unsqueeze(0))

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    print(f"epoch {epoch:.2f} :: {np.average(train_losses)}")

epoch 1.00 :: 2.4310765158046377
epoch 2.00 :: 1.4229992573911494
epoch 3.00 :: 0.8896948004310782
epoch 4.00 :: 0.5604410672729666
epoch 5.00 :: 0.3601190610365434
epoch 6.00 :: 0.23840859735553915
epoch 7.00 :: 0.16520188444040038
epoch 8.00 :: 0.11592548205093904
epoch 9.00 :: 0.08350600945678624
epoch 10.00 :: 0.06320844353599982
epoch 11.00 :: 0.049969095906073395
epoch 12.00 :: 0.04094145853411068
epoch 13.00 :: 0.03445787643167106
epoch 14.00 :: 0.029586328955536537
epoch 15.00 :: 0.02579465245997364
epoch 16.00 :: 0.02276211562143131
epoch 17.00 :: 0.02028502727096731
epoch 18.00 :: 0.018227602693844925
epoch 19.00 :: 0.01649498820982196
epoch 20.00 :: 0.015018802601844072
epoch 21.00 :: 0.0137483260424977
epoch 22.00 :: 0.012645317156883802
epoch 23.00 :: 0.011680192991413853
epoch 24.00 :: 0.010829922362146053
epoch 25.00 :: 0.010076216422021389
epoch 26.00 :: 0.009404390373013237
epoch 27.00 :: 0.008802372708239338
epoch 28.00 :: 0.008260534060272303
epoch 29.00 :: 0.0077708

In [479]:
import torch.nn.functional as F

def sentence_generation(model, tokenizer, current_word, n, seq_len=5):
    """Greedily extend ``current_word`` with up to ``n`` predicted words.

    Args:
        model: Module mapping a (1, seq_len) LongTensor of token ids to
            logits whose first element has shape (1, vocab_size).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text; may contain several space-separated words.
        n: Maximum number of words to generate.
        seq_len: Length the encoded context is left-padded/truncated to. It
            should match the input length used in training; the default of 5
            is the value that used to be hard-coded.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early when the model predicts an id that has no word
        (such as the padding id 0), which previously raised ``ValueError``.
    """
    # Invert the vocabulary once instead of rebuilding two lists per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    words = [current_word]

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        for _ in range(n):
            context = ' '.join(words)
            encoded = tokenizer.texts_to_sequences([context])[0]
            encoded = pad_sequences([encoded], maxlen=seq_len, padding='pre')

            logits = model(torch.as_tensor(encoded, dtype=torch.long))
            # The argmax of the logits equals the argmax of their softmax.
            predicted = int(logits[0].argmax(dim=1)[0])

            word = index_to_word.get(predicted)
            if word is None:
                break
            words.append(word)

    return ' '.join(words)

In [480]:
print(sentence_generation(model, tokenizer, '경마장에', 4))

경마장에 있는 말이 뛰고 있다


In [481]:
print(sentence_generation(model, tokenizer, '그의', 2))

그의 말이 법이다


In [483]:
print(sentence_generation(model, tokenizer, '가는', 5))

가는 말이 고와야 오는 말이 곱다


In [484]:
print(sentence_generation(model, tokenizer, '안녕', 5))

안녕 말이 말이 뛰고 있다 고와야


## LSTM 이용

### 데이터 전처리

In [497]:
import pandas as pd
import numpy as np
from string import punctuation

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [498]:
df = pd.read_csv('/Users/timber/Personal/Study/NLP_study_tf/ArticlesApril2018.csv')
df.head(1)

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...


In [512]:
headline = list(df.headline.values)
headline[:5]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained',
 'How a Bag of Texas Dirt  Became a Times Tradition',
 'Is School a Place for Self-Expression?']

In [513]:
print(len(df))
df = df[df.headline != 'Unknown']
print(len(df))

headline = list(df.headline.values)
print(headline[:5])

1214
1214
['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell', 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.', 'The New Noma, Explained', 'How a Bag of Texas Dirt  Became a Times Tradition', 'Is School a Place for Self-Expression?']


In [514]:
def repreprocessing(raw_sentence):
    """Drop non-ASCII characters and punctuation from a headline, then lowercase it."""
    # Round-trip through bytes so that anything outside ASCII is discarded.
    ascii_only = raw_sentence.encode('utf8').decode('ascii', 'ignore')
    # Remove every ASCII punctuation character in a single pass.
    without_punctuation = ascii_only.translate(str.maketrans('', '', punctuation))
    return without_punctuation.lower()

preprocessed_headline = [repreprocessing(x) for x in headline]
preprocessed_headline[:5]

['former nfl cheerleaders settlement offer 1 and a meeting with goodell',
 'epa to unveil a new rule its effect less science in policymaking',
 'the new noma explained',
 'how a bag of texas dirt  became a times tradition',
 'is school a place for selfexpression']

In [515]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_headline)
vocab_size = len(tokenizer.word_index) + 1
print(f'단어 집합 크기 : {vocab_size}')

단어 집합 크기 : 3494


In [516]:
# For each headline, collect every prefix of its token-id sequence that has
# at least two tokens (the context plus the word to predict).
sequences = []

for sentence in preprocessed_headline:
    encoded = tokenizer.texts_to_sequences([sentence])[0]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

sequences[:11]

[[99, 269],
 [99, 269, 371],
 [99, 269, 371, 1115],
 [99, 269, 371, 1115, 582],
 [99, 269, 371, 1115, 582, 52],
 [99, 269, 371, 1115, 582, 52, 7],
 [99, 269, 371, 1115, 582, 52, 7, 2],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10],
 [99, 269, 371, 1115, 582, 52, 7, 2, 372, 10, 1116],
 [100, 3]]

In [520]:
# Reverse lookup table: token id -> word.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

In [518]:
# Left-pad with zeros up to the length of the longest sequence.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

In [519]:
sequences = np.array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]

In [522]:
y = to_categorical(y, num_classes=vocab_size)

### 모델 설계

In [534]:
import torch.nn as nn
import torch

class text_LSTM(nn.Module):
    """Next-word predictor: embedding -> single-layer LSTM -> linear projection.

    ``forward`` returns raw logits without applying softmax, because
    ``nn.CrossEntropyLoss`` already applies log-softmax internally.

    Args:
        vocab_size: Number of distinct token ids the embedding must cover.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        # Not used by forward(); available for callers that want
        # probabilities. The explicit dim avoids the deprecated
        # implicit-dimension behaviour and normalises over the vocabulary axis.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, inp):
        """Return next-token logits for a batch of token-id sequences.

        Args:
            inp: LongTensor of token ids with shape (batch, seq_len).

        Returns:
            Logits of shape (1, batch, vocab_size), computed from the LSTM's
            final hidden state (the leading axis is the layer axis).
        """
        embedded = self.embedding(inp)
        # nn.LSTM returns (per-step outputs, (final hidden, final cell));
        # only the final hidden state is used to predict the next token.
        _, (last_hidden, _cell) = self.lstm(embedded)
        return self.linear(last_hidden)

In [535]:
from sklearn.metrics import accuracy_score

# Hyperparameters for the headline next-word LSTM.
embedding_dim = 10
hidden_dim = 128
lr = 0.001

model = text_LSTM(vocab_size=vocab_size,
                  embedding_dim=embedding_dim,
                  hidden_dim=hidden_dim)

# CrossEntropyLoss consumes the model's raw logits (it applies softmax itself).
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Convert the training data to tensors once, instead of rebuilding a tensor
# from a list of NumPy arrays for every sample in every epoch (which is slow).
inputs = torch.as_tensor(X, dtype=torch.long)
# y is one-hot encoded; CrossEntropyLoss takes class indices, so recover them
# a single time up front rather than on every step.
targets = torch.as_tensor(y).argmax(dim=1)

for epoch in range(1, 201):
    train_losses = []

    model.train()
    # Per-sample updates: one optimizer step per (context, next word) pair.
    for inp, target in zip(inputs, targets):
        optimizer.zero_grad()
        # The model returns logits with a leading layer axis; [0] drops it so
        # the loss sees a (batch=1, vocab) tensor.
        logits = model(inp.unsqueeze(0))[0]
        loss = criterion(logits, target.unsqueeze(0))

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
    print(f"epoch {epoch:.2f} :: {np.average(train_losses)}")

epoch 1.00 :: 7.900943478086003
epoch 2.00 :: 6.728090529955612
epoch 3.00 :: 5.9013966943877225
epoch 4.00 :: 5.369305703029424
epoch 5.00 :: 5.020713646777665
epoch 6.00 :: 4.947484123967058
epoch 7.00 :: 4.8535552366062475
epoch 8.00 :: 4.748293452578934
epoch 9.00 :: 4.4281173842290364
epoch 10.00 :: 4.448720963308489
epoch 11.00 :: 4.108683002673569
epoch 12.00 :: 3.7010637753829645
epoch 13.00 :: 3.5861170948561822
epoch 14.00 :: 3.162512032427388
epoch 15.00 :: 2.9025705229767538
epoch 16.00 :: 2.7799290716031337
epoch 17.00 :: 2.498691011906624
epoch 18.00 :: 2.3217985779156445
epoch 19.00 :: 2.032820949528696
epoch 20.00 :: 1.8084147286388212
epoch 21.00 :: 1.8248933880376945
epoch 22.00 :: 1.6327982138771313
epoch 23.00 :: 1.473187281201269
epoch 24.00 :: 1.3461783292940726
epoch 25.00 :: 1.1519929370820345
epoch 26.00 :: 1.0416108766569303
epoch 27.00 :: 0.8889863503865574
epoch 28.00 :: 0.9498417435629661
epoch 29.00 :: 0.8037361822755393
epoch 30.00 :: 0.7736399621444847
e

KeyboardInterrupt: 

In [536]:
import torch.nn.functional as F

def sentence_generation(model, tokenizer, current_word, n, seq_len=5):
    """Greedily extend ``current_word`` with up to ``n`` predicted words.

    Args:
        model: Module mapping a (1, seq_len) LongTensor of token ids to
            logits whose first element has shape (1, vocab_size).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        current_word: Seed text; may contain several space-separated words.
        n: Maximum number of words to generate.
        seq_len: Length the encoded context is left-padded/truncated to. It
            should match the input length used in training; the default of 5
            is the value that used to be hard-coded.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early when the model predicts an id that has no word
        (such as the padding id 0), which previously raised ``ValueError``.
    """
    # Invert the vocabulary once instead of rebuilding two lists per step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    words = [current_word]

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        for _ in range(n):
            context = ' '.join(words)
            encoded = tokenizer.texts_to_sequences([context])[0]
            encoded = pad_sequences([encoded], maxlen=seq_len, padding='pre')

            logits = model(torch.as_tensor(encoded, dtype=torch.long))
            # The argmax of the logits equals the argmax of their softmax.
            predicted = int(logits[0].argmax(dim=1)[0])

            word = index_to_word.get(predicted)
            if word is None:
                break
            words.append(word)

    return ' '.join(words)

In [538]:
print(sentence_generation(model, tokenizer, 'i', 10))


i disapprove of school vouchers can i still apply for them
