In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense

# Stack: token-id embedding -> SimpleRNN -> single sigmoid output unit.
model = Sequential([
    Embedding(5000, 100),            # 5,000 token ids, each mapped to a 100-d vector
    SimpleRNN(128),                  # 128 hidden units
    Dense(1, activation='sigmoid'),  # one output unit
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 100)         500000    
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               29312     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 529,441
Trainable params: 529,441
Non-trainable params: 0
_________________________________________________________________


In [4]:
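# Embedding = 5,000(input) * 100(embedding) = 500,000
# Wx = 100(embedding) * 128(hidden) = 12,800
# Wh = 128(hidden) * 128(hidden) = 16,384
# bias(hidden) = 128
# => SimpleRNN = 12,800 + 16,384 + 128 = 29,312
# Wy = 128(hidden) * 1(output) = 128
# bias(output) = 1
# => Dense = 128 + 1 = 129
# Total = 500,000 + 29,312 + 129 = 529,441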

### LSTM의 cell state와 hidden state의 차이를 설명하는 링크

[cell state와 hidden state의 차이](https://medium.com/analytics-vidhya/lstms-explained-a-complete-technically-accurate-conceptual-guide-with-keras-2a650327e8f2)

## 케라스의 SimpleRNN과 LSTM 이해하기

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Bidirectional

# Toy input: 4 rows of 5 values each (no batch dimension yet).
train_x = [
    [0.1, 4.2, 1.5, 1.1, 2.8],
    [1.0, 3.1, 2.5, 0.7, 1.1],
    [0.3, 2.1, 1.5, 2.1, 0.1],
    [2.2, 1.4, 0.5, 0.9, 1.1],
]
print(np.shape(train_x))

(4, 5)


In [11]:
# Same 4x5 values wrapped in one extra outer list, giving a 3-D float32
# array of shape (1, 4, 5).
train_x = np.array(
    [[
        [0.1, 4.2, 1.5, 1.1, 2.8],
        [1.0, 3.1, 2.5, 0.7, 1.1],
        [0.3, 2.1, 1.5, 2.1, 0.1],
        [2.2, 1.4, 0.5, 0.9, 1.1],
    ]],
    dtype=np.float32,
)
print(train_x.shape)

(1, 4, 5)


In [12]:
# Equivalent to SimpleRNN(3, return_sequences=False, return_state=False):
# the call returns a single hidden-state tensor for the whole input.
rnn = SimpleRNN(3)
hidden_state = rnn(train_x)

print(f'hidden state : {hidden_state}, shape : {hidden_state.shape}')

hidden state : [[-0.95729214  0.2321412  -0.80941206]], shape : (1, 3)


In [13]:
# With return_sequences=True the layer returns the hidden state of every
# time step instead of only the last one.
rnn = SimpleRNN(3, return_sequences=True)
hidden_state = rnn(train_x)
print(f'hidden state : {hidden_state}, shape : {hidden_state.shape}')

hidden state : [[[-0.99868554 -0.99195284  0.9958747 ]
  [-0.9999305  -0.2694045   0.8647588 ]
  [-0.9967939  -0.9049087   0.8768172 ]
  [-0.99918365  0.94886655  0.50997233]]], shape : (1, 4, 3)


In [14]:
# return_sequences=True -> hidden state of every time step;
# return_state=True     -> the layer additionally returns the final hidden state,
# which equals the last row of `hidden_states`.
rnn = SimpleRNN(3, return_sequences=True, return_state=True)
hidden_states, last_state = rnn(train_x)
# Label spelling fixed ('shpae' -> 'shape') to match the other cells' output.
print('hidden states : {}, shape : {}'.format(hidden_states, hidden_states.shape))
print('last hidden state : {}, shape : {}'.format(last_state, last_state.shape))

hidden states : [[[ 0.97910935 -0.49686566  0.99627894]
  [ 0.9931132  -0.8275732   0.999276  ]
  [ 0.5923131  -0.9827416   0.9811445 ]
  [ 0.97378886 -0.9870051   0.972459  ]]], shpae : (1, 4, 3)
last hidden state : [[ 0.97378886 -0.9870051   0.972459  ]], shape : (1, 3)


## RNN을 이용한 텍스트 생성

In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Tiny three-line training corpus.
text = '''경마장에 있는 말이 뛰고 있다
그의 말이 법이다
가는 말이 고와야 오는 말이 곱다
'''

# Fit the word-level tokenizer on the whole corpus as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# word_index starts at 1, so one extra slot is added for index 0.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

12


In [2]:
# Display the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

{'말이': 1,
 '경마장에': 2,
 '있는': 3,
 '뛰고': 4,
 '있다': 5,
 '그의': 6,
 '법이다': 7,
 '가는': 8,
 '고와야': 9,
 '오는': 10,
 '곱다': 11}

In [4]:
# For every line of the corpus, collect each prefix of length >= 2 of its
# encoded token ids; the last id of each prefix later serves as the label.
sequences = []
for line in text.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    # Empty lines encode to [] and therefore contribute nothing.
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print(len(sequences))

11


In [6]:
# Display the variable-length prefix sequences built in the previous cell.
sequences

[[2, 3],
 [2, 3, 1],
 [2, 3, 1, 4],
 [2, 3, 1, 4, 5],
 [6, 1],
 [6, 1, 7],
 [8, 1],
 [8, 1, 9],
 [8, 1, 9, 10],
 [8, 1, 9, 10, 1],
 [8, 1, 9, 10, 1, 11]]

In [7]:
# Length of the longest training sequence; used as the padding target.
max_len = max(map(len, sequences))
print(max_len)

6


In [9]:
# Left-pad every sequence with zeros so that all rows have length max_len.
sequences = pad_sequences(
    sequences,
    padding='pre',
    maxlen=max_len,
)
print(sequences)

[[ 0  0  0  0  2  3]
 [ 0  0  0  2  3  1]
 [ 0  0  2  3  1  4]
 [ 0  2  3  1  4  5]
 [ 0  0  0  0  6  1]
 [ 0  0  0  6  1  7]
 [ 0  0  0  0  8  1]
 [ 0  0  0  8  1  9]
 [ 0  0  8  1  9 10]
 [ 0  8  1  9 10  1]
 [ 8  1  9 10  1 11]]


In [10]:
# Split each padded row into model input (all but the last id) and
# target (the last id).
sequences = np.array(sequences)
x, y = sequences[:, :-1], sequences[:, -1]

In [11]:
# Display the model inputs: every padded row without its last token id.
x

array([[ 0,  0,  0,  0,  2],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  2,  3,  1],
       [ 0,  2,  3,  1,  4],
       [ 0,  0,  0,  0,  6],
       [ 0,  0,  0,  6,  1],
       [ 0,  0,  0,  0,  8],
       [ 0,  0,  0,  8,  1],
       [ 0,  0,  8,  1,  9],
       [ 0,  8,  1,  9, 10],
       [ 8,  1,  9, 10,  1]])

In [13]:
# One-hot encode the target ids. The labels are taken from the last column of
# `sequences` instead of from `y` itself, so rerunning this cell does not
# one-hot encode labels that are already one-hot.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
print(y)

[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

# Next-word model: 10-d embeddings -> 32-unit SimpleRNN -> softmax over the
# vocabulary (vocab_size output units).
model = Sequential([
    Embedding(vocab_size, 10),
    SimpleRNN(32),
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
# verbose=2 prints one line per epoch.
model.fit(x, y, epochs=200, verbose=2)

Epoch 1/200
1/1 - 2s - loss: 2.4747 - acc: 0.1818 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 2.4616 - acc: 0.1818 - 14ms/epoch - 14ms/step
Epoch 3/200
1/1 - 0s - loss: 2.4485 - acc: 0.0909 - 16ms/epoch - 16ms/step
Epoch 4/200
1/1 - 0s - loss: 2.4353 - acc: 0.2727 - 18ms/epoch - 18ms/step
Epoch 5/200
1/1 - 0s - loss: 2.4219 - acc: 0.2727 - 11ms/epoch - 11ms/step
Epoch 6/200
1/1 - 0s - loss: 2.4083 - acc: 0.4545 - 15ms/epoch - 15ms/step
Epoch 7/200
1/1 - 0s - loss: 2.3944 - acc: 0.4545 - 13ms/epoch - 13ms/step
Epoch 8/200
1/1 - 0s - loss: 2.3801 - acc: 0.4545 - 15ms/epoch - 15ms/step
Epoch 9/200
1/1 - 0s - loss: 2.3654 - acc: 0.4545 - 14ms/epoch - 14ms/step
Epoch 10/200
1/1 - 0s - loss: 2.3502 - acc: 0.4545 - 15ms/epoch - 15ms/step
Epoch 11/200
1/1 - 0s - loss: 2.3344 - acc: 0.4545 - 14ms/epoch - 14ms/step
Epoch 12/200
1/1 - 0s - loss: 2.3181 - acc: 0.4545 - 15ms/epoch - 15ms/step
Epoch 13/200
1/1 - 0s - loss: 2.3011 - acc: 0.4545 - 14ms/epoch - 14ms/step
Epoch 14/200
1/1 - 0s - l

<keras.callbacks.History at 0x1a5493fd8b0>

In [16]:
def sentence_generation(model, tokenizer, current_word, n, max_len=5):
    """Generate text by repeatedly predicting the next word.

    At each step the text generated so far is encoded with ``tokenizer``,
    left-padded to ``max_len`` ids, and passed to ``model.predict``; the index
    with the highest score is mapped back to a word and appended.

    Args:
        model: Trained model whose ``predict`` returns one row of per-index
            scores for a batch containing a single padded sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        current_word: Seed text that the generated sentence starts with.
        n: Maximum number of words to append.
        max_len: Length the encoded input is padded to. It should match the
            input length used in training; defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces.
    """
    words = [current_word]

    for _ in range(n):
        # Encode everything generated so far and pad it to the model's input length.
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')

        scores = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        # Direct reverse lookup instead of scanning word_index linearly.
        next_word = tokenizer.index_word.get(predicted_index)
        if next_word is None:
            # The predicted index has no word (e.g. the padding index 0).
            # Stop here instead of appending an unrelated vocabulary word.
            break

        words.append(next_word)

    return ' '.join(words)

In [17]:
# Generate 4 words following the seed word '경마장에'.
print(sentence_generation(model, tokenizer, '경마장에', 4))

경마장에 있는 말이 뛰고 있다


In [18]:
# Generate 2 words following the seed word '그의'.
print(sentence_generation(model, tokenizer, '그의', 2))

그의 말이 법이다


In [20]:
# Generate 5 words following the seed word '가는'.
print(sentence_generation(model, tokenizer, '가는', 5))

가는 말이 고와야 오는 말이 곱다


## LSTM을 이용하여 텍스트 생성하기

In [21]:
import os
from pathlib import Path
from string import punctuation

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

# Directory that holds the dataset. Set the DATA_DIR environment variable to
# run the notebook on another machine; the default is the original location.
DATA_DIR = Path(os.environ.get('DATA_DIR', 'C:/Users/Myeong/dding/data'))

df = pd.read_csv(DATA_DIR / 'ArticlesApril2018.csv')
df.head()

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [23]:
# Show the column index, then the number of columns on the next line.
print(df.columns, len(df.columns), sep='\n')

Index(['articleID', 'articleWordCount', 'byline', 'documentType', 'headline',
       'keywords', 'multimedia', 'newDesk', 'printPage', 'pubDate',
       'sectionName', 'snippet', 'source', 'typeOfMaterial', 'webURL'],
      dtype='object')
15


In [24]:
# Count missing values in the headline column (isna is the same check as isnull).
df['headline'].isna().sum()

0

In [25]:
# Collect every headline into a plain Python list and preview the first three.
headline = list(df['headline'].values)
headline[:3]

['Former N.F.L. Cheerleaders’ Settlement Offer: $1 and a Meeting With Goodell',
 'E.P.A. to Unveil a New Rule. Its Effect: Less Science in Policymaking.',
 'The New Noma, Explained']

In [26]:
# Drop entries equal to the string 'Unknown', printing the count before and after.
print(len(headline))
headline = [title for title in headline if title != 'Unknown']
print(len(headline))

1324
1214
