# 단어 예측

In [1]:
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
text_data='''Soft as the voice of an angel
Breathing a lesson unhead
Hope with a gentle persuasion
Whispers her comforting word
Wait till the darkness is over
Wait till the tempest is done
Hope for sunshine tomorrow
After the shower
'''

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text_data])
print(tokenizer.word_index)

{'the': 1, 'a': 2, 'hope': 3, 'wait': 4, 'till': 5, 'is': 6, 'soft': 7, 'as': 8, 'voice': 9, 'of': 10, 'an': 11, 'angel': 12, 'breathing': 13, 'lesson': 14, 'unhead': 15, 'with': 16, 'gentle': 17, 'persuasion': 18, 'whispers': 19, 'her': 20, 'comforting': 21, 'word': 22, 'darkness': 23, 'over': 24, 'tempest': 25, 'done': 26, 'for': 27, 'sunshine': 28, 'tomorrow': 29, 'after': 30, 'shower': 31}


In [4]:
encoded = tokenizer.texts_to_sequences([text_data])[0]
print(encoded)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

[7, 8, 1, 9, 10, 11, 12, 13, 2, 14, 15, 3, 16, 2, 17, 18, 19, 20, 21, 22, 4, 5, 1, 23, 6, 24, 4, 5, 1, 25, 6, 26, 3, 27, 28, 29, 30, 1, 31]
32


In [5]:
sequences = [encoded[i - 2:i + 1] for i in range(2, len(encoded))]
print(sequences)
print(len(sequences))

sequences = np.array(sequences)
X, y = sequences[:, :2], sequences[:, 2]
print('X', X)
print('y', y)

[[7, 8, 1], [8, 1, 9], [1, 9, 10], [9, 10, 11], [10, 11, 12], [11, 12, 13], [12, 13, 2], [13, 2, 14], [2, 14, 15], [14, 15, 3], [15, 3, 16], [3, 16, 2], [16, 2, 17], [2, 17, 18], [17, 18, 19], [18, 19, 20], [19, 20, 21], [20, 21, 22], [21, 22, 4], [22, 4, 5], [4, 5, 1], [5, 1, 23], [1, 23, 6], [23, 6, 24], [6, 24, 4], [24, 4, 5], [4, 5, 1], [5, 1, 25], [1, 25, 6], [25, 6, 26], [6, 26, 3], [26, 3, 27], [3, 27, 28], [27, 28, 29], [28, 29, 30], [29, 30, 1], [30, 1, 31]]
37
X [[ 7  8]
 [ 8  1]
 [ 1  9]
 [ 9 10]
 [10 11]
 [11 12]
 [12 13]
 [13  2]
 [ 2 14]
 [14 15]
 [15  3]
 [ 3 16]
 [16  2]
 [ 2 17]
 [17 18]
 [18 19]
 [19 20]
 [20 21]
 [21 22]
 [22  4]
 [ 4  5]
 [ 5  1]
 [ 1 23]
 [23  6]
 [ 6 24]
 [24  4]
 [ 4  5]
 [ 5  1]
 [ 1 25]
 [25  6]
 [ 6 26]
 [26  3]
 [ 3 27]
 [27 28]
 [28 29]
 [29 30]
 [30  1]]
y [ 1  9 10 11 12 13  2 14 15  3 16  2 17 18 19 20 21 22  4  5  1 23  6 24
  4  5  1 25  6 26  3 27 28 29 30  1 31]


In [6]:
model = Sequential()

In [7]:
model.add(Embedding(vocab_size, 10, input_length=2)) # input_length: 한번에 두개 넣음



In [8]:
model.add(LSTM(50))

2024-06-10 14:19:11.929648: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-06-10 14:19:11.929682: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-06-10 14:19:11.929688: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-06-10 14:19:11.929705: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-06-10 14:19:11.929717: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
model.add(Dense(vocab_size, activation='softmax'))

In [10]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [11]:
model.summary()

In [12]:
model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500


2024-06-10 14:19:12.336761: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


2/2 - 4s - 2s/step - accuracy: 0.0000e+00 - loss: 3.4659
Epoch 2/500
2/2 - 0s - 29ms/step - accuracy: 0.1351 - loss: 3.4642
Epoch 3/500
2/2 - 0s - 20ms/step - accuracy: 0.0811 - loss: 3.4627
Epoch 4/500
2/2 - 0s - 19ms/step - accuracy: 0.1081 - loss: 3.4615
Epoch 5/500
2/2 - 0s - 19ms/step - accuracy: 0.1081 - loss: 3.4602
Epoch 6/500
2/2 - 0s - 19ms/step - accuracy: 0.1081 - loss: 3.4589
Epoch 7/500
2/2 - 0s - 20ms/step - accuracy: 0.1081 - loss: 3.4576
Epoch 8/500
2/2 - 0s - 18ms/step - accuracy: 0.1081 - loss: 3.4564
Epoch 9/500
2/2 - 0s - 16ms/step - accuracy: 0.1081 - loss: 3.4552
Epoch 10/500
2/2 - 0s - 16ms/step - accuracy: 0.1081 - loss: 3.4540
Epoch 11/500
2/2 - 0s - 16ms/step - accuracy: 0.1081 - loss: 3.4527
Epoch 12/500
2/2 - 0s - 15ms/step - accuracy: 0.1081 - loss: 3.4514
Epoch 13/500
2/2 - 0s - 17ms/step - accuracy: 0.1081 - loss: 3.4500
Epoch 14/500
2/2 - 0s - 15ms/step - accuracy: 0.1081 - loss: 3.4486
Epoch 15/500
2/2 - 0s - 15ms/step - accuracy: 0.1081 - loss: 3.4472

<keras.src.callbacks.history.History at 0x1743a1b10>

## 학습 결과 테스트

In [13]:
test_text = 'Wait'
encoded = tokenizer.texts_to_sequences([test_text])[0]
encoded = np.array([encoded])
print(encoded.shape)

(1, 1)


In [14]:
onehot_output = model.predict(encoded)
print(onehot_output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[[6.0479151e-04 2.1110310e-01 1.6240206e-02 4.5024846e-02 1.3121497e-02
  3.5725251e-01 1.7285332e-03 9.2107488e-04 1.0363779e-03 2.1331990e-02
  9.6545398e-04 1.4459365e-03 3.4126136e-04 3.4049188e-04 1.2947497e-02
  4.6675313e-02 3.2152873e-03 4.6650615e-02 2.0576327e-03 2.1725596e-04
  8.0379117e-03 2.4117477e-02 2.0786755e-02 5.5291271e-03 4.1799182e-03
  7.0296782e-03 4.9430557e-04 3.4071181e-02 2.7841814e-03 3.2404128e-02
  7.6183572e-02 1.1600822e-03]]


In [15]:
output = np.argmax(onehot_output)
print(output)

5


In [16]:
for word, index in tokenizer.word_index.items():
    if index == output:
        print(f'{test_text} => {word}')

Wait => till
