In [13]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

# Load the data: download the Tiny Shakespeare corpus into the working directory.
urllib.request.urlretrieve("https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", filename="shake.txt")

# Read the file as raw bytes, one line at a time. The `with` block guarantees
# the handle is closed even if cleaning a line raises.
sentences = []
with open('shake.txt', 'rb') as f:
    for raw_line in f:
        cleaned = raw_line.strip()  # drop surrounding whitespace such as \r and \n
        cleaned = cleaned.lower()  # lowercase
        cleaned = cleaned.decode('ascii', 'ignore')  # discard non-ASCII bytes such as \xe2\x80\x99
        if len(cleaned) > 0:  # skip lines that end up empty
            sentences.append(cleaned)

In [14]:
# Display the first five cleaned lines as a quick sanity check of the loading step.
sentences[:5]

['first citizen:',
 'before we proceed any further, hear me speak.',
 'all:',
 'speak, speak.',
 'first citizen:']

In [15]:
# Flatten the corpus into one long string, with a single space between lines.
total_data = ' '.join(sentences)

# Report the length of that string, i.e. the total number of characters.
n_chars = len(total_data)
print('문자열의 길이 또는 총 문자의 개수: %d' % n_chars)

문자열의 길이 또는 총 문자의 개수: 1108166


In [16]:
# Show the first 200 characters of the joined corpus.
print(total_data[:200])

first citizen: before we proceed any further, hear me speak. all: speak, speak. first citizen: you are all resolved rather to die than to famish? all: resolved. resolved. first citizen: first, you kno


In [17]:
# Build the character vocabulary: every distinct character of the corpus, sorted.
unique_chars = set(total_data)
char_vocab = sorted(unique_chars)

# The vocabulary size is the number of distinct characters.
vocab_size = len(char_vocab)
size_report = '문자 집합의 크기 : {}'.format(vocab_size)
print(size_report)

문자 집합의 크기 : 38


In [18]:
# Give every character a unique integer id: its position in the sorted vocabulary.
char_to_index = {char: index for index, char in enumerate(char_vocab)}
print('문자 집합 :', char_to_index)

문자 집합 : {' ': 0, '!': 1, '$': 2, '&': 3, "'": 4, ',': 5, '-': 6, '.': 7, '3': 8, ':': 9, ';': 10, '?': 11, 'a': 12, 'b': 13, 'c': 14, 'd': 15, 'e': 16, 'f': 17, 'g': 18, 'h': 19, 'i': 20, 'j': 21, 'k': 22, 'l': 23, 'm': 24, 'n': 25, 'o': 26, 'p': 27, 'q': 28, 'r': 29, 's': 30, 't': 31, 'u': 32, 'v': 33, 'w': 34, 'x': 35, 'y': 36, 'z': 37}


In [19]:
# Inverse lookup table: integer id -> character, used to decode ids back to text.
index_to_char = {index: char for char, index in char_to_index.items()}

In [20]:
seq_length = 60

# Number of samples the preprocessing will produce: the corpus length (minus one
# character) divided by seq_length, rounded down to a whole number.
usable_chars = len(total_data) - 1
n_samples = usable_chars // seq_length
print ('샘플의 수 : {}'.format(n_samples))

샘플의 수 : 18469


In [21]:
train_X, train_y = [], []

# Walk the corpus in non-overlapping windows: 0:60 -> 60:120 -> 120:180 -> ...
for start in range(0, n_samples * seq_length, seq_length):
    stop = start + seq_length

    # Input: the window itself, integer-encoded.
    train_X.append([char_to_index[ch] for ch in total_data[start:stop]])

    # Target: the same window shifted one character to the right, integer-encoded.
    train_y.append([char_to_index[ch] for ch in total_data[start + 1:stop + 1]])

In [22]:
# Look at the first input/target pair, first as integer ids and then decoded
# back to characters through index_to_char.
first_X = train_X[0]
first_y = train_y[0]

print('X 데이터의 첫번째 샘플 :', first_X)
print('y 데이터의 첫번째 샘플 :', first_y)
print('-' * 50)
print('X 데이터의 첫번째 샘플 디코딩 :', [index_to_char[idx] for idx in first_X])
print('y 데이터의 첫번째 샘플 디코딩 :', [index_to_char[idx] for idx in first_y])

X 데이터의 첫번째 샘플 : [17, 20, 29, 30, 31, 0, 14, 20, 31, 20, 37, 16, 25, 9, 0, 13, 16, 17, 26, 29, 16, 0, 34, 16, 0, 27, 29, 26, 14, 16, 16, 15, 0, 12, 25, 36, 0, 17, 32, 29, 31, 19, 16, 29, 5, 0, 19, 16, 12, 29, 0, 24, 16, 0, 30, 27, 16, 12, 22, 7]
y 데이터의 첫번째 샘플 : [20, 29, 30, 31, 0, 14, 20, 31, 20, 37, 16, 25, 9, 0, 13, 16, 17, 26, 29, 16, 0, 34, 16, 0, 27, 29, 26, 14, 16, 16, 15, 0, 12, 25, 36, 0, 17, 32, 29, 31, 19, 16, 29, 5, 0, 19, 16, 12, 29, 0, 24, 16, 0, 30, 27, 16, 12, 22, 7, 0]
--------------------------------------------------
X 데이터의 첫번째 샘플 디코딩 : ['f', 'i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' ', 'p', 'r', 'o', 'c', 'e', 'e', 'd', ' ', 'a', 'n', 'y', ' ', 'f', 'u', 'r', 't', 'h', 'e', 'r', ',', ' ', 'h', 'e', 'a', 'r', ' ', 'm', 'e', ' ', 's', 'p', 'e', 'a', 'k', '.']
y 데이터의 첫번째 샘플 디코딩 : ['i', 'r', 's', 't', ' ', 'c', 'i', 't', 'i', 'z', 'e', 'n', ':', ' ', 'b', 'e', 'f', 'o', 'r', 'e', ' ', 'w', 'e', ' '

In [23]:
# Print the second input sample and its target (integer ids) for comparison.
print(train_X[1])
print(train_y[1])

[0, 12, 23, 23, 9, 0, 30, 27, 16, 12, 22, 5, 0, 30, 27, 16, 12, 22, 7, 0, 17, 20, 29, 30, 31, 0, 14, 20, 31, 20, 37, 16, 25, 9, 0, 36, 26, 32, 0, 12, 29, 16, 0, 12, 23, 23, 0, 29, 16, 30, 26, 23, 33, 16, 15, 0, 29, 12, 31, 19]
[12, 23, 23, 9, 0, 30, 27, 16, 12, 22, 5, 0, 30, 27, 16, 12, 22, 7, 0, 17, 20, 29, 30, 31, 0, 14, 20, 31, 20, 37, 16, 25, 9, 0, 36, 26, 32, 0, 12, 29, 16, 0, 12, 23, 23, 0, 29, 16, 30, 26, 23, 33, 16, 15, 0, 29, 12, 31, 19, 16]


In [24]:
# One-hot encode the integer sequences.
# num_classes is passed explicitly: otherwise to_categorical sizes the last axis
# from the largest id it happens to see, so train_X and train_y could end up with
# different widths, or a width other than vocab_size, which sizes the model's
# output layer.
# NOTE: this cell rebinds train_X / train_y, so re-running it without first
# re-running the cell that builds the integer sequences would encode data that
# is already one-hot encoded.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)

print('train_X의 크기(shape) : {}'.format(train_X.shape)) # one-hot encoded
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # one-hot encoded

train_X의 크기(shape) : (18469, 60, 38)
train_y의 크기(shape) : (18469, 60, 38)


In [43]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

hidden_units = 256
n_features = train_X.shape[2]  # size of the last (one-hot) axis of the inputs

# Two stacked LSTM layers that both return their full output sequence, followed
# by a softmax over the vocabulary applied at every time step. The time
# dimension of the input is left as None.
model = Sequential([
    LSTM(hidden_units, input_shape=(None, n_features), return_sequences=True),
    LSTM(hidden_units, return_sequences=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(train_X, train_y, epochs=100, verbose=2)

Epoch 1/100
578/578 - 9s - loss: 2.3847 - accuracy: 0.3217 - 9s/epoch - 16ms/step
Epoch 2/100
578/578 - 5s - loss: 1.8773 - accuracy: 0.4412 - 5s/epoch - 9ms/step
Epoch 3/100
578/578 - 5s - loss: 1.6895 - accuracy: 0.4923 - 5s/epoch - 9ms/step
Epoch 4/100
578/578 - 5s - loss: 1.5756 - accuracy: 0.5230 - 5s/epoch - 9ms/step
Epoch 5/100
578/578 - 5s - loss: 1.5014 - accuracy: 0.5423 - 5s/epoch - 9ms/step
Epoch 6/100
578/578 - 5s - loss: 1.4481 - accuracy: 0.5556 - 5s/epoch - 9ms/step
Epoch 7/100
578/578 - 5s - loss: 1.4084 - accuracy: 0.5658 - 5s/epoch - 9ms/step
Epoch 8/100
578/578 - 5s - loss: 1.3759 - accuracy: 0.5738 - 5s/epoch - 9ms/step
Epoch 9/100
578/578 - 5s - loss: 1.3490 - accuracy: 0.5806 - 5s/epoch - 9ms/step
Epoch 10/100
578/578 - 5s - loss: 1.3254 - accuracy: 0.5866 - 5s/epoch - 9ms/step
Epoch 11/100
578/578 - 5s - loss: 1.3044 - accuracy: 0.5927 - 5s/epoch - 8ms/step
Epoch 12/100
578/578 - 5s - loss: 1.2850 - accuracy: 0.5973 - 5s/epoch - 9ms/step
Epoch 13/100
578/578 - 5

<keras.src.callbacks.History at 0x7ce7416fed40>

In [52]:
def sentence_generation(model, length, start_index=None):
    """Generate text one character at a time by feeding predictions back in.

    A starting character is chosen (at random unless ``start_index`` is given).
    At every step the characters produced so far are passed to ``model`` as a
    one-hot sequence, and the highest-scoring class at the last time step
    becomes the next character. Characters are also echoed to stdout as they
    are fed to the model.

    Args:
        model: Object exposing ``predict(x, verbose=...)``. It is called with a
            one-hot array of shape ``(1, t, vocab_size)`` and the result is
            indexed as ``[0, -1]`` to get the scores for the next character.
        length: Number of prediction steps to run.
        start_index: Optional integer id of the first character. When ``None``
            (the default) the id is drawn with ``np.random.randint(vocab_size)``.

    Returns:
        str: The starting character followed by the ``length`` predicted
        characters, i.e. ``length + 1`` characters in total.

    Raises:
        KeyError: If ``start_index`` is not a key of ``index_to_char``.
    """
    # Choose the id of the first character: caller-supplied, or random.
    if start_index is None:
        start_index = np.random.randint(vocab_size)
    current = int(start_index)

    # Map the starting id to its character.
    y_char = [index_to_char[current]]
    print(current,'번 문자', y_char[-1],'로 예측을 시작!')

    # One-hot input buffer of shape (1, length, vocab_size); row i is set at step i.
    X = np.zeros((1, length, vocab_size))

    for i in range(length):
        # Append the latest character to the input sequence and echo it.
        X[0, i, current] = 1
        print(index_to_char[current], end="")

        # Run the model on the prefix built so far. Only the scores at the last
        # time step are needed. verbose=0 stops predict() from printing a
        # progress bar for every generated character.
        step_scores = np.asarray(model.predict(X[:, :i + 1, :], verbose=0))[0, -1]
        current = int(np.argmax(step_scores))
        y_char.append(index_to_char[current])

    # End the echoed line so that whatever the caller prints next starts on a
    # fresh line.
    if length > 0:
        print()
    return ('').join(y_char)


In [53]:
# Run 100 generation steps with the trained model and print the returned string.
result = sentence_generation(model, 100)
print(result)

28 번 문자 q 로 예측을 시작!
queen elizabeth: o thou hast surp acces on our manner i'll disease you speak of care, thou shalt awak
