<a href="https://colab.research.google.com/github/RogerHeederer/NLP_entry/blob/master/CharRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import urllib.request
from tensorflow.keras.utils import to_categorical

In [5]:
# Download the Project Gutenberg text file used as the training corpus.
urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename='11-0.txt')

lines = []
# Read the file as raw bytes so that non-ASCII byte sequences can be dropped
# explicitly below. The context manager guarantees the file is closed even if
# an exception is raised while processing a line.
with open('11-0.txt', 'rb') as f:
  for line in f:
    line = line.strip()  # remove surrounding whitespace, including \r and \n
    line = line.lower()  # lowercase everything
    line = line.decode('ascii', 'ignore')  # drop non-ASCII bytes such as \xe2\x80\x99
    if len(line) > 0:  # skip lines that are empty after cleaning
      lines.append(line)

In [6]:
lines[:5] # preview of the first five lines after the simple first-pass preprocessing

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  you may copy it, give it away or',
 're-use it under the terms of the project gutenberg license included',
 'with this ebook or online at www.gutenberg.org']

In [7]:
# Merge all preprocessed lines into a single string, separated by single spaces.
text = ' '.join(lines)
print(text[:110])  # preview the first 110 characters

the project gutenberg ebook of alices adventures in wonderland, by lewis carroll this ebook is for the use of 


In [8]:
# Report the total number of characters in the merged text.
print("전체 문자열의 길이 : %d" % len(text))

전체 문자열의 길이 : 159612


In [9]:
# Build the character vocabulary: every distinct character that occurs in the
# merged text, in sorted order so that the ordering is deterministic.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)
print('글자 집합의 크기 : {}'.format(vocab_size))

글자 집합의 크기 : 57


In [10]:
# Because the vocabulary is made of characters rather than words, vocab_size
# stays small even when the text itself is large.
# Map each character to its position in the sorted vocabulary.
char_to_index = {char: index for index, char in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, '!': 1, '"': 2, '#': 3, '$': 4, '%': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, '@': 27, '[': 28, ']': 29, '_': 30, 'a': 31, 'b': 32, 'c': 33, 'd': 34, 'e': 35, 'f': 36, 'g': 37, 'h': 38, 'i': 39, 'j': 40, 'k': 41, 'l': 42, 'm': 43, 'n': 44, 'o': 45, 'p': 46, 'q': 47, 'r': 48, 's': 49, 't': 50, 'u': 51, 'v': 52, 'w': 53, 'x': 54, 'y': 55, 'z': 56}


In [11]:
# Inverse lookup of char_to_index: integer index -> character.
index_to_char = {index: char for char, index in char_to_index.items()}

In [12]:
# Display the inverse mapping (index -> character) to verify it.
print(index_to_char)

{0: ' ', 1: '!', 2: '"', 3: '#', 4: '$', 5: '%', 6: "'", 7: '(', 8: ')', 9: '*', 10: ',', 11: '-', 12: '.', 13: '/', 14: '0', 15: '1', 16: '2', 17: '3', 18: '4', 19: '5', 20: '6', 21: '7', 22: '8', 23: '9', 24: ':', 25: ';', 26: '?', 27: '@', 28: '[', 29: ']', 30: '_', 31: 'a', 32: 'b', 33: 'c', 34: 'd', 35: 'e', 36: 'f', 37: 'g', 38: 'h', 39: 'i', 40: 'j', 41: 'k', 42: 'l', 43: 'm', 44: 'n', 45: 'o', 46: 'p', 47: 'q', 48: 'r', 49: 's', 50: 't', 51: 'u', 52: 'v', 53: 'w', 54: 'x', 55: 'y', 56: 'z'}


In [13]:
# Cut the full text into fixed-length chunks; each chunk becomes one training sample.
seq_length = 60  # number of characters per sample
# One character is held back (the "- 1") so that the target sequences, which
# are the inputs shifted one position ahead, never run past the end of the text.
n_samples = (len(text) - 1) // seq_length
print('문장 샘플의 수 : {}'.format(n_samples))

문장 샘플의 수 : 2660


In [14]:
# Integer-encoded inputs and targets; one inner list per sample.
train_X = []
train_y = []

# Walk the text in strides of seq_length: 0, 60, 120, ... (n_samples chunks in total).
for start in range(0, n_samples * seq_length, seq_length):
  stop = start + seq_length

  # Input: the characters of the current chunk, e.g. text[0:60], text[60:120], ...
  train_X.append([char_to_index[ch] for ch in text[start:stop]])

  # Target: the same window shifted one character ahead, e.g. text[1:61], text[61:121], ...
  train_y.append([char_to_index[ch] for ch in text[start + 1:stop + 1]])

In [15]:
# The first 60 characters of the text, i.e. the span that was encoded as the first input sample.
text[0:60]

'the project gutenberg ebook of alices adventures in wonderla'

In [16]:
# Show the first 60 encoded input samples (the slice is over samples, not characters).
print(train_X[0:60])

[[50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31], [44, 34, 10, 0, 32, 55, 0, 42, 35, 53, 39, 49, 0, 33, 31, 48, 48, 45, 42, 42, 0, 50, 38, 39, 49, 0, 35, 32, 45, 45, 41, 0, 39, 49, 0, 36, 45, 48, 0, 50, 38, 35, 0, 51, 49, 35, 0, 45, 36, 0, 31, 44, 55, 45, 44, 35, 0, 31, 44, 55], [53, 38, 35, 48, 35, 0, 31, 50, 0, 44, 45, 0, 33, 45, 49, 50, 0, 31, 44, 34, 0, 53, 39, 50, 38, 0, 31, 42, 43, 45, 49, 50, 0, 44, 45, 0, 48, 35, 49, 50, 48, 39, 33, 50, 39, 45, 44, 49, 0, 53, 38, 31, 50, 49, 45, 35, 52, 35, 48, 12], [0, 0, 55, 45, 51, 0, 43, 31, 55, 0, 33, 45, 46, 55, 0, 39, 50, 10, 0, 37, 39, 52, 35, 0, 39, 50, 0, 31, 53, 31, 55, 0, 45, 48, 0, 48, 35, 11, 51, 49, 35, 0, 39, 50, 0, 51, 44, 34, 35, 48, 0, 50, 38, 35, 0, 50, 35, 48, 43, 49], [0, 45, 36, 0, 50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 3

In [17]:
# Spot-check the indices assigned to 't', 'h' and 'e' against the encoded samples.
char_to_index['t'], char_to_index['h'], char_to_index['e']

(50, 38, 35)

In [18]:
# Show the first 60 encoded target samples; each is the matching input shifted one character ahead.
print(train_y[0:60])

[[38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 44, 32, 35, 48, 37, 0, 35, 32, 45, 45, 41, 0, 45, 36, 0, 31, 42, 39, 33, 35, 49, 0, 31, 34, 52, 35, 44, 50, 51, 48, 35, 49, 0, 39, 44, 0, 53, 45, 44, 34, 35, 48, 42, 31, 44], [34, 10, 0, 32, 55, 0, 42, 35, 53, 39, 49, 0, 33, 31, 48, 48, 45, 42, 42, 0, 50, 38, 39, 49, 0, 35, 32, 45, 45, 41, 0, 39, 49, 0, 36, 45, 48, 0, 50, 38, 35, 0, 51, 49, 35, 0, 45, 36, 0, 31, 44, 55, 45, 44, 35, 0, 31, 44, 55, 53], [38, 35, 48, 35, 0, 31, 50, 0, 44, 45, 0, 33, 45, 49, 50, 0, 31, 44, 34, 0, 53, 39, 50, 38, 0, 31, 42, 43, 45, 49, 50, 0, 44, 45, 0, 48, 35, 49, 50, 48, 39, 33, 50, 39, 45, 44, 49, 0, 53, 38, 31, 50, 49, 45, 35, 52, 35, 48, 12, 0], [0, 55, 45, 51, 0, 43, 31, 55, 0, 33, 45, 46, 55, 0, 39, 50, 10, 0, 37, 39, 52, 35, 0, 39, 50, 0, 31, 53, 31, 55, 0, 45, 48, 0, 48, 35, 11, 51, 49, 35, 0, 39, 50, 0, 51, 44, 34, 35, 48, 0, 50, 38, 35, 0, 50, 35, 48, 43, 49, 0], [45, 36, 0, 50, 38, 35, 0, 46, 48, 45, 40, 35, 33, 50, 0, 37, 51, 50, 35, 4

In [19]:
# A character-level RNN does not pass its input through an embedding layer,
# so both the inputs (train_X) and the targets (train_y) are one-hot encoded.
#
# num_classes is pinned to vocab_size: by default to_categorical infers the
# number of classes from the largest index it sees, which would produce a
# one-hot width different from the model's Dense(vocab_size) output if the
# highest-index character happened not to occur in the sampled sequences.
train_X = to_categorical(train_X, num_classes=vocab_size)
train_y = to_categorical(train_y, num_classes=vocab_size)
print('train_X의 크기(shape) : {}'.format(train_X.shape)) # one-hot encoded inputs
print('train_y의 크기(shape) : {}'.format(train_y.shape)) # one-hot encoded targets

# Resulting shape: (number of samples, sequence length, vocab_size).

train_X의 크기(shape) : (2660, 60, 57)
train_y의 크기(shape) : (2660, 60, 57)


In [20]:
# Size of the last (one-hot) axis of train_X.
train_X.shape[2]

57

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [22]:
# Two stacked LSTM layers that emit an output at every timestep, followed by a
# per-timestep softmax over the character vocabulary.
model = Sequential([
  # The time dimension is None so that sequences of any length are accepted.
  LSTM(256, input_shape=(None, train_X.shape[2]), return_sequences=True),
  LSTM(256, return_sequences=True),
  TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [23]:
# Categorical cross-entropy matches the one-hot encoded targets; accuracy is
# tracked as an additional metric.
model.compile(
  optimizer='adam',
  loss='categorical_crossentropy',
  metrics=['accuracy'],
)
# verbose=2 logs one summary line per epoch.
model.fit(
  train_X,
  train_y,
  epochs=80,
  verbose=2,
)

Epoch 1/80
84/84 - 3s - loss: 3.0763 - accuracy: 0.1812
Epoch 2/80
84/84 - 3s - loss: 2.7800 - accuracy: 0.2341
Epoch 3/80
84/84 - 3s - loss: 2.4051 - accuracy: 0.3258
Epoch 4/80
84/84 - 3s - loss: 2.2489 - accuracy: 0.3614
Epoch 5/80
84/84 - 3s - loss: 2.1367 - accuracy: 0.3883
Epoch 6/80
84/84 - 3s - loss: 2.0514 - accuracy: 0.4095
Epoch 7/80
84/84 - 3s - loss: 1.9770 - accuracy: 0.4294
Epoch 8/80
84/84 - 3s - loss: 1.9176 - accuracy: 0.4450
Epoch 9/80
84/84 - 3s - loss: 1.8654 - accuracy: 0.4588
Epoch 10/80
84/84 - 3s - loss: 1.8158 - accuracy: 0.4730
Epoch 11/80
84/84 - 3s - loss: 1.7690 - accuracy: 0.4862
Epoch 12/80
84/84 - 3s - loss: 1.7316 - accuracy: 0.4955
Epoch 13/80
84/84 - 3s - loss: 1.6920 - accuracy: 0.5065
Epoch 14/80
84/84 - 3s - loss: 1.6544 - accuracy: 0.5177
Epoch 15/80
84/84 - 3s - loss: 1.6192 - accuracy: 0.5265
Epoch 16/80
84/84 - 3s - loss: 1.5832 - accuracy: 0.5356
Epoch 17/80
84/84 - 3s - loss: 1.5518 - accuracy: 0.5442
Epoch 18/80
84/84 - 3s - loss: 1.5197 - 

<tensorflow.python.keras.callbacks.History at 0x7f2819963fd0>

In [24]:
def sentence_generation(model, length):
  """Generate text greedily, one character at a time, from a random seed character.

  A random vocabulary index is drawn as the starting character. At each of the
  `length` steps the current character is one-hot encoded into the next input
  position, the model is run on the input prefix built so far, and the most
  probable character at the last timestep becomes the next character.

  Relies on the module-level `vocab_size` and `index_to_char`.

  Args:
    model: object whose `predict` accepts an array of shape
      (1, t, vocab_size) and is expected to return an array whose first
      element is 2-D with one row of scores per timestep.
    length: number of prediction steps to run.

  Returns:
    A string of `length + 1` characters: the seed character followed by the
    `length` predicted characters. The final predicted character is included
    in the return value but is never printed or fed back into the model.

  Side effects:
    Prints the seed character, a start message, and every character as it is
    fed to the model.
  """
  current_index = np.random.randint(vocab_size)  # random starting index
  generated = [index_to_char[current_index]]  # seed character for that index
  print(generated, '\n')
  print(current_index, '번 글자', generated[-1], '로 예측을 시작')

  # One-hot input buffer of shape (1, length, vocab_size), filled in step by step.
  one_hot_inputs = np.zeros((1, length, vocab_size))

  for step in range(length):
    # Feed the most recent character in as the input at this timestep.
    one_hot_inputs[0, step, current_index] = 1
    print(index_to_char[current_index], end="")

    # Predict on the prefix built so far and keep the argmax of the last timestep.
    step_scores = model.predict(one_hot_inputs[:, :step + 1, :])[0]
    current_index = np.argmax(step_scores, 1)[-1]
    generated.append(index_to_char[current_index])

  return ''.join(generated)

In [25]:
# Generate text with the trained model: 100 prediction steps from a random starting character.
sentence_generation(model, 100)

['-'] 

11 번 글자 - 로 예측을 시작
-apple, roast turkey, toffee, and houroughaly provid tuxte. cunsion to herself off the sort, said al

'-apple, roast turkey, toffee, and houroughaly provid tuxte. cunsion to herself off the sort, said ali'