In [3]:
import os
from keras.utils import pad_sequences
import keras
import tensorflow as tf
import numpy as np

In [4]:
# Fetch the Shakespeare corpus through Keras' get_file helper and keep the
# returned path; the next cell opens that path to read the text.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin="https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

In [5]:
# Read the whole corpus as bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving an open handle behind for the garbage collector.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [6]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
# Lookup tables in both directions: character -> id and id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

def text_to_int(text):
    """Encode a string as a NumPy array of character ids.

    Each character is looked up in the module-level ``char2idx`` table, so a
    character missing from that table raises ``KeyError``.
    """
    encoded = list(map(char2idx.__getitem__, text))
    return np.array(encoded)

# Encode the entire corpus as an array of character ids.
text_as_int = text_to_int(text)

In [7]:
def int_to_text(ints):
    """Decode character ids back into a string.

    Args:
        ints: Integer ids used to index the module-level ``idx2char`` array.
            May be an object exposing ``.numpy()`` (such as an eager tensor),
            a NumPy array, or a plain sequence of ints.

    Returns:
        The characters selected from ``idx2char``, joined into one string.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # Plain arrays and lists have no .numpy(); use them as they are.
        # Only this expected case is ignored, so real conversion failures
        # (and KeyboardInterrupt) are no longer silently swallowed.
        pass
    return ''.join(idx2char[ints])
    

In [10]:
# Length of each input sequence. Example with length 4: the chunk "Hello"
# yields input "Hell" and target "ello" (input + target share one chunk).
seq_length = 100
# Each training example needs seq_length + 1 characters (101 here), so this is
# how many examples fit into one pass over the text.
examples_per_epoch = len(text) // (seq_length + 1)

# Slice the encoded array into a dataset that yields one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Peek at the first element only. list(char_dataset)[0] would build a Python
# list holding every element of the dataset just to read one of them.
print(next(iter(char_dataset)))

tf.Tensor(18, shape=(), dtype=int64)


In [51]:
# Group consecutive character ids into chunks of seq_length + 1 (the input
# plus one extra id for the shifted target); a trailing partial chunk is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

In [52]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "Hello" -> ("Hell", "ello").
    """
    return chunk[:-1], chunk[1:]

# Convert every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [53]:
# Model / training hyperparameters.
VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 256
RNN_UNITS = 1024
BATCH_SIZE = 64

# Size of the buffer used when shuffling the examples.
BUFFER_SIZE = 10000

# Shuffling is done for training here; later, when the model is used on a
# sentence, batch size = 1 and no shuffling are used instead.
data = (
    dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)

In [54]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense character model.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size baked into the model's input shape.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    # The sequence axis is None because the length of the text that will be
    # fed to the model later is not known in advance.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation is set: the layer emits one raw score per vocabulary entry.
    scores = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, scores])

# Build the training model with the configured hyperparameters and print
# its layer summary.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (64, None, 256)           16640     
                                                                 
 lstm_2 (LSTM)               (64, None, 1024)          5246976   
                                                                 
 dense_2 (Dense)             (64, None, 65)            66625     
                                                                 
Total params: 5,330,241
Trainable params: 5,330,241
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Run the model on a single batch to inspect the shape of its output.
# The loop variables and the prediction stay defined after the loop and are
# reused by the cells below.
for input_example_batch, target_example_batch in data.take(1):
    example_batch_prediction = model(input_example_batch)
    print(example_batch_prediction.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 65) # (batch_size, sequence_length, vocab_size)


In [65]:
# Number of sequences in the batch (leading dimension), then the full
# prediction tensor.
print(len(example_batch_prediction), example_batch_prediction, sep="\n")

64
tf.Tensor(
[[[-1.78233965e-03  1.50419958e-03 -3.88172455e-04 ...  3.15168081e-03
   -6.03858987e-03  2.92667607e-03]
  [-1.03026943e-03 -8.37136758e-05  3.00721498e-04 ... -3.71807930e-03
   -2.27467157e-03 -1.72761700e-03]
  [-3.64894513e-03 -2.04798672e-03  6.75425772e-03 ... -2.57171644e-03
   -4.10743244e-03  9.11758339e-04]
  ...
  [-6.27422472e-03 -3.83061310e-03  5.94043639e-03 ... -1.54265091e-02
   -1.17436759e-02 -3.76639003e-03]
  [-6.37861015e-03 -4.26725578e-03  3.98376957e-03 ... -1.12548089e-02
   -1.43368207e-02 -5.46216127e-03]
  [-5.31135732e-03  5.80312964e-03 -2.46041454e-05 ... -8.52638204e-03
   -1.52471503e-02 -4.28096205e-03]]

 [[ 1.53944886e-04 -7.89211597e-04  1.26124639e-03 ... -6.13791170e-03
    2.89706048e-03 -3.71359335e-03]
  [-2.72703427e-03 -1.41478702e-03  3.38712684e-03 ... -6.64508017e-03
    7.77630601e-04 -6.99822977e-03]
  [-4.55133291e-03  2.61627021e-03  7.93362502e-03 ... -4.97110514e-03
    1.27438060e-03 -1.91380095e-03]
  ...
  [-2.760

In [64]:
# Predictions for the 100 time steps of the first sequence in the batch.
pred = example_batch_prediction[0]
print(len(pred), pred, sep="\n")

100
tf.Tensor(
[[-1.7823396e-03  1.5041996e-03 -3.8817246e-04 ...  3.1516808e-03
  -6.0385899e-03  2.9266761e-03]
 [-1.0302694e-03 -8.3713676e-05  3.0072150e-04 ... -3.7180793e-03
  -2.2746716e-03 -1.7276170e-03]
 [-3.6489451e-03 -2.0479867e-03  6.7542577e-03 ... -2.5717164e-03
  -4.1074324e-03  9.1175834e-04]
 ...
 [-6.2742247e-03 -3.8306131e-03  5.9404364e-03 ... -1.5426509e-02
  -1.1743676e-02 -3.7663900e-03]
 [-6.3786102e-03 -4.2672558e-03  3.9837696e-03 ... -1.1254809e-02
  -1.4336821e-02 -5.4621613e-03]
 [-5.3113573e-03  5.8031296e-03 -2.4604145e-05 ... -8.5263820e-03
  -1.5247150e-02 -4.2809620e-03]], shape=(100, 65), dtype=float32)


In [63]:
# Scores over the 65 characters at the first time step.
time_step = pred[0]
print(len(time_step), time_step, sep="\n")

65
tf.Tensor(
[-1.7823396e-03  1.5041996e-03 -3.8817246e-04 -3.9265936e-04
  3.1650728e-03 -4.0468574e-03 -1.7952236e-03  2.4061592e-04
 -1.9356509e-03 -1.1999721e-03 -9.2524663e-04  8.0917799e-04
  2.6031784e-03 -8.2836353e-04  3.1787497e-03  5.3058388e-03
 -3.4365058e-03  1.8296164e-03  5.9658079e-05 -2.4150172e-04
  5.1288912e-03 -1.0444942e-03  6.4438297e-03 -2.3077161e-03
 -2.5550909e-03  2.7100041e-03  7.8152516e-04 -2.5708636e-03
 -3.7814374e-04 -4.7471980e-03 -1.7017433e-03  7.2944211e-04
 -4.5172847e-04  2.3371023e-03  7.4250055e-03 -2.2213510e-03
 -2.5738417e-03 -4.9755317e-03 -2.1055851e-03 -1.3521982e-03
 -3.8657421e-03  3.0829967e-03  3.6945543e-03  2.2579385e-03
 -3.4161941e-03  1.0992354e-04  2.4486880e-04 -4.3441178e-03
  1.5874807e-03 -4.4112979e-03  3.6623906e-03  4.0790723e-03
  4.1058445e-03 -1.6618724e-04 -3.6860721e-03  4.5791303e-04
  3.7779808e-03  2.1755155e-03  5.1029515e-04  1.7979462e-04
  4.8551438e-03 -2.0518475e-03  3.1516808e-03 -6.0385899e-03
  2.926676

In [66]:
# Draw one character id per time step from the scores of the first sequence.
sample_indices = tf.random.categorical(pred, num_samples=1)

# Flatten the one-sample-per-row result into a 1-D sequence of ids, then decode.
sample_indices = np.reshape(sample_indices, -1)
predicted_chars = int_to_text(sample_indices)

predicted_chars

'mYUXTUc;uvW cyMMlaViluUQs.$on!dpPPtH GRCma!\nvxSVqwnCUedG:?dJmOFd?-JK&ycNzdlY\neTpZdrANZNrRwza&BDXTppJ'

In [71]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and model outputs.

    ``from_logits=True`` tells Keras that ``logits`` are raw, unnormalized
    scores rather than probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

In [72]:
# Configure training: Adam optimizer with the cross-entropy loss defined above.
model.compile(loss=loss, optimizer="adam")

In [73]:
# Directory that receives the checkpoints.
checkpoint_dir = './training_checkpoints'
# File name template; "{epoch}" is left as a placeholder in the path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights to the templated path.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [None]:
# Train for 40 epochs, saving weights through the checkpoint callback.
history = model.fit(
    x=data,
    epochs=40,
    callbacks=[checkpoint_callback],
)

In [None]:
# Rebuild the same architecture with a batch size of 1 for text generation.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=1,
)

In [None]:
# Restore the most recent checkpoint from checkpoint_dir into the model.
# Fixes: the API is tf.train.latest_checkpoint (it was misspelled "lastest"),
# and the model.build(...) call was missing its closing parenthesis.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    # latest_checkpoint returns None when nothing has been saved yet; fail
    # with a clear message instead of passing None to load_weights.
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model first."
    )
model.load_weights(latest_checkpoint)
# Input shape: batch size 1, sequence length left unspecified.
model.build(tf.TensorShape([1, None]))

In [1]:
def generate_text(model, start_string, num_generate=800, temperature=1.0):
    """Generate text one character at a time, starting from a seed string.

    Args:
        model: Callable model that maps a batch of character ids with a
            leading batch axis of size 1 to per-position scores, and that
            provides ``reset_states()``.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level ``char2idx`` table.
        num_generate: Number of characters to generate (default 800).
        temperature: Positive value the scores are divided by before
            sampling. Lower values follow the model's distribution more
            closely; higher values make the output more random.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Encode the seed and add a batch axis: [1, 2, 3] -> [[1, 2, 3]]
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    # Start from a clean recurrent state on every call.
    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis: [[1, 2, 3]] -> [1, 2, 3]
        # (this line previously referenced an undefined name, `prediction`).
        predictions = tf.squeeze(predictions, 0)

        # Scale the scores, then sample the next id from the categorical
        # distribution at the last position.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Feed the sampled *id* back as the next input. The original passed
        # the decoded character here, which is not a valid model input.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [76]:
# Ask for a seed string and print the text generated from it.
# Fix: the function is named generate_text and takes the model as its first
# argument; the previous call used an undefined name and omitted the model.
inp = input("input starting string: ")
print(generate_text(model, inp))