In [1]:
import tensorflow as tf
import numpy as np
import os
import time
import urllib.request
import re




In [3]:
# Download the plain-text book from Project Gutenberg and normalise it into
# the character stream the model is trained on.
url = "https://www.gutenberg.org/cache/epub/6130/pg6130.txt"
# The context manager closes the HTTP response once the body has been read
# (the response object was previously left open).
with urllib.request.urlopen(url) as file:
    text = ''.join(line.decode('utf-8') for line in file)
# Collapse runs of spaces into a single space.
text = re.sub(' +', ' ', text)
# Keep only ASCII letters, '.', ',', '!', carriage returns and spaces;
# everything else (digits, '\n', other punctuation) is removed.
text = re.sub(r'[^A-Za-z.,!\r ]+', '', text)
# Drop the first 1150 cleaned characters -- presumably the Gutenberg header;
# TODO confirm the offset if the upstream file changes.
text = text[1150:]
text[:200]

'X.\r BOOK XXI.\r BOOK XXII.\r BOOK XXIII.\r BOOK XXIV.\r\r CONCLUDING NOTE.\r\r\r\r\rIllustrations\r\r HOMER INVOKING THE MUSE\r MARS\r MINERVA REPRESSING THE FURY OF ACHILLES\r THE DEPARTURE OF BRISEIS FROM THE TENT'

In [4]:
# The vocabulary is every distinct character of the cleaned text, in sorted order.
vocab = list(set(text))
vocab.sort()
str.join(",", vocab)

'\r, ,!,,,.,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z'

In [5]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, as an array so it can be fancy-indexed.
idx2char = np.array(vocab)

# The whole corpus encoded as integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])
# Preview the first (up to) 20 mappings.
[f"{char} = {idx}" for char, idx in list(char2idx.items())[:20]]

['\r = 0',
 '  = 1',
 '! = 2',
 ', = 3',
 '. = 4',
 'A = 5',
 'B = 6',
 'C = 7',
 'D = 8',
 'E = 9',
 'F = 10',
 'G = 11',
 'H = 12',
 'I = 13',
 'J = 14',
 'K = 15',
 'L = 16',
 'M = 17',
 'N = 18',
 'O = 19']

In [6]:
# Number of input characters per training example; each chunk holds one
# extra character so that input and target can be offset by one.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)
# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Decode the first five ids back to characters as a sanity check.
[idx2char[char_id.numpy()] for char_id in char_dataset.take(5)]

['X', '.', '\r', ' ', 'B']

In [7]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that is too short is discarded.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)
# Show the first five chunks decoded back to text.
[repr(''.join(idx2char[chunk.numpy()])) for chunk in sequences.take(5)]

["'X.\\r BOOK XXI.\\r BOOK XXII.\\r BOOK XXIII.\\r BOOK XXIV.\\r\\r CONCLUDING NOTE.\\r\\r\\r\\r\\rIllustrations\\r\\r HOMER INVOK'",
 "'ING THE MUSE\\r MARS\\r MINERVA REPRESSING THE FURY OF ACHILLES\\r THE DEPARTURE OF BRISEIS FROM THE TENT O'",
 "'F ACHILLES\\r THETIS CALLING BRIAREUS TO THE ASSISTANCE OF JUPITER\\r THETIS ENTREATING JUPITER TO HONOUR'",
 "' ACHILLES\\r VULCAN\\r JUPITER\\r THE APOTHEOSIS OF HOMER\\r JUPITER SENDING THE EVIL DREAM TO AGAMEMNON\\r NEP'",
 "'TUNE\\r VENUS, DISGUISED, INVITING HELEN TO THE CHAMBER OF PARIS\\r VENUS PRESENTING HELEN TO PARIS\\r VENU'"]

In [8]:
@tf.autograph.experimental.do_not_convert
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i].
    """
    return chunk[:-1], chunk[1:]

# Apply the split to every chunk.
dataset = sequences.map(map_func=split_input_target)

In [9]:
# Decode and print the first (input, target) pair to verify the one-step offset.
for input_example, target_example in dataset.take(1):
  decoded_input = ''.join(idx2char[input_example.numpy()])
  decoded_target = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(decoded_input))
  print ('Target data:', repr(decoded_target))

Input data:  'X.\r BOOK XXI.\r BOOK XXII.\r BOOK XXIII.\r BOOK XXIV.\r\r CONCLUDING NOTE.\r\r\r\r\rIllustrations\r\r HOMER INVO'
Target data: '.\r BOOK XXI.\r BOOK XXII.\r BOOK XXIII.\r BOOK XXIV.\r\r CONCLUDING NOTE.\r\r\r\r\rIllustrations\r\r HOMER INVOK'


In [10]:
# Walk through the first five time steps of the example pair, printing each
# input id with the id the model is expected to predict for it.
first_steps = zip(input_example[:5], target_example[:5])
for step, (input_idx, target_idx) in enumerate(first_steps):
    input_char = repr(idx2char[input_idx])
    target_char = repr(idx2char[target_idx])
    print("Step {:4d}".format(step))
    print("  input: {} ({:s})".format(input_idx, input_char))
    print("  expected output: {} ({:s})".format(target_idx, target_char))

Step    0
  input: 28 ('X')
  expected output: 4 ('.')
Step    1
  input: 4 ('.')
  expected output: 0 ('\r')
Step    2
  input: 0 ('\r')
  expected output: 1 (' ')
Step    3
  input: 1 (' ')
  expected output: 6 ('B')
Step    4
  input: 6 ('B')
  expected output: 19 ('O')


In [11]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 128
# Size of the shuffle buffer (number of pairs held in memory for shuffling).
BUFFER_SIZE = 10000
# Build the training dataset from `sequences` rather than from `dataset`
# itself: the previous `dataset = dataset.shuffle(...).batch(...)` form
# re-batched already-batched data whenever this cell was run twice.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

# Model hyperparameters.
vocab_size = len(vocab)   # number of distinct characters
embedding_dim = 256       # output size of the embedding layer
rnn_units = 1024          # units in the first GRU layer
rnn_units_2 = 512         # units in the second GRU layer

In [12]:
# Training network: embedding -> two stacked stateful GRUs -> per-step logits
# over the vocabulary. The batch dimension is fixed to BATCH_SIZE because the
# GRU layers are stateful.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    batch_input_shape=[BATCH_SIZE, None]))
model.add(tf.keras.layers.GRU(
    units=rnn_units,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform'))
model.add(tf.keras.layers.GRU(
    units=rnn_units_2,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform'))
model.add(tf.keras.layers.Dense(units=vocab_size))
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (128, None, 256)          14592     
                                                                 
 gru (GRU)                   (128, None, 1024)         3938304   
                                                                 
 gru_1 (GRU)                 (128, None, 512)          2362368   
                                                                 
 dense (Dense)               (128, None, 57)           29241     
                                                                 
Total params: 6344505 (24.20 MB)
Trainable params: 6344505 (24.20 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits.

  The final Dense layer has no activation, so the loss is told to treat
  its outputs as unnormalised logits.
  """
  return tf.keras.losses.sparse_categorical_crossentropy(
      y_true=labels, y_pred=logits, from_logits=True)

model.compile(loss=loss, optimizer='adam')




In [14]:
# Directory that receives the per-epoch weight checkpoints.
checkpoint_dir = './training_checkpoints'
# Keras substitutes the epoch number for "{epoch}" in the file name.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
# Save only the weights (not the full model) through this callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix,
)

In [15]:
# Train for 200 epochs; the checkpoint callback is invoked during training
# and the returned History object is kept in `history`.
history = model.fit(
    x=dataset,
    epochs=200,
    callbacks=[checkpoint_callback],
)

Epoch 1/200



Epoch 2/200

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xcd in position 327: invalid continuation byte

In [None]:
def generate_text(model, start_string, temp, gen_chars):
  """Generate text one character at a time, seeded with `start_string`.

  Args:
    model: stateful model that is called with a batch of exactly one
      sequence of character ids and returns per-step logits.
    start_string: non-empty seed text; every character must be a key of
      `char2idx`.
    temp: positive sampling temperature. The logits are divided by it
      before sampling, so lower values sharpen the distribution and
      higher values flatten it.
    gen_chars: number of characters to generate.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: if `temp` is not positive or `start_string` is empty.
    KeyError: if `start_string` contains a character outside the vocabulary.
  """
  # Validate up front: a zero temperature would produce inf/NaN logits and an
  # empty seed would only fail deep inside the model call.
  if temp <= 0:
    raise ValueError(f"temp must be positive, got {temp!r}")
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  unknown_chars = sorted(set(start_string) - set(char2idx))
  if unknown_chars:
    raise KeyError(
        f"start_string contains characters outside the vocabulary: {unknown_chars}")

  # Encode the seed and add the batch dimension expected by the model.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)
  text_generated = []
  # Start from a clean recurrent state so earlier calls do not leak in.
  model.reset_states()
  for _ in range(gen_chars):
    predictions = model(input_eval)
    # Drop the batch dimension, leaving one row of logits per time step.
    predictions = tf.squeeze(predictions, 0)
    predictions = predictions / temp
    # Sample for every row, then keep the sample for the last time step.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
    # Only the new character is fed back; the stateful layers carry the context.
    input_eval = tf.expand_dims([predicted_id], 0)
    text_generated.append(idx2char[predicted_id])
  return start_string + ''.join(text_generated)

In [None]:
# Inference network: the same layer stack as the training model but with a
# batch size of 1, so a single seed string can be fed to the stateful GRUs.
# (The model was previously constructed twice in a row; once is enough.)
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim,
                            batch_input_shape=[1, None]),
  tf.keras.layers.GRU(rnn_units,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
  tf.keras.layers.GRU(rnn_units_2,
                      return_sequences=True,
                      stateful=True,
                      recurrent_initializer='glorot_uniform'),
  tf.keras.layers.Dense(vocab_size)
])
model.summary()

# Restore the most recent training checkpoint. latest_checkpoint returns None
# when the directory holds no checkpoint, so fail with a clear message
# instead of passing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
  raise FileNotFoundError(
      f"No checkpoint found in {checkpoint_dir!r}; run the training cell first.")
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (1, None, 256)            14592     
                                                                 
 gru_50 (GRU)                (1, None, 1024)           3938304   
                                                                 
 gru_51 (GRU)                (1, None, 512)            2362368   
                                                                 
 dense_25 (Dense)            (1, None, 57)             29241     
                                                                 
Total params: 6,344,505
Trainable params: 6,344,505
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Sample 200 characters at temperature 1.0 from the restored model.
generate_text(
    model,
    start_string=u"Throgh death",
    temp=1.0,
    gen_chars=200,
)

'Throgh death, eternal shame.\r\rHis generous steeds he grinds his royal guest\rNo less the rage of all their hosts.\r\rWhat godlike coursers fed his eyes\rOn the cold towers of swiftness in the race.\rA wellfed ox the l'

In [None]:
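# Optional export, left disabled so a full re-run does not overwrite a saved model.
# Uncomment to write the model to the "NLP_gen_illiad_200" directory:
# model.save("NLP_gen_illiad_200")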





INFO:tensorflow:Assets written to: NLP_gen_illiad_200/assets


INFO:tensorflow:Assets written to: NLP_gen_illiad_200/assets
