Copyright 2019 Almintas Povilaitis

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.



<table class="tfo-notebook-buttons" align="left">
<td>
<a target="_blank"  href="https://colab.research.google.com/github/mlai-demo/TextGen-tf2/blob/master/TextGen_tf2pub.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
</td><td>
<a target="_blank"  href="https://github.com/mlai-demo/TextGen-tf2/blob/master/TextGen_tf2pub.ipynb"><img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td></table>

## Prep work

### Download relevant libraries and check the setup

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

!pip install tensorflow-gpu==2.0.0-alpha0
import tensorflow as tf

import numpy as np
import os
import datetime

In [None]:
# Show which TensorFlow version was actually imported in this runtime.
print("TensorFlow version: ", tf.__version__)

Check whether a GPU is available - it is always good to double-check. When using Colab, I sometimes forget to change the runtime type, so having this code will always catch it.

In [None]:
# Print every device TensorFlow reports for this runtime; check the output for a GPU entry.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

If you need to remove logs from previous runs, uncomment the line below and adjust the directory name:

In [None]:
#!rm -rf ./checkpoints_2019.04.21-20:48:58/ #if using Tensorboard or other logging

### Download the dataset

Check the current directory and upload the text file:

In [None]:
# Remember the current working directory; later cells build input file paths from `path`.
import os
path = os.getcwd()
print(path)

In [None]:
# if using Google Colab
from google.colab import files

# The loop below treats the return value as a mapping of file name -> file content.
uploaded = files.upload()

# Report the name and size of each uploaded file.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Click the Files tab - the uploaded file(s) will be there

In case you have multiple files that need to be merged:

In [None]:
# If using a directory with multiple files: concatenate every *.txt file
# found in `path` (in sorted filename order) into a single string `text`.
import glob
import codecs

books = sorted(glob.glob(path + "/*.txt"))
print("Found {} books".format(len(books)))

# Read each book through its own handle name so the `books` list is not
# overwritten, and join once at the end instead of growing a string in a loop.
book_texts = []
for filename in books:
    with codecs.open(filename, 'r', 'utf-8') as book_file:
        book_texts.append(book_file.read())
text = "".join(book_texts)

print("Text is {} characters long".format(len(text)))

In [None]:
# If using a single file: read the raw bytes and decode them as UTF-8.
# The context manager closes the file handle as soon as the read finishes.
with open(path + '/Iliad_v3.txt', 'rb') as text_file:
    text = text_file.read().decode(encoding='utf-8')
print("Text is {} characters long".format(len(text)))

In [None]:
# Rough word count: split on single spaces and keep every token that has
# non-whitespace content, plus tokens that are exactly one newline.
words = list(filter(lambda token: token == '\n' or token.strip() != '',
                    text.split(' ')))
print("Text is {} words long".format(len(words)))

Make sure the text sample is what you expected:

In [None]:
# Eyeball the first 100 characters of the corpus.
print(text[:100])

## Prepare the text

In [None]:
# Map unique characters to indices.
# `vocab` is the sorted list of distinct characters, `char2int` maps a
# character to its position in `vocab`, and `int2char` is the inverse lookup
# as a NumPy array so it can be indexed with integer arrays.
vocab = sorted(set(text))
print ('There are {} unique characters'.format(len(vocab)))
char2int = {c:i for i, c in enumerate(vocab)}
int2char = np.array(vocab)
print('Vector:\n')
# Iterate the mapping directly instead of zipping it with a throwaway range.
for char, index in char2int.items():
    print(' {:4s}: {:3d},'.format(repr(char), index))

In [None]:
# Encode the whole corpus as an int32 array of vocabulary indices, one entry per character.
text_as_int = np.array([char2int[ch] for ch in text], dtype=np.int32)
# Show the first 100 characters next to their integer encoding.
print ('{}\n mapped to integers:\n {}'.format(repr(text[:100]), text_as_int[:100]))

In [None]:
# Hold-out split by position: the first 704000 encoded characters are used
# for training and everything after them for validation.
# NOTE(review): the split index is hard-coded; for a corpus of 704000
# characters or fewer, val_text is empty.
tr_text = text_as_int[:704000] #text separated for training, divisible by the batch size (64)
val_text = text_as_int[704000:] #text separated for validation

Confirm the shapes are what we expect:

In [None]:
# Shapes of the full encoded text, the training part and the validation part.
print(text_as_int.shape, tr_text.shape, val_text.shape)

## Build the model

In [None]:
# Populate the library of tunables - I like keeping them centralized in case I need to change things around:
batch_size = 64  # sequences per training/validation batch
buffer_size = 10000  # buffer size passed to Dataset.shuffle
embedding_dim = 256  # output size of the Embedding layer
epochs = 50  # upper bound passed to model.fit; early stopping may end sooner
seq_length = 200  # characters per input sequence (chunks are seq_length+1 long)
examples_per_epoch = len(text)//seq_length  # not referenced elsewhere in the visible notebook
#lr = 0.001 #will use default for Adam optimizer
rnn_units = 1024  # units in each LSTM layer
vocab_size = len(vocab)  # number of distinct characters

In [None]:
# Stream the encoded characters one element at a time.
tr_char_dataset = tf.data.Dataset.from_tensor_slices(tr_text)
val_char_dataset = tf.data.Dataset.from_tensor_slices(val_text)
print(tr_char_dataset, val_char_dataset)

# Group the characters into chunks of seq_length + 1 (the extra character
# supplies the shifted target); an incomplete trailing chunk is dropped.
chunk_length = seq_length + 1
tr_sequences = tr_char_dataset.batch(chunk_length, drop_remainder=True)
val_sequences = val_char_dataset.batch(chunk_length, drop_remainder=True)
print(tr_sequences, val_sequences)

# Preview the first training chunk, then the first validation chunk, both as
# decoded text and as the raw tensor.
for item in tr_sequences.take(1).concatenate(val_sequences.take(1)):
    decoded = ''.join(int2char[item.numpy()])
    print(repr(decoded))
    print(item)

In [None]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Pair each chunk with its shifted target, shuffle the pairs, then group them
# into fixed-size batches (an incomplete final batch is dropped).
tr_dataset = (
    tr_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)
val_dataset = (
    val_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)
print(tr_dataset, val_dataset)

In [None]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character-level model.

    Layer order: Embedding -> Dropout -> LSTM -> Dropout -> LSTM -> Dropout
    -> Dense(vocab_size). Both LSTM layers are stateful and return the full
    sequence; every Dropout uses a rate of 0.2.

    Args:
        vocab_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in each LSTM layer.
        batch_size: Batch size fixed into the input shape.

    Returns:
        The assembled tf.keras.Sequential model (not compiled).
    """
    layers = tf.keras.layers
    dropout_rate = 0.2

    net = tf.keras.Sequential()
    net.add(layers.Embedding(vocab_size, embedding_dim,
                             batch_input_shape=[batch_size, None]))
    net.add(layers.Dropout(dropout_rate))

    # Two identical recurrent stages, each followed by dropout.
    for _ in range(2):
        net.add(layers.LSTM(rnn_units,
                            return_sequences=True,
                            stateful=True,
                            recurrent_initializer='glorot_uniform'))
        net.add(layers.Dropout(dropout_rate))

    # Final projection to one output value per vocabulary entry.
    net.add(layers.Dense(vocab_size))
    return net

In [None]:
# Training model: built with the training batch size from the tunables cell.
model = build_model(
    vocab_size = len(vocab),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size)

## Run the model

Check the output shape

In [None]:
# Run a single training batch through the untrained model to verify the
# output shape. The loop variables and example_batch_predictions are reused
# by the following cells.
for input_example_batch, target_example_batch in tr_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "respectively: batch_size, sequence_length, vocab_size")

In [None]:
# Print the layer-by-layer summary of the training model.
model.summary()

Untrained model output:

In [None]:
# Draw one sample per position from the model output for the first sequence
# in the example batch, then drop the trailing num_samples axis.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
# Decode both the input sequence and the sampled indices back to characters.
print("Input: \n", repr("".join(int2char[input_example_batch[0]])))
print()
print("Predictions: \n", repr("".join(int2char[sampled_indices ])))

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    `from_logits=True` is passed because the model's last layer is a Dense
    layer without an activation.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)
def accuracy(labels, logits):
    """Sparse categorical accuracy of the model outputs against integer labels."""
    match_fn = tf.keras.metrics.sparse_categorical_accuracy
    return match_fn(y_true=labels, y_pred=logits)

# Evaluate the loss and accuracy helpers on the example batch produced by
# the shape-check cell, then report their means over all positions.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
example_batch_acc  = accuracy(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Loss:      ", example_batch_loss.numpy().mean())
print("Accuracy:      ", example_batch_acc.numpy().mean())

In [None]:
# Adam with its default settings; only the custom loss is registered, so the
# accuracy helper is not tracked during training.
optimizer = tf.keras.optimizers.Adam() 
model.compile(optimizer=optimizer, loss=loss) 

In [None]:
# Early-stopping callback that monitors validation loss with the given patience (in epochs).
patience = 10
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)

In [None]:
# Directory where the checkpoints will be saved; the timestamp suffix gives
# each run its own directory.
# NOTE(review): the timestamp contains ':' characters, which are not valid in
# Windows paths - confirm the target platform if running outside Colab/Linux.
checkpoint_dir = './checkpoints'+ datetime.datetime.now().strftime("_%Y.%m.%d-%H:%M:%S")
# Name of the checkpoint files ({epoch} is a placeholder filled in by the callback)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only (not the full model) under the prefix above.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [None]:
# Train with checkpointing and early stopping, validating on val_dataset.
history = model.fit(
    tr_dataset,
    epochs=epochs,
    callbacks=[checkpoint_callback, early_stop],
    validation_data=val_dataset)

# history.history['loss'] has one entry per epoch actually run, so a shorter
# list than `epochs` means early stopping ended the run. Report that only
# when it really happened instead of printing it unconditionally.
epochs_run = len(history.history['loss'])
if epochs_run < epochs:
    print ("Training stopped as there was no improvement after {} epochs".format(patience))
else:
    print("Training ran for all {} epochs".format(epochs))

In [None]:
import matplotlib.pyplot as plt

# Plot training loss per epoch and, when validation data was used, the
# validation loss alongside it. The legend is built from the curves actually
# drawn, so nothing has to be commented in or out by hand.
plt.figure(figsize=(12,9))
plt.plot(history.history['loss'], 'g')
legend_labels = ['Train']
if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'], 'rx')
    legend_labels.append('Validation')
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(legend_labels, loc='upper right')
plt.show()

## Generate text

In [None]:
# Show the most recent checkpoint found in this run's checkpoint directory.
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# build_model fixes the batch size into the input shape, so a fresh model
# with batch_size=1 is built for generation and the trained weights are
# loaded into it. This rebinds `model`, replacing the training model.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
#model.load_weights('./checkpoints_2019.04.29-00:31:15/ckpt_17')  #if the latest checkpoint is not your preferred
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))  #if the latest checkpoint is what you want
model.build(tf.TensorShape([1, None]))
model.summary()

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample text from the model one character at a time, starting from a seed.

    The seed is fed through the model once; after that only the most recently
    sampled character is fed back in, so the model's recurrent state (reset
    at the start of each call) carries the context.

    Args:
        model: Model that accepts a batch of one sequence and returns one
            row of logits per input position.
        start_string: Non-empty seed text. Every character must be a key of
            the module-level `char2int` mapping.
        num_generate: Number of characters to sample after the seed.
        temperature: Positive value the logits are divided by before
            sampling; 1.0 leaves them unchanged.

    Returns:
        `start_string` followed by the generated characters.

    Raises:
        ValueError: If `start_string` is empty or `temperature` is not
            positive.
        KeyError: If `start_string` contains a character missing from
            `char2int`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    print('Generating with seed: "' + start_string + '"')

    # Encode the seed and add a leading batch axis of size 1.
    input_eval = tf.expand_dims([char2int[s] for s in start_string], 0)

    text_generated = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        # Drop the batch axis: one row of logits per input position.
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature
        # Sample for every position but keep only the sample for the last one.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        # The sampled character becomes the whole input for the next step.
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(int2char[predicted_id])

    return start_string + ''.join(text_generated)

In [None]:
# Generate a sample from a fixed seed and print it.
print(generate_text(model, start_string="joy of gods"))

In [None]:
# Generate before opening the file so a failure during generation does not
# leave an empty file behind, and write as UTF-8 (the corpus was decoded as
# UTF-8) instead of relying on the platform's default encoding.
sampleTF2 = generate_text(model, start_string="joy of gods")
with open('sampleTF2.txt', 'w', encoding='utf-8') as f:
    f.write(sampleTF2)

Free memory resources if needed:

In [None]:
import signal

# WARNING: sends SIGKILL to the current Python process itself, so nothing
# after this line runs and all in-memory state (model, variables) is lost.
# NOTE(review): signal.SIGKILL is not defined on Windows.
os.kill(os.getpid(), signal.SIGKILL)