Given a character or a sequence of characters, we want to predict the next character at each time step.
The model is trained to generate text in a style similar to the works of Shakespeare. The Tiny Shakespeare dataset is used for training.

In [80]:
import numpy as np
import io
import re
import tensorflow as tf
import time
import os

# Get data

In [62]:
def read_text(URL):
    """Read a UTF-8 encoded text file and return its whole content.

    :param URL: path handed to io.open (a file path, despite the parameter name)
    :return: the complete file content as a single string
    """
    with io.open(URL, mode='r', encoding='utf8') as handle:
        return handle.read()

In [63]:
# Sanity check: show the first 100 characters of the training text
preview = read_text("shakespeare_train.txt")[:100]
print(preview)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# Preprocessing

In [64]:
#character to index
def character_to_index(sub_txt, dict_int):
    """Encode a text as an int32 array of indices.

    :param sub_txt: iterable of characters to encode
    :param dict_int: mapping from character to integer index; a character
                     missing from it raises KeyError
    :return: numpy int32 array with one index per character of sub_txt
    """
    indices = []
    for symbol in sub_txt:
        indices.append(dict_int[symbol])
    return np.array(indices, dtype=np.int32)

In [65]:
#index to character
def index_to_char(index_list, dict_char):
    """Decode a sequence of indices back into text.

    :param index_list: iterable of indices to decode
    :param dict_char: mapping from index to character
    :return: repr() of the decoded string, so quotes are included and
             characters such as newlines appear escaped
    """
    decoded = ''.join(dict_char[idx] for idx in index_list)
    return repr(decoded)

In [66]:
# Demo lookup tables built from the first 100 characters of the training text.
# The file is read once and the sample reused for both tables.
# Note: the sample is enumerated position by position, so dict_int maps each
# character to the position of its LAST occurrence in the sample (not to a
# compact vocabulary id) and dict_char maps every position to its character.
# Both names are rebuilt from the full vocabulary in a later cell.
sample_text = read_text("shakespeare_train.txt")[:100]
dict_int = {u: i for i, u in enumerate(sample_text)}
dict_char = dict(enumerate(sample_text))

In [67]:
# Show at most the first 65 entries of the character -> index table
print("Character to Index: \n")
for symbol, index in list(dict_int.items())[:65]:
    print('  {:4s}: {:3d}'.format(repr(symbol), index))

Character to Index: 

  'F' :  82
  'i' :  91
  'r' :  84
  's' :  85
  't' :  90
  ' ' :  87
  'C' :  88
  'z' :  92
  'e' :  93
  'n' :  94
  ':' :  95
  '\n':  96
  'B' :  15
  'f' :  37
  'o' :  98
  'w' :  22
  'p' :  75
  'c' :  28
  'd' :  31
  'a' :  77
  'y' :  35
  'u' :  99
  'h' :  46
  ',' :  72
  'm' :  51
  'k' :  78
  '.' :  79
  'A' :  62
  'l' :  64
  'S' :  67
  'Y' :  97


# Create training examples / targets

Target value: for each sequence of characters, we return that sequence, shifted one position to the right, along with the new character that is predicted to follow the sequence.

To create training examples of (input, target) pairs, we take a given sequence: the input is the sequence with its last character removed, and the target is the sequence with its first character removed. Example: for the sequence `abc d ef`, the input is `abc d e` and the target is `bc d ef`.

In [68]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    :param chunk: sliceable sequence (e.g. a tensor, list or string)
    :return: tuple of (chunk without its last element, chunk without its first element)
    """
    return chunk[:-1], chunk[1:]

In [69]:
# Create training examples / targets
def handle_data(data, seq_len):
    """
    Build (input, target) training pairs from raw encoded data.

    :param data: raw data with int type, passed to tf.data.Dataset.from_tensor_slices
    :param seq_len: length of each input and each target sequence; the data is cut
                    into chunks of seq_len + 1 elements and a trailing partial
                    chunk is dropped
    :return: tf.data.Dataset of (input, target) pairs produced by split_input_target
    """
    # One extra element per chunk so that input and target both hold seq_len items.
    chunk_len = seq_len + 1
    return (
        tf.data.Dataset.from_tensor_slices(data)
        .batch(chunk_len, drop_remainder=True)
        .map(split_input_target)
    )

In [70]:
# Load the raw text of the training and validation splits
train_path = 'shakespeare_train.txt'
valid_path = 'shakespeare_valid.txt'
training_set = read_text(train_path)
val_set = read_text(valid_path)

In [71]:
# Vocabulary: every distinct character that appears in either split
vocab_train, vocab_val = set(training_set), set(val_set)
vocab = vocab_train | vocab_val

In [72]:
# Build the character <-> index lookup tables from the full vocabulary.
# `vocab` is a set, and the iteration order of a set of strings can differ
# between interpreter sessions (string hash randomization). Enumerating a
# sorted copy makes the mapping identical on every run, so indices stay
# consistent with weights saved in earlier sessions.
vocab_sorted = sorted(vocab)
dict_int = {u: i for i, u in enumerate(vocab_sorted)}
dict_char = dict(enumerate(vocab_sorted))

# Encode both splits with the shared tables
train_x = character_to_index(training_set, dict_int)
val_x = character_to_index(val_set, dict_int)

seq_len = 50  # max number of characters that can be fed as a single input

# Each example needs seq_len + 1 characters (input plus the shifted target),
# so these are the numbers of complete examples available in each split.
train_seq = len(train_x) // (seq_len + 1)
val_seq = len(val_x) // (seq_len + 1)

data_train = handle_data(train_x, seq_len)  # include input and target
data_val = handle_data(val_x, seq_len)  # include input and target

# Build model

In [73]:
BATCH_SIZE = 64
# Number of complete batches in each split
iterator_train = train_seq // BATCH_SIZE
iterator_val = val_seq // BATCH_SIZE

# Buffer used to shuffle the dataset; it is at least as large as the number
# of training sequences (train_seq).
BUFFER_SIZE = train_seq + val_seq

# Rebuild both pipelines from the encoded arrays instead of re-wrapping
# data_train / data_val in place: the original `data_train = data_train...batch()`
# form batched already-batched data whenever this cell was executed twice.
data_train = handle_data(train_x, seq_len).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
data_val = handle_data(val_x, seq_len).batch(BATCH_SIZE, drop_remainder=True)

In [74]:
def built_model(cellType, vocab_size, embedding_dim, rnn_units, BATCH_SIZE):
    """Assemble an Embedding -> recurrent layer -> Dense model.

    :param cellType: "LSTM" or "GRU" selects that layer type; any other value
                     selects SimpleRNN
    :param vocab_size: input dimension of the embedding and number of units of
                       the final Dense layer
    :param embedding_dim: output dimension of the embedding
    :param rnn_units: number of units of the recurrent layer
    :param BATCH_SIZE: fixed batch dimension of the model input
    :return: the (uncompiled) tf.keras.Sequential model
    """
    layers = tf.keras.layers
    rnn = layers.LSTM if cellType == "LSTM" else layers.GRU if cellType == "GRU" else layers.SimpleRNN

    # Layers are created in the same order in which they are stacked.
    embedding = layers.Embedding(vocab_size, embedding_dim, batch_input_shape=[BATCH_SIZE, None])
    recurrent = rnn(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform')
    projection = layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

In [75]:
# Model hyperparameters
embedding_dim = 256      # size of each character embedding vector
rnn_units = 1024         # number of units in the recurrent layer
vocab_size = len(vocab)  # number of distinct characters across both splits

In [76]:
cellType = "LSTM"
model = built_model(cellType, vocab_size, embedding_dim, rnn_units, BATCH_SIZE)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 256)           17152     
_________________________________________________________________
lstm_4 (LSTM)                (64, None, 1024)          5246976   
_________________________________________________________________
dense_4 (Dense)              (64, None, 67)            68675     
Total params: 5,332,803
Trainable params: 5,332,803
Non-trainable params: 0
_________________________________________________________________
None


# Training model

In [77]:
def loss(labels, logits):
    """Sparse categorical cross-entropy computed on raw logits.

    :param labels: integer class labels
    :param logits: unnormalized model outputs (from_logits=True is used)
    :return: the result of tf.keras.losses.sparse_categorical_crossentropy
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [78]:
# Train with the Adam optimizer and the logits-based loss defined above
model.compile(loss=loss, optimizer='adam')

In [81]:
# Checkpoint callback: weights only, one file name per epoch via the {epoch} placeholder
lstm_dir_checkpoints = 'training_checkpoints_LSTM'
checkpoint_prefix = os.path.join(lstm_dir_checkpoints, "chkpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [82]:
# Fit on the batched training pipeline, saving weights through the checkpoint callback
EPOCHS = 100
history = model.fit(
    data_train,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

KeyboardInterrupt: 

# Predicting