# Preparing the Data

## Creating the Training Dataset

In [1]:
import tensorflow as tf
shakespeare_url = 'https://homl.info/shakspeare'
filepath = tf.keras.utils.get_file('shakespeare.txt',shakespeare_url)
with open(filepath) as f:
  shakespeare_text = f.read()

In [2]:
tf.__version__

'2.4.1'

## Tokenizer (Encode every Character as an integer)


In [16]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([shakespeare_text])

In [17]:
# Generate character IDs from chars
tokenizer.texts_to_sequences(['First'])

[[20, 6, 9, 8, 3]]

In [18]:
# Generate chars from character IDs
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [19]:
# Checking the number of distict Characters 
max_id = len(tokenizer.word_index)
max_id

39

In [20]:
# Total number of characters
dataset_size = tokenizer.document_count
dataset_size

1

In [21]:
# Encoding the full text
import numpy as np
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 # Substract 1 get IDs from 0 t0 38 rather than from 1 to 39
encoded

array([19,  5,  8, ..., 20, 26, 10])

## Splitting the data

In [22]:
# Creating training dataset
train_size = dataset_size*90//100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [23]:
# Creating windows to convert the dataset sequence of characters into many smaller windows of text
# ( Truncated back propogation through time)

n_steps = 100 # Length of the patterns the model will learn
window_length = n_steps+1 # target = input shifted 1 character ahead
dataset = dataset.window(window_length,shift=1,drop_remainder=True) # Use shift 1 so the first window contains chars 0 to 100, the second 1 to 101. Also use drop_remainder to make sure all windows are exactly 101 chars long, else the last one can be 1, 99 or any other remainder. 

In [24]:
# Flattening our nested dataset
dataset = dataset.flat_map(lambda window: window.batch(window_length)) # This will create a flat datset containing window sized tensors but in this case since all the windows have the same length, we will get a single tensor for each of them.

In [25]:
# Shuffle the windows
batch_size = 32
datsset = dataset.shuffle(10000).batch(batch_size)

In [26]:
dataset = dataset.map(lambda windows: (windows[:-1],windows[1:])) # The book contains two dimensions but since it threw an error I kept it as 1D (windows[:,:-1],windows[:,1:])

## Encode Target

In [27]:
dataset = dataset.map(lambda X_batch,Y_batch: (tf.one_hot(X_batch,depth=max_id),Y_batch))

In [28]:
# Add prefetching
dataset = dataset.prefetch(1)

# Building and Training the Char-RNN Model

In [30]:
# Built the model
model_0 = tf.keras.Sequential([
    tf.keras.layers.GRU(128,return_sequences=True,input_shape=[None,max_id],
                     dropout=0.2,recurrent_dropout=0.2),
    tf.keras.layers.GRU(128,return_sequences=True,
                        dropout=0.2,recurrent_dropout=0.2),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,activation='softmax'))

])

# Compile the model
model_0.compile(loss = tf.keras.losses.SparseCategoricalCrossentropy(),
                optimizer = tf.keras.optimizers.Adam(),
                metrics = ['accuracy'])

# Fit the model
history_0 = model_0.fit(dataset,epochs=20)

Epoch 1/20


ValueError: ignored