# Natural Language Processing
Start with text from Shakespeare.

In [1]:
import tensorflow as tf
tf.keras.backend.set_floatx('float32')
from tensorflow import keras
import numpy as np

# Shakespeare
surl="https://homl.info/shakespeare"
filepath=keras.utils.get_file("shakespeare.txt",surl)
with open(filepath) as fp:
    stext = fp.read()

## The keras tokenizer

In [2]:
print("Text len = %d"%len(stext))
print(stext[:50])

Text len = 1115394
First Citizen:
Before we proceed any further, hear


In [3]:
print("Given array of string, tokenizer chops to words")
tokenizer = keras.preprocessing.text.Tokenizer(char_level=False,lower=True)
text_as_array=[stext]
tokenizer.fit_on_texts(text_as_array)   # this is fast and word-leel
encoding = tokenizer.word_index.items()
print("Num encoded words = "+str(len(encoding)))
print("Encoding = "+str(encoding)[:50])

Given array of string, tokenizer chops to words
Num encoded words = 12632
Encoding = dict_items([('the', 1), ('and', 2), ('to', 3), ('i


In [4]:
print("Given a string, tokenizer chops to chars regardless of parameter...")
tokenizer = keras.preprocessing.text.Tokenizer(char_level=False,lower=True)
tokenizer.fit_on_texts(stext)  
encoding = tokenizer.word_index.items()
print("Num encoded words = "+str(len(encoding)))
print("Encoding = "+str(encoding)[:50])
# This is slow.
# At word level, tokenizer filters non-word characters.

Given a string, tokenizer chops to chars regardless of parameter...
Num encoded words = 28
Encoding = dict_items([('e', 1), ('t', 2), ('o', 3), ('a', 4)


In [5]:
print("Given array of string and char_level param tokenizer chops to chars")
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True,lower=True)
text_as_array=[stext] 
tokenizer.fit_on_texts(text_as_array) 
encoding = tokenizer.word_index.items()
print("Num encoded words = "+str(len(encoding)))
print("Encoding = "+str(encoding)[:50])
# This is fast.
# At char level, tokenzer leaves non-word characters.

Given array of string and char_level param tokenizer chops to chars
Num encoded words = 39
Encoding = dict_items([(' ', 1), ('e', 2), ('t', 3), ('o', 4)


In [6]:
# Here are different ways of retrieving the encoded sequence.
array_of_seq = tokenizer.texts_to_sequences(text_as_array)
print("%d total sequences"%len(array_of_seq))
print("%d array[0] len"%len(array_of_seq[0]))
[sequence] = tokenizer.texts_to_sequences(text_as_array)
print("%d seq len"%len(sequence))
print(stext[:10])
print(sequence[:10])

1 total sequences
1115394 array[0] len
1115394 seq len
First Citi
[20, 6, 9, 8, 3, 1, 19, 6, 3, 6]


In [7]:
# If we want encoding to start at 0,
# this doesn't work on python list: encoded=sequence-1
# but it does work in numpy.
encoded=np.array(sequence)-1
print(encoded[:10])

[19  5  8  7  2  0 18  5  2  5]


In [8]:
# Save this for later
max_code = len(tokenizer.word_index)
max_code

39

## Train, validate, test sets
Convert one long list of (encoded) characters
to many training instances using tensor window function.

In [9]:
# Use TensorFlow Dataset.
# In addition to from_tensor_slices(), it has methods
# enumerate, filter, shuffle, skip, zip, interleave, apply, ...

data_size = len(encoded)
train_size = int(0.9 * data_size)
train_np = encoded[:train_size]

# Stackoverflow clarifies what this method does.
# from_tensors() combines inputs, dataset contains a single element.
# from_tensor_slices() dataset contains separate element for each input row.
# The name really should be to_tensor_slices()
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
print(dataset)
# Known issue: this special kind of dataset is missing many methods
# that other tensors have: cardinality(), to_numpy(), ...
# Note: shape () means list of scalar.

<TensorSliceDataset shapes: (), types: tf.int64>


In [10]:
# Create dataset of datasets, each representing a window.

n_steps=100  # max pattern the RNN can learn
window_length = n_steps+1   # include one predicted letter
# Use shift=1 because default is non-overlapping windows.
# Use drop_remainder to avoid small last window.
windows = dataset.window(window_length, shift=1, drop_remainder=True)
windows  # nested dataset of datasets

<WindowDataset shapes: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([])), types: DatasetSpec(TensorSpec(shape=(), dtype=tf.int64, name=None), TensorShape([]))>

In [11]:
# Create dataset of tensors, each representing a window.

# Call dataset.batch() on each window to create a tensor.
tensors = windows.flat_map(lambda window: window.batch(window_length))
tensors    # dataset of tensors
# Note: shape (None,) means unknown dimensions.

<FlatMapDataset shapes: (None,), types: tf.int64>

In [12]:
# Consecutive overlapping windows are highly correlated!
# This will bias gradient descent.
# Solution is shuffle.

batch_size=32
buffer_size=10000
shuffled = tensors.shuffle(buffer_size).batch(batch_size)
shuffled  # Special Dataset called Batch

<BatchDataset shapes: (None, None), types: tf.int64>

In [13]:
# Use colons so e.g. X = 0-9 while y = 1-10.
mapped = shuffled.map(lambda win: (win[:,:-1], win[:,1:]))
mapped   # Special Dataset called Map

<MapDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>

In [14]:
# Next use encoding or embedding.
encoded = mapped.map(lambda X,y: (tf.one_hot(X,depth=max_code),y))
encoded

<MapDataset shapes: ((None, None, 39), (None, None)), types: (tf.float32, tf.int64)>

## Compile and train a model
This takes hours.

In [15]:
# TimeDistributed: apply a layer to each temporal slice.
# GRU: the simpler LSTM using two gates.
rnn = keras.models.Sequential([
    keras.layers.GRU(128,return_sequences=True,
                     input_shape=[None,max_code],
                    dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128,return_sequences=True,
                     input_shape=[None,max_code],
                    dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_code,activation="softmax"))
])
rnn.compile(loss="sparse_categorical_crossentropy",optimizer="adam")

In [None]:
# Book says this takes many hours.
# On my laptop, this needs 3 hours per epoch.
spe = train_size//batch_size
history=rnn.fit(encoded,epochs=5,steps_per_epoch=spe)

Epoch 1/5
    7/31370 [..............................] - ETA: 2:47:45 - loss: 2.2382