## Natural Language Processing with RNNs and Attention

In [1]:
import tensorflow as tf

In [2]:
from pathlib import Path
import os
import matplotlib.pyplot as plt 


# Figures are saved under <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>. The root is
# relative ("."), so it resolves against the notebook's current working directory.
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "Ch16-NLP-With-Attention"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) for the figure.
        tight_layout: If true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    path = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### Generating Shakespearean Text Using a Character RNN

In [1]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
# get_file fetches the corpus and returns the path of the local copy
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows).
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [2]:
# Preview the first 80 characters of the corpus
print(shakespeare_text[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [3]:
# Collect every distinct character of the lower-cased corpus and join them,
# in sorted order, into a single string for display
unique_data = "".join(sorted(set(shakespeare_text.lower())))
print(unique_data)
print(f"Length of unique data: {len(unique_data)}")


 !$&',-.3:;?abcdefghijklmnopqrstuvwxyz
Length of unique data: 39


Using the Keras `TextVectorization` layer to build the character vocabulary

In [10]:
# Character-level tokenizer: lower-case the text, then split it into single characters
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")
# Build the vocabulary from the corpus (passed as a one-element batch)
text_vec_layer.adapt([shakespeare_text])
# Encode the whole corpus; [0] picks the single text out of the batch
encoded = text_vec_layer([shakespeare_text])[0]

The above process maps each character to an integer starting at 2. 0 is reserved for padding and 1 is for unknown characters.

In [13]:
# Shift the token IDs down by 2: the padding (0) and UNK (1) slots are not used here.
# The shifted IDs are rebuilt from the vectorizer output rather than modified in
# place (`encoded -= 2`), so re-running this cell does not shift them a second time.
encoded = text_vec_layer([shakespeare_text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2  # vocabulary size without padding and UNK
dataset_size = len(encoded)  # total number of encoded characters

In [32]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    """Turn a sequence of token IDs into batches of (input, target) windows.

    Every window covers ``length + 1`` consecutive IDs; consecutive windows
    start one position apart. Within a window, the input is everything except
    the last ID and the target is everything except the first ID, so each
    target ID is the one that follows the input ID at the same position.

    Args:
        sequence: Sequence of token IDs to slice into windows.
        length: Number of IDs in each input (and each target).
        shuffle: If true, shuffle the windows (buffer of 100,000) before batching.
        seed: Seed passed to the shuffle step.
        batch_size: Number of windows per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, targets)`` batches, with
        prefetching of one batch.
    """
    window_size = length + 1  # one extra step so the target can be offset by one

    windows = (
        tf.data.Dataset.from_tensor_slices(sequence)
        .window(window_size, shift=1, drop_remainder=True)
        # each window is itself a small dataset; batch it into a single tensor
        .flat_map(lambda window_ds: window_ds.batch(window_size))
    )
    if shuffle:
        windows = windows.shuffle(100_000, seed=seed)

    def split_inputs_targets(batch):
        return batch[:, :-1], batch[:, 1:]

    return windows.batch(batch_size).map(split_inputs_targets).prefetch(1)

The `to_dataset` function converts a sequence of character IDs into a dataset of input and target windows.
It takes the sequence of IDs as input and creates a dataset. Each input record has the length given by the parameter `length`. The corresponding target is the same window shifted one character ahead (same length), so each target character is the one that follows the input character at the same position.


Each of the 39 characters has been assigned an index, so any given sentence can be converted into its corresponding sequence of indexes.

In [30]:
# Encode a short example sentence; [0] picks the single text out of the batch
a = text_vec_layer(["How are you?"])[0]
print(a)

tf.Tensor([ 8  5 18  2  6 10  3  2 17  5 15 31], shape=(12,), dtype=int64)


The above tensor contains values - 8, 5, 18, 2, 6, 10, 3, 2, 17, 5, 15, 31

h -> 8 <br>
o -> 5 <br>
w -> 18 <br>
' ' -> 2 <br>
a -> 6 <br>
r -> 10 <br>
e -> 3 <br>
' ' -> 2 <br>
y -> 17 <br>
o -> 5 <br>
u -> 15 <br>
? -> 31 <br>

For example, if the above sequence is given as input with length 4, we get the following dataset. <br>

1st record <br>
I/P -> 8,5,18,2 <br>
O/P -> 5,18,2,6 <br><br>

2nd record <br>
I/P -> 5,18,2,6 <br>
O/P -> 18,2,6,10 <br><br>

3rd record <br>
I/P -> 18,2,6,10 <br>
O/P -> 2,6,10,3 <br><br>

And so on.. <br>
Let's verify the above records

In [38]:
# Materialise the example dataset as a list of (inputs, targets) batches
data = list(to_dataset(text_vec_layer(["How are you?"])[0], length=4))

In [47]:
# data[0] is the first batch as an (inputs, targets) pair: the i-th input is
# data[0][0][i] and its matching target is data[0][1][i]
for i in range(0,3):
    print(f"Record no : {i+1}")
    print(f"Input: {data[0][0][i]} -> output: {data[0][1][i]}")

Record no : 1
Input: [ 8  5 18  2] -> output: [ 5 18  2  6]
Record no : 2
Input: [ 5 18  2  6] -> output: [18  2  6 10]
Record no : 3
Input: [18  2  6 10] -> output: [ 2  6 10  3]


These records are the same as in the example above.

In [48]:
# Create the train, validation and test sets from contiguous slices of the corpus:
# the first 1,000,000 IDs for training, the next 60,000 for validation and the
# remainder for testing. Only the training set is shuffled.
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True,seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)
test_set = to_dataset(encoded[1_060_000:], length=length)

### Modelling

In [51]:
tf.random.set_seed(42)  # extra code – ensures reproducibility on CPU
model = tf.keras.Sequential([
    # Map each of the n_tokens character IDs to a 16-dimensional vector
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
    # return_sequences=True: emit an output at every time step, not only the last
    tf.keras.layers.GRU(128, return_sequences=True),
    # Per-step probability distribution over the n_tokens characters
    tf.keras.layers.Dense(n_tokens, activation="softmax")
])

In [None]:
# Targets are integer character IDs, hence the sparse categorical cross-entropy loss
model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam",
              metrics=["accuracy"])

# Save the model only when the validation accuracy improves.
# NOTE(review): recent Keras releases expect the checkpoint path to end in
# ".keras" or ".h5" — confirm against the installed version.
model_ckpt = tf.keras.callbacks.ModelCheckpoint(
    "../Models/my_shakespeare_model", monitor="val_accuracy", save_best_only=True)

# Train for a single epoch, checkpointing through the callback above
history = model.fit(train_set, validation_data=valid_set, epochs=1,
                    callbacks=[model_ckpt])

The model has been trained. We'll now create a pipeline which will do the following: <br>
1. Convert the characters to indexes
2. Shift the indexes down by 2, because the model was trained without the padding and UNK indexes
3. Predict the output

In [53]:
# End-to-end pipeline: raw text -> character IDs -> IDs shifted by 2 -> trained model
shakespeare_model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Lambda(lambda X: X - 2),  # no <PAD> or <UNK> tokens
    model
])

Sample prediction

We'll predict the next character from 'To be or not to b'.

In [54]:
# [0, -1] selects the first (only) text in the batch and its last time step,
# i.e. the predicted distribution for the character that follows the prompt
y_proba = shakespeare_model.predict(["To be or not to b"])[0, -1]
y_pred = tf.argmax(y_proba)  # choose the most probable character ID
# Add 2 back to undo the ID shift before looking the character up in the vocabulary
text_vec_layer.get_vocabulary()[y_pred + 2]



'e'

### Generating Fake Shakespearean Text

In [71]:
# Class probabilities: class 0 is 50%, class 1 is 40% and class 2 is 10%.
# tf.random.categorical takes log-probabilities, hence the log.
log_probas = tf.math.log([[0.5, 0.4, 0.1]])

tf.random.set_seed(42)

# Draw 8 class samples according to the probabilities above
tf.random.categorical(log_probas, num_samples=8)  # draw 8 samples

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]], dtype=int64)>

In [62]:
# NOTE(review): no later cell visible here reads this global (the functions below
# take their own `text` parameter) — confirm whether this cell is still needed.
text = "to be or not to be"

In [57]:
def next_char(text, temperature=1):
    """Sample the character that follows ``text`` from the model's prediction.

    Args:
        text: Prompt string fed to ``shakespeare_model``.
        temperature: Divisor applied to the log-probabilities before sampling.
            Values close to 0 favour the most likely characters; large values
            flatten the distribution towards uniform.

    Returns:
        The sampled character, looked up in the vectorizer's vocabulary.
    """
    # Keep only the last time step, with its axis retained (shape (1, n_classes)),
    # as tf.random.categorical expects a 2-D input.
    last_step_proba = shakespeare_model.predict([text])[0, -1:]
    scaled_log_proba = tf.math.log(last_step_proba) / temperature
    sampled_ids = tf.random.categorical(scaled_log_proba, num_samples=1)
    vocabulary = text_vec_layer.get_vocabulary()
    # +2 undoes the ID shift applied in front of the model
    return vocabulary[sampled_ids[0, 0] + 2]

In [58]:
def extend_text(text, n_chars=50, temperature=1):
    """Append ``n_chars`` sampled characters to ``text``, one at a time.

    Each new character is sampled by ``next_char`` from the text generated so
    far, using the given ``temperature``.

    Returns:
        The original text followed by the generated characters.
    """
    generated = text
    remaining = n_chars
    while remaining > 0:
        generated = generated + next_char(generated, temperature)
        remaining -= 1
    return generated

In [59]:
# Very low temperature: sampling almost always picks the most likely character
print(extend_text("To be or not to be", temperature=0.01))

To be or not to be so love,
and therefore shall be so love her the d


In [60]:
# Temperature 1: sample from the model's predicted distribution unchanged
print(extend_text("To be or not to be", temperature=1))

To be or not to be,
overde upon,
o, thretive in betake the headd you


In [61]:
# Very high temperature: the distribution is flattened, so characters are close to random
print(extend_text("To be or not to be", temperature=100))

To be or not to be ,yt'&o3g:auy-$
wh-nse?pws&ertj-vgerdjw!c-yjewznqx
