<a href="https://colab.research.google.com/github/TamTran72111/learn-ml-dl/blob/master/Augelien_Geron_book/16_Natural_Language_Processing_with_RNNs_and_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%matplotlib inline
from tensorflow import keras
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import os

# Generating Shakespearean Text Using a Character RNN

### Creating the Training DataSet

In [2]:
shakespeare_url = 'https://homl.info/shakespeare'
filepath = keras.utils.get_file('shakespeare.txt', shakespeare_url)
with open(filepath) as f:
  shakespeare_text = f.read()

In [3]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [4]:
tokenizer.texts_to_sequences(['First'])

[[20, 6, 9, 8, 3]]

In [5]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [6]:
max_id = len(tokenizer.word_index)    # number of distinct characters
max_id

39

In [7]:
dataset_size = tokenizer.document_count # total number of characters

In [8]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

In [9]:
# Split training datasets
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [10]:
# Chopping the sequence dataset into multiple windows
n_steps = 100
window_length = n_steps + 1     # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [11]:
for item in dataset.take(5):
  print(item)

<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>
<_VariantDataset shapes: (), types: tf.int64>


In [12]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [13]:
for item in dataset.take(5):
  print(item)

tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)
tf.Tensor(
[ 5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1  0
 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1  4
  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24 17
  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10
 15  3 13  0  4], shape=(101,), dtype=int64)
tf.Tensor(
[ 8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1  0 22
  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1  4  8
  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24 17  0
  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 15
 

In [14]:
batch_size = 32
dataset = dataset.shuffle(10_000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [15]:
# Encode each character using a one-hot vector,
# since there are fairly few distinct characters
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)

In [16]:
dataset = dataset.prefetch(1)

In [17]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


### Building and Training the Char-RNN model

In [25]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.GRU(128, return_sequences=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id,
                                                    activation='softmax'))
])



In [26]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
history = model.fit(dataset, steps_per_epoch=train_size // batch_size, epochs=1)

 3570/31370 [==>...........................] - ETA: 4:35:38 - loss: 1.7377

KeyboardInterrupt: ignored

### Using the Char-RNN Model

In [27]:
def preprocess(texts):
  X = np.array(tokenizer.texts_to_sequences(texts)) - 1
  return tf.one_hot(X, max_id)

In [28]:
X_new = preprocess(['How are yo'])
Y_pred = model.predict_classes(X_new)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentences, last char

'u'

In [29]:
def next_char(text, temperature=1):
  X_new = preprocess([text])
  y_proba = model.predict(X_new)[0, -1:, :]
  rescaled_logits = tf.math.log(y_proba) / temperature
  char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1
  return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [30]:
def complete_text(text, n_chars=50, temperature=1):
  for _ in range(n_chars):
    text += next_char(text, temperature)
  return text

In [31]:
complete_text('t', temperature=0.2)



'the senators, and so man i have so marcius and the '

In [32]:
complete_text('w')

'ward the people but him. come.\n\ncominius:\nwhenk thy'

In [33]:
complete_text('w', temperature=2)

"w?:\nyou' bring\nmaker rows! scy-mr.ayid yood with ro"

### Stateful RNN

In [34]:
def create_dataset(train_data):
  dataset = tf.data.Dataset.from_tensor_slices(train_data)
  dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
  dataset = dataset.flat_map(lambda window: window.batch(window_length))
  dataset = dataset.batch(1)
  dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
  dataset = dataset.map(
      lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
  )
  return dataset.prefetch(1)

In [35]:
dataset = create_dataset(encoded[:train_size])

In [36]:
# Chopping the text into 32 texts of equal length
batch_size = 32
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
  dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
  dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)
  dataset = dataset.flat_map(lambda window: window.batch(window_length))
  datasets.append(dataset)

dataset = tf.data.Dataset.zip(tuple(datasets)).map(
    lambda *windows: tf.stack(windows)
)
dataset = dataset.repeat().map(
    lambda windows: (windows[:, :-1], windows[:, 1:])
)
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)
dataset = dataset.prefetch(1)

In [37]:
model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2,
                     batch_input_shape=[batch_size, None, max_id]),
    keras.layers.GRU(128, return_sequences=True, stateful=True,
                     dropout=0.2, recurrent_dropout=0.2),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax'))
])



In [38]:
# Reset state before going back to the beginning of the text
class ResetStatesCallback(keras.callbacks.Callback):
  def on_epoch_begin(self, epoch, logs):
    self.model.reset_states()

In [39]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
steps_per_epoch = train_size // batch_size //n_steps
history = model.fit(dataset, steps_per_epoch=steps_per_epoch,
                    epochs=50, callbacks=[ResetStatesCallback()])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
# Create a stateless model from the trained model to use with different batch sizes
stateless_model = keras.models.Sequential([
    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.TimeDistributed(keras.layers.Dense(max_id, activation='softmax'))
])

In [41]:
# To set the wieghts, the model needs to be built first
stateless_model.build(tf.TensorShape([None, None, max_id]))

In [42]:
stateless_model.set_weights(model.get_weights())

model = stateless_model

In [43]:
complete_text('t')



't:\nye from paris temped him, if you welm heaven: gu'

# Sentiment Analysis

In [44]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [45]:
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(('<pad>', '<sos>', '<unk>')):
  id_to_word[id_] = token  

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [46]:
' '.join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [47]:
import tensorflow_datasets as tfds

datasets, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)
train_size = info.splits['train'].num_examples

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTCMDA8/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTCMDA8/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteTCMDA8/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [48]:
def preprocess(X_batch, y_batch):
  X_batch = tf.strings.substr(X_batch, 0, 300)
  X_batch = tf.strings.regex_replace(X_batch, b'<br\\s*/?>', b' ')
  X_batch = tf.strings.regex_replace(X_batch, b'[^a-zA-Z]', b' ')
  X_batch = tf.strings.split(X_batch)
  return X_batch.to_tensor(default_value=b'pad'), y_batch

In [49]:
from collections import Counter
vocabulary = Counter()
for X_batch, y_batch in datasets['train'].batch(32).map(preprocess):
  for review in X_batch:
    vocabulary.update(list(review.numpy()))

In [50]:
vocabulary.most_common()[:3]

[(b'pad', 224503), (b'the', 61156), (b'a', 38569)]

In [51]:
vocab_size = 10_000
truncated_vocabulary = [
  word for word, count in vocabulary.most_common()[:vocab_size]
]

In [55]:
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [56]:
table.lookup(tf.constant([b'This movie was faaaaaaaantastic'.split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   24,    12,    13, 10552]])>

In [57]:
def encode_words(X_batch, y_batch):
  return table.lookup(X_batch), y_batch

train_set = datasets['train'].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [58]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation='sigmoid')
])

In [59]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)

Epoch 1/5
Epoch 2/5




  1/781 [..............................] - 0s 48ms/step - loss: 0.5114 - accuracy: 0.6250


In [61]:
# Using manual masking
embed_size = 128
inputs = keras.layers.Input(shape=[None])
mask = keras.layers.Lambda(lambda inputs: keras.backend.not_equal(inputs, 0))(inputs)
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, activation='sigmoid')(z)

model = keras.models.Model(inputs=[inputs], outputs=[outputs])
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)

Epoch 1/5
Epoch 2/5




  1/781 [..............................] - 0s 10ms/step - loss: 0.5409 - accuracy: 0.7500


### Reusing Pretrained Embeddings

In [62]:
TFHUB_CACHE_DIR = os.path.join(os.curdir, "my_tfhub_cache")
os.environ["TFHUB_CACHE_DIR"] = TFHUB_CACHE_DIR

In [64]:
import tensorflow_hub as hub

model = keras.Sequential([
  hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1',
                 dtype=tf.string, input_shape=[], output_shape=[50]),
  keras.layers.Dense(128, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])

In [65]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [66]:
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
batch_size = 32
train_set = datasets['train'].repeat().batch(batch_size).prefetch(1)

history = model.fit(train_set, steps_per_epoch=train_size // batch_size, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# An Encoder–Decoder Network for Neural Machine Translation

In [70]:
import tensorflow_addons as tfa

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)

encoder = keras.layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(512)
output_layer = keras.layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                 sampler,
                                                 output_layer=output_layer)

final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state,
    sequence_length=sequence_lengths
)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)

model = keras.Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths],
                    outputs=[Y_proba])

In [71]:
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

### Bidirectional RNNs

In [72]:
model = keras.models.Sequential([
    keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),
    keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))
])

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_12 (GRU)                 (None, None, 10)          660       
_________________________________________________________________
bidirectional (Bidirectional (None, None, 20)          1320      
Total params: 1,980
Trainable params: 1,980
Non-trainable params: 0
_________________________________________________________________


### Beam Search

In [75]:
# beam_width = 10
# sos_id = 42
# decoder = tfa.seq2seq.beam_search_decoder.BeamSearchDecoder(
#     cell=decoder_cell, beam_width=beam_width, output_layer=output_layer
# )
# decoder_initial_state = tfa.seq2seq.beam_search_decoder.tile_batch(
#     encoder_state, multiplier=beam_width
# )

# start_tokens = tf.fill(dims=batch_size, value=sos_id)

# outputs, _, _ =decoder(
#     embedding_encoder, start_tokens=start_tokens, end_token=0,
#     initial_state=decoder_initial_state
# )

# Attention Mechanisms

In [79]:

# # Luong attention (multiplicative attention)
# attention_mechanism = tfa.seq2seq.attention_wrapper.LuongAttention(
#     units, encoder_state, memory_sequence_length=encoder_sequence_length
# )

# attention_decoder_cell = tfa.seq2seq.attention_wrapper.AttentionWrapper(
#     decoder_cell, attention_mechanism, attention_layer_size=n_units
# )

### Positional Encodings

\begin{equation*}
P_{p,2i}=sin \left(\frac{p}{10000^{\frac{2i}{d}}} \right) \\
P_{p,2i+1}=cos \left(\frac{p}{10000^{\frac{2i}{d}}} \right)
\end{equation*}

$P_{p,i}$ is the $i^{th}$ component of the embedding for the word located at the $p^{th}$ position in the sentence

In [117]:
class PositionalEncoding(keras.layers.Layer):
  def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
    super().__init__(dtype=dtype, **kwargs)
    if max_dims % 2 == 1:
      max_dims += 1   # max_dims must be even
    p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
    pos_emb = np.empty((1, max_steps, max_dims))
    pos_emb[0, :, ::2] = np.sin(p / 10_000 ** (2 * i / max_dims)).T
    pos_emb[0, :, 1::2] = np.cos(p / 10_000 ** (2 * i / max_dims)).T
    self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))

  def call(self, inputs):
    shape = tf.shape(inputs)
    return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]

In [118]:
embed_size = 512
max_steps = 500
vocab_size = 10_000

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
embeddings = keras.layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embeddings(encoder_inputs)
decoder_embeddings = embeddings(decoder_inputs)
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
encoder_in = positional_encoding(encoder_embeddings)
decoder_in = positional_encoding(decoder_embeddings)

### Multi-Head Attention

*Scaled Dot-Product Attention*

\begin{equation*}
Attention(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = softmax \left(\frac{\mathbf{Q}\mathbf{K}^T}{\sqrt{d_{keys}}} \right)\mathbf{V}
\end{equation*}

$\mathbf{Q}$ is a matrix containing one row per query. Its shape is $[n_{queries}, d_{keys}]$, where nqueries is the number of queries and dkeys is the number of dimensions of each query and each key.

$\mathbf{K}$ is a matrix containing one row per key. Its shape is $[n_{keys}, d_{keys}]$, where nkeys is the number of keys and values.

$\mathbf{V}$ is a matrix containing one row per value. Its shape is $[n_{keys}, d_{values}]$, where dvalues is the number of dimensions of each value.

# Exercises

In [100]:
# Question 8:
default_reber_grammar = [
  [('B', 1)],                 # state 0 = B => state 1
  [('T', 2), ('P', 3)],       # state 1 = T => state 2 or = P => state 3
  [('S', 2), ('X', 4)],
  [('T', 3), ('V', 5)],
  [("X", 3), ("S", 6)],
  [("P", 4), ("V", 6)],
  [("E", None)]
]

In [101]:
embedded_reber_grammar = [
  [("B", 1)],
  [("T", 2), ("P", 3)],
  [(default_reber_grammar, 4)],
  [(default_reber_grammar, 5)],
  [("T", 6)],
  [("P", 6)],
  [("E", None)]               
]

In [102]:
def generate_string(grammar):
  state = 0 
  output = []
  while state is not None:
    index = np.random.randint(len(grammar[state]))
    production, state = grammar[state][index]
    if isinstance(production, list):
      production = generate_string(grammar=production)
    output.append(production)
  return ''.join(output)

In [103]:
np.random.seed(42)

for _ in range(25):
  print(generate_string(default_reber_grammar), end=" ")

BTXXTTVPXTVPXTTVPSE BPVPSE BTXSE BPVVE BPVVE BTSXSE BPTVPXTTTVVE BPVVE BTXSE BTXXVPSE BPTTTTTTTTVVE BTXSE BPVPSE BTXSE BPTVPSE BTXXTVPSE BPVVE BPVVE BPVVE BPTTVVE BPVVE BPVVE BTXXVVE BTXXVVE BTXXVPXVVE 

In [104]:
POSSIBLE_CHARS = "BEPSTVX"

def generate_corrupted_string(grammar, chars=POSSIBLE_CHARS):
  good_string = generate_string(grammar)
  index = np.random.randint(len(good_string))
  good_char = good_string[index]
  bad_char = np.random.choice(sorted(set(chars) - set(good_char)))
  return good_string[:index] + bad_char + good_string[index + 1:]

In [105]:
np.random.seed(42)

for _ in range(25):
  print(generate_corrupted_string(embedded_reber_grammar), end=" ")

BTBPTTTPPXTVPXTTVPSETE BPBTXEEPE BPBPTVVVEPE BPBTSSSSXSETE BPTTXSEPE BTBPVPXTTTTTTEVETE BPBTXXSVEPE BSBPTTVPSETE BPBXVVEPE BEBTXSETE BPBPVPSXPE BTBPVVVETE BPBTSXSETE BPBPTTTPTTTTTVPSEPE BTBTXXTTSTVPSETE BBBTXSETE BPBTPXSEPE BPBPVPXTTTTVPXTVPXVPXTTTVVEVE BTBXXXTVPSETE BEBTSSSSSXXVPXTVVETE BTBXTTVVETE BPBTXSTPE BTBTXXTTTVPSBTE BTBTXSETX BTBTSXSSTE 

In [106]:
def string_to_ids(s, chars=POSSIBLE_CHARS):
  return [POSSIBLE_CHARS.index(c) for c in s]

In [107]:
string_to_ids('BTTTXXVEPS')

[0, 4, 4, 4, 6, 6, 5, 1, 2, 3]

In [108]:
def generate_dataset(size):
  good_strings = [string_to_ids(generate_string(embedded_reber_grammar))
                  for _ in range(size // 2)]
  bad_strings = [string_to_ids(generate_corrupted_string(embedded_reber_grammar))
                  for _ in range(size - size // 2)]
  all_strings = good_strings + bad_strings
  X= tf.ragged.constant(all_strings, ragged_rank=1)
  y = np.array([[1.] for _ in range(len(good_strings))] +
               [[0.] for _ in range(len(bad_strings))])
  return X, y

In [109]:
np.random.seed(42)

X_train, y_train = generate_dataset(10000)
X_valid, y_valid = generate_dataset(2000)

In [115]:
X_train[0]

<tf.Tensor: shape=(22,), dtype=int32, numpy=
array([0, 4, 0, 2, 4, 4, 4, 5, 2, 6, 4, 5, 2, 6, 4, 4, 5, 2, 3, 1, 4, 1],
      dtype=int32)>

In [116]:
y_train[0]

array([1.])

In [110]:
np.random.seed(42)
tf.random.set_seed(42)

In [114]:
embedding_size = 5

model = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS), output_dim=embedding_size),
    keras.layers.GRU(30),
    keras.layers.Dense(1, activation='sigmoid')
])

Epoch 1/20


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


InvalidArgumentError: ignored

In [112]:
optimizer = keras.optimizers.SGD(lr=0.2, momentum=0.95, nesterov=True)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [113]:
history = model.fit(X_train, y_train, epochs=20,
                    validation_data=(X_valid, y_valid))

Epoch 1/20


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


InvalidArgumentError: ignored

In [None]:
test_strings = ["BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE",
                "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE"]
X_test = tf.ragged.constant([string_to_ids(s) for s in test_strings], ragged_rank=1)

y_proba = model.predict(X_test)
print()
print("Estimated probability that these are Reber strings:")
for index, string in enumerate(test_strings):
  print("{}: {:.2f}%".format(string, 100 * y_proba[index][0]))

In [123]:
# Question 9:
from datetime import date

MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
  min_date = date(1000, 1 , 1).toordinal()
  max_date = date(9999, 12 , 31).toordinal()

  ordinals = np.random.randint(max_date - min_date, size=n_dates) + min_date
  dates = [date.fromordinal(ordinal) for ordinal in ordinals]

  x = [MONTHS[dt.month - 1] + ' ' + dt.strftime('%d, %Y') for dt in dates]
  y = [dt.isoformat() for dt in dates]
  return x, y

In [124]:
np.random.seed(42)

n_dates = 3
x_example, y_example = random_dates(n_dates)
print("{:25s}{:25s}".format("Input", "Target"))
print("-" * 50)
for idx in range(n_dates):
    print("{:25s}{:25s}".format(x_example[idx], y_example[idx]))

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [125]:
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)))) + "01234567890, "
INPUT_CHARS

'ADFJMNOSabceghilmnoprstuvy01234567890, '

In [126]:
OUTPUT_CHARS = '0123456789-'

In [127]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
  return [chars.index(c) for c in date_str]

In [169]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
  X_ids = [date_str_to_ids(dt, chars) for dt in date_strs]
  X = tf.ragged.constant(X_ids, ragged_rank=1)
  return (X+1).to_tensor()    # using 0 as the padding token ID

In [139]:
def create_dataset(n_dates):
  x, y = random_dates(n_dates)
  return prepare_date_strs(x, INPUT_CHARS), prepare_date_strs(y, OUTPUT_CHARS)

In [146]:
np.random.seed(42)

X_train, Y_train = create_dataset(10_000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

*First version: a very basic seq2seq model*

In [147]:
embedding_size = 32
max_output_length = Y_train.shape[1]

np.random.seed(42)
tf.random.set_seed(42)

encoder = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                           output_dim=embedding_size,
                           input_shape=[None]),
    keras.layers.LSTM(128)
])

decoder = keras.models.Sequential([
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation='softmax')
])

model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

In [148]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam',
              metrics=['accuracy'])

In [149]:
history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [150]:
def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
  return [''.join([('?' + chars)[index] for index in sequence])
          for sequence in ids]

In [151]:
X_new = prepare_date_strs(["September 17, 2009", "July 14, 1789",
                           "May 02, 2020", "July 14, 1789"])

In [152]:
max_input_length = X_train.shape[1]

def prepare_date_strs_padded(date_strs):
  X = prepare_date_strs(date_strs)
  if X.shape[1] < max_input_length:
    X = tf.pad(X, [[0, 0], [0, max_input_length - X.shape[1]]])
  return X

def convert_date_strs(date_strs):
  X = prepare_date_strs_padded(date_strs)
  ids = model.predict_classes(X)
  return ids_to_date_strs(ids)

In [153]:
convert_date_strs(["May 02, 2020", "July 14, 1789"])

['2020-05-02', '1789-07-14']

*Second version: feeding the shifted targets to the decoder*

In [154]:
sos_id = len(OUTPUT_CHARS) + 1 # start of sequence id

def shifted_output_sequences(Y):
  sos_tokens = tf.fill(dims=(len(Y),1), value=sos_id)
  return tf.concat([sos_tokens, Y[:, :-1]], axis=1)

In [155]:
X_train_decoder = shifted_output_sequences(Y_train)
X_valid_decoder = shifted_output_sequences(Y_valid)
X_test_decoder = shifted_output_sequences(Y_test)

In [156]:
np.random.seed(42)
tf.random.set_seed(42)

In [158]:
encoder_embedding_size = 32
decoder_embedding_size = 32
lstm_units = 128

encoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
encoder_embedding = keras.layers.Embedding(
    input_dim=len(INPUT_CHARS)+1,
    output_dim=encoder_embedding_size
)(encoder_input)
_, encoder_state_h, encoder_state_c = keras.layers.LSTM(
    lstm_units, return_state=True
)(encoder_embedding)
encoder_state = [encoder_state_h, encoder_state_c]

decoder_input = keras.layers.Input(shape=[None], dtype=tf.int32)
decoder_embedding = keras.layers.Embedding(
    input_dim=len(OUTPUT_CHARS) + 2,
    output_dim=decoder_embedding_size
)(decoder_input)
decoder_lstm_output = keras.layers.LSTM(lstm_units, return_sequences=True)(
    decoder_embedding, initial_state=encoder_state
)
decoder_output = keras.layers.Dense(len(OUTPUT_CHARS)+1,
                                    activation='softmax')(decoder_lstm_output)

model = keras.models.Model(inputs=[encoder_input, decoder_input],
                           outputs=[decoder_output])

In [159]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='nadam',
              metrics=['accuracy'])

In [160]:
history = model.fit([X_train, X_train_decoder], Y_train, epochs=10,
                    validation_data=([X_valid, X_valid_decoder], Y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [170]:
def predict_date_strs(date_strs):
  X = prepare_date_strs_padded(date_strs)
  Y_pred = tf.fill(dims=(len(X), 1), value=sos_id)
  for index in range(max_output_length):
    pad_size = max_output_length - Y_pred.shape[1]
    X_decoder = tf.pad(Y_pred, [[0, 0], [0, pad_size]])
    Y_probas_next = model.predict([X, X_decoder])[:, index:index+1]
    Y_pred_next = tf.argmax(Y_probas_next, axis=-1, output_type=tf.int32)
    Y_pred = tf.concat([Y_pred, Y_pred_next], axis=1)
  return ids_to_date_strs(Y_pred[:, 1:])

In [171]:
predict_date_strs(["July 14, 1789", "May 01, 2020"])

['1789-07-14', '2020-05-01']