In [1]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3


In [2]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
import numpy as np
import secrets
import time
import datetime
import re


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [3]:
from google.colab import drive

# Mount Google Drive and derive the project directory from the absolute mount
# point, so later file writes do not depend on the current working directory.
DRIVE_MOUNT_POINT = "/content/gdrive"
drive.mount(DRIVE_MOUNT_POINT)
drive_path = f"{DRIVE_MOUNT_POINT}/MyDrive/MachineLearning/HandsOnMachineLearning/chapter16"

Mounted at /content/gdrive


# 8. Embedded Reber grammar: classifying valid vs. corrupted strings

In [None]:
class Node:
    """One state of the grammar graph: a letter plus its candidate successors."""

    def __init__(self, letter, next_nodes=None):
        """Create a node.

        Args:
            letter: Character this node contributes to a generated string.
            next_nodes: Sequence of successor nodes, or None when the node has
                no successors assigned (yet).
        """
        self.letter = letter
        self.set_next_nodes(next_nodes)

    def set_next_nodes(self, next_nodes):
        """Replace the successor candidates of this node."""
        self.next_nodes = next_nodes

    def pick_next_node(self):
        """Return one successor, drawn with ``secrets.choice``."""
        candidates = self.next_nodes
        return secrets.choice(candidates)


class ReberString:
    """Graph of a single Reber string, from its "B" node to its own "E" node.

    Args:
        postlayers: Node(s) that follow this string's "E" node. May be a list
            or tuple of nodes, a single node (wrapped into a one-element list),
            or None to leave the "E" node without successors.
    """

    def __init__(self, postlayers):
        # Node.pick_next_node samples from a sequence, so normalise a bare
        # node into a one-element list; None keeps "E" terminal.
        if postlayers is None or isinstance(postlayers, (list, tuple)):
            end_successors = postlayers
        else:
            end_successors = [postlayers]

        # self.end: this string's own "E" node; self.start: its "B" entry node.
        self.end = Node("E")
        layer3_V = Node("V")
        layer3_S = Node("S")
        layer3_X = Node("X")
        layer3_P = Node("P")
        layer2_V = Node("V")
        layer2_T = Node("T")
        layer2_X = Node("X")
        layer2_S = Node("S")
        layer1_P = Node("P")
        layer1_T = Node("T")
        self.start = Node("B")

        self.end.set_next_nodes(end_successors)
        # Link the final letters to this instance's own end node (not to a
        # module-level node), so that `postlayers` is actually reachable and
        # the class does not depend on a global being defined.
        layer3_V.set_next_nodes([self.end])
        layer3_S.set_next_nodes([self.end])
        layer3_X.set_next_nodes([layer2_T, layer2_V])
        layer3_P.set_next_nodes([layer3_X, layer3_S])

        layer2_V.set_next_nodes([layer3_P, layer3_V])
        layer2_T.set_next_nodes([layer2_T, layer2_V])  # "T" may repeat
        layer2_X.set_next_nodes([layer3_X, layer3_S])
        layer2_S.set_next_nodes([layer2_S, layer2_X])  # "S" may repeat

        layer1_P.set_next_nodes([layer2_T, layer2_V])
        layer1_T.set_next_nodes([layer2_S, layer2_X])
        self.start.set_next_nodes([layer1_P, layer1_T])


# Outer wrapper around two inner Reber strings: each inner string is handed the
# post-layer node carrying the same letter as the pre-layer node leading into it.
end = Node("E")
postlayer_P = Node("P", [end])
postlayer_T = Node("T", [end])
# Successors must be sequences (Node.pick_next_node samples from them), so the
# single post-layer node is wrapped in a list instead of being passed bare.
string_P = ReberString([postlayer_P])
string_T = ReberString([postlayer_T])
prelayer_P = Node("P", [string_P.start])
prelayer_T = Node("T", [string_T.start])
start = Node("B", [prelayer_T, prelayer_P])

# Vocabulary; a letter's position in this list is its integer id.
unique_letters = ["B", "E", "P", "S", "T", "V", "X"]

In [None]:
def generate_valid_sequence(start, join=True):
    """Walk the grammar graph from ``start`` and collect the visited letters.

    Args:
        start: Node to begin at; must expose ``letter``, ``next_nodes`` and
            ``pick_next_node()``.
        join: When True return the letters as one string, otherwise as a list.

    Returns:
        The letters of every visited node, including the terminal node (the
        one without successors), as a ``str`` or a ``list`` of letters.
    """
    letters = []
    cur_node = start

    while True:
        # Record the letter first so the terminal node is not dropped.
        letters.append(cur_node.letter)
        if not cur_node.next_nodes:  # None or empty: nothing left to follow
            break
        cur_node = cur_node.pick_next_node()

    if join:
        return "".join(letters)
    return letters


def one_hot_encoding(string, unique_letters):
    """Map every character of ``string`` to its position in ``unique_letters``.

    Despite the name, the result is a list of integer indices, not one-hot
    vectors. ``list.index`` raises ``ValueError`` for an unknown character.
    """
    indices = []
    for char in string:
        indices.append(unique_letters.index(char))
    return indices


def create_valid_data(length, start, unique_letters):
    """Generate ``length // 2`` grammar strings, each encoded as letter indices.

    Args:
        length: Total dataset size; only half of it is produced here.
        start: Entry node of the grammar graph.
        unique_letters: Vocabulary used to turn letters into indices.

    Returns:
        A list of index lists, one per generated string.
    """
    n_samples = length // 2
    return [
        one_hot_encoding(generate_valid_sequence(start), unique_letters)
        for _ in range(n_samples)
    ]


def create_invalid_data(length, start, unique_letters, n_invalid_letters=1):
    """Generate ``length // 2`` corrupted strings, each encoded as letter indices.

    Each string starts as a generated grammar string; ``n_invalid_letters``
    distinct positions are then overwritten with a different letter drawn from
    ``unique_letters``.

    NOTE(review): the corrupted string is not re-checked against the grammar,
    so a substitution may happen to yield another valid string - confirm that
    this label noise is acceptable.

    Returns:
        A list of index lists, one per corrupted string.
    """
    rng = secrets.SystemRandom()
    corrupted = []
    for _ in range(length // 2):
        symbols = generate_valid_sequence(start, False)
        # Distinct positions to overwrite in this string.
        for position in rng.sample(range(len(symbols)), n_invalid_letters):
            original = symbols[position]
            alternatives = [letter for letter in unique_letters
                            if letter != original]
            symbols[position] = secrets.choice(alternatives)
        corrupted.append(one_hot_encoding("".join(symbols), unique_letters))
    return corrupted



def create_dataset(length, start, unique_letters, training=False):
    """Build a labelled dataset of valid (1.) and corrupted (0.) strings.

    Args:
        length: Total dataset size; half valid, half corrupted strings.
        start: Entry node of the grammar graph.
        unique_letters: Vocabulary used to turn letters into indices.
        training: Accepted but not used by this function.

    Returns:
        ``(X, y)``: a ragged tensor of index sequences (valid strings first)
        and a NumPy array holding one single-element label row per sequence.
    """
    positives = create_valid_data(length, start, unique_letters)
    negatives = create_invalid_data(length, start, unique_letters, 1)
    features = tf.ragged.constant(positives + negatives, ragged_rank=1)
    labels = np.array([[1.]] * len(positives) + [[0.]] * len(negatives))
    return features, labels


# Train / validation / test splits; each call draws fresh random strings,
# half valid and half corrupted.
X_train, y_train = create_dataset(7500, start, unique_letters, True)
X_valid, y_valid = create_dataset(1500, start, unique_letters)
X_test, y_test = create_dataset(1000, start, unique_letters)

In [None]:
# Timestamped checkpoint location under <drive_path>/models.
filepath = time.strftime(f"{drive_path}/models/reberstring_%Y_%m_%d-%H_%M_%S")
# Save the full model (not only the weights) at the end of an epoch, keeping
# only the best one according to the validation loss.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
    initial_value_threshold=None
)
embedding_size = 5

# Binary classifier over ragged sequences of letter ids:
# embedding -> GRU -> single sigmoid unit.
model = keras.models.Sequential([
    layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),
    layers.Embedding(input_dim=len(unique_letters), output_dim=embedding_size),
    layers.GRU(30),
    layers.Dense(1, activation="sigmoid")
])
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum=0.95, nesterov=True)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid),
          callbacks=[checkpoint_cb])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x78700f0a76a0>

# 9. Encoder–decoder model: converting dates from "April 22, 2019" to "2019-04-22"

In [51]:
# Value ranges the random dates are drawn from (years 1700-2030 inclusive).
YEARS = list(range(1700, 2031))
MONTHS = list(range(1, 13))
# Days 1-31 for every month; impossible combinations are rejected in create_dates.
DAYS = list(range(1, 32))
MONTHS_TEXT = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December"
]
# Input vocabulary: every character of the month names plus digits, comma and
# space, de-duplicated and sorted so the character-to-id mapping is stable.
INPUT_CHARS = "".join(sorted(set("".join(MONTHS_TEXT) + "0123456789, ")))
# Output vocabulary: the characters of a "YYYY-MM-DD" string.
OUTPUT_CHARS = "0123456789-"


def create_dates(length):
    """Draw ``length`` random dates and render each in two formats.

    Year, month and day are sampled independently from ``YEARS``, ``MONTHS``
    and ``DAYS``; combinations that raise ``ValueError`` (e.g. February 30)
    are discarded and redrawn.

    Returns:
        ``(contexts, targets)``: parallel lists of "%B %d, %Y" strings with
        " 0" replaced by " " (so the day has no leading zero), and
        "%Y-%m-%d" strings.
    """
    contexts, targets = [], []

    while len(targets) < length:
        try:
            date = datetime.datetime(
                secrets.choice(YEARS),
                secrets.choice(MONTHS),
                secrets.choice(DAYS),
            )
            context = date.strftime("%B %d, %Y").replace(' 0', ' ')
            target = date.strftime("%Y-%m-%d")
        except ValueError:
            # Invalid calendar date: try again.
            continue
        contexts.append(context)
        targets.append(target)

    return contexts, targets


def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Return the position in ``chars`` of every character of ``date_str``.

    ``chars.index`` raises ``ValueError`` for a character outside ``chars``.
    """
    ids = []
    for char in date_str:
        ids.append(chars.index(char))
    return ids


def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Encode date strings as a dense, zero-padded tensor of character ids.

    Every id is shifted up by one so that 0 stays free as the padding id.
    """
    id_sequences = []
    for date_str in date_strs:
        id_sequences.append(date_str_to_ids(date_str, chars))
    ragged_ids = tf.ragged.constant(id_sequences, ragged_rank=1)
    shifted_ids = ragged_ids + 1  # using 0 as the padding token ID
    return shifted_ids.to_tensor()


def create_dataset(n_dates):
    """Create ``n_dates`` random dates and return ``(inputs, targets)`` id tensors.

    Inputs are encoded with ``INPUT_CHARS`` and targets with ``OUTPUT_CHARS``.

    NOTE: this redefines the ``create_dataset`` name that the Reber-grammar
    section defined earlier in the notebook.
    """
    contexts, targets = create_dates(n_dates)
    encoder_ids = prepare_date_strs(contexts, INPUT_CHARS)
    target_ids = prepare_date_strs(targets, OUTPUT_CHARS)
    return encoder_ids, target_ids

In [52]:
# Independent random draws for the train / validation / test splits
# (these names replace the Reber-grammar splits defined earlier).
X_train, y_train = create_dataset(10000)
X_valid, y_valid = create_dataset(2000)
X_test, y_test = create_dataset(2000)

In [41]:
# Start-of-sequence id: one past the largest shifted output-character id.
sos_id = len(OUTPUT_CHARS) + 1


def shifted_output_sequences(Y):
    """Return ``Y`` shifted right by one step, with ``sos_id`` in the first column.

    The last column of ``Y`` is dropped, so the result has the same shape as ``Y``.
    """
    n_sequences = len(Y)
    sos_column = tf.fill(dims=(n_sequences, 1), value=sos_id)
    all_but_last = Y[:, :-1]
    return tf.concat([sos_column, all_but_last], axis=1)

# Decoder inputs: the target sequences shifted right by one step, starting with sos_id.
X_train_decoder = shifted_output_sequences(y_train)
X_valid_decoder = shifted_output_sequences(y_valid)
X_test_decoder = shifted_output_sequences(y_test)

In [42]:
# Seed NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

encoder_embedding_size = 32
decoder_embedding_size = 32
units = 128

encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
# NOTE(review): sequence_lengths is created but not referenced anywhere else
# in the visible notebook and is not one of the model inputs.
sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)

# +1: ids are shifted by one so that 0 can serve as the padding id.
encoder_embeddings = keras.layers.Embedding(
    len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)

# +2: the padding id (0) plus the start-of-sequence id (len(OUTPUT_CHARS) + 1).
# The layer object is kept so the inference decoder can reuse it.
decoder_embedding_layer = keras.layers.Embedding(
    len(OUTPUT_CHARS) + 2, decoder_embedding_size)
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Only the encoder's final states are used; they initialise the decoder.
encoder = keras.layers.LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]

sampler = tfa.seq2seq.sampler.TrainingSampler()

decoder_cell = keras.layers.LSTMCell(units)
# One logit per output id 0..len(OUTPUT_CHARS); sos_id is never a target.
output_layer = keras.layers.Dense(len(OUTPUT_CHARS) + 1)

decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,
                                                 sampler,
                                                 output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings,
    initial_state=encoder_state)
# The output layer produces logits; convert them to probabilities.
Y_proba = keras.layers.Activation("softmax")(final_outputs.rnn_output)

model = keras.models.Model(inputs=[encoder_inputs, decoder_inputs],
                           outputs=[Y_proba])
model.compile(optimizer="nadam", loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [53]:
# Train with the encoder input and the shifted targets as decoder input.
model.fit([X_train, X_train_decoder], y_train, epochs=15,
           validation_data=([X_valid, X_valid_decoder], y_valid))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
 69/313 [=====>........................] - ETA: 5s - loss: 6.7699e-04 - accuracy: 1.0000

KeyboardInterrupt: 

In [44]:
# Upper bound on decoding steps: the width of the training targets.
max_output_length = y_train.shape[1]

# At inference time the sampler feeds the decoder's own prediction back in,
# embedded with the same embedding layer that was used for training.
inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(
    embedding_fn=decoder_embedding_layer
)
# Reuses the trained decoder cell and output layer.
inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, inference_sampler, output_layer=output_layer,
    maximum_iterations=max_output_length
)
# One start-of-sequence token per example in the batch.
batch_size = tf.shape(encoder_inputs)[:1]
start_tokens = tf.fill(dims=batch_size, value=sos_id)
# NOTE(review): start_tokens is also passed as the first positional argument;
# presumably the sampler ignores that input - confirm against the TFA docs.
final_outputs, final_state, final_sequence_lengths = inference_decoder(
    start_tokens,
    initial_state=encoder_state,
    start_tokens=start_tokens,
    end_token=0  # 0 is the padding id
)

# Maps encoder inputs directly to the predicted output ids.
inference_model = keras.models.Model(inputs=[encoder_inputs],
                                     outputs=[final_outputs.sample_id])

In [45]:
# Width of the training inputs, used as the minimum width at prediction time.
max_input_length = X_train.shape[1]


def prepare_date_strs_padded(date_strs):
    """Encode date strings and right-pad with zeros up to ``max_input_length``.

    Batches that are already at least that wide are returned unchanged.
    """
    X = prepare_date_strs(date_strs)
    missing_columns = max_input_length - X.shape[1]
    if missing_columns > 0:
        X = tf.pad(X, [[0, 0], [0, missing_columns]])
    return X


def ids_to_date_strs(ids, chars=OUTPUT_CHARS):
    """Decode sequences of ids back into strings.

    Id 0 is rendered as "?"; id ``k`` (k >= 1) is rendered as ``chars[k - 1]``.
    """
    decoded = []
    for sequence in ids:
        symbols = [("?" + chars)[index] for index in sequence]
        decoded.append("".join(symbols))
    return decoded

In [55]:
def fast_predict_date_strs(date_strs):
    """Predict the output string for each input date string.

    The inputs are encoded and padded, passed through ``inference_model``,
    and the predicted ids are decoded back to text.
    """
    encoder_ids = prepare_date_strs_padded(date_strs)
    predicted_ids = inference_model.predict(encoder_ids)
    return ids_to_date_strs(predicted_ids)


# The training inputs never contain a zero-padded day (create_dates strips the
# leading zero), so inputs such as "July 01" are outside the training format
# and may be predicted less reliably than "July 1".
fast_predict_date_strs(["July 01, 1789", "July 25, 1789", "May 1, 2020", "May 14, 2020"])



['1789-07-01', '1789-07-25', '2020-05-01', '2020-05-14']

# 11. Generating text with a pretrained OpenAI GPT model

In [None]:
from transformers import TFOpenAIGPTLMHeadModel
from transformers import OpenAIGPTTokenizer


# Load the pretrained "openai-gpt" language model and its tokenizer.
# NOTE: this rebinds `model`, which previously held the date seq2seq model.
model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# Encode the prompt as a TensorFlow tensor of token ids, without special tokens.
prompt_text = "This royal throne of kings, this sceptred isle"
encoded_prompt = tokenizer.encode(prompt_text,
                                  add_special_tokens=False,
                                  return_tensors="tf")
encoded_prompt

In [None]:
# Number of continuations to sample, and how many tokens to add to the prompt.
num_sequences = 5
length = 40

# Sample continuations of the prompt; max_length counts the prompt tokens too,
# hence the prompt length is added to `length`.
generated_sequences = model.generate(
    input_ids=encoded_prompt,
    do_sample=True,
    max_length=length + len(encoded_prompt[0]),
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    num_return_sequences=num_sequences,
)

generated_sequences

In [None]:
# Decode every sampled sequence back to text, followed by an 80-dash separator line.
separator = "-" * 80
for token_ids in generated_sequences:
    decoded_text = tokenizer.decode(token_ids, clean_up_tokenization_spaces=True)
    print(decoded_text, separator, sep="\n")