# Problem
Recurrent neural networks can also be used as **generative models**.

This means that, in addition to being used as predictive models (making predictions),
they can learn the sequences of a problem domain and
then generate entirely new, plausible sequences for it.

In this lesson we are going to use the dataset: ``Alice’s Adventures in Wonderland``.

We are going to learn the dependencies between words and 
the conditional probabilities of words in sequences 
so that we can in turn generate wholly new and original sequences of words.

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from keras.saving import load_model
import sys
import os


## Data preprocessing

In [None]:
# Path of the plain-text corpus to load
dataset_path = "wonderland.txt"

# TODO: read the file at dataset_path and store its whole content in raw_text
...
print(raw_text)

In [None]:
# TODO: replace "\n", "\t", "\r" with " " (space)
# so that the dataset can later be split into words using spaces only
raw_text = ...

In [None]:
# Let's count how many words our dataset is made up of
# TODO: get all words (split raw_text by space)
words = ...

# TODO: get unique words
unique_words = ...

print(words)
print(unique_words)
print(len(unique_words))

In [None]:
# TODO: initialize our Encoder (words to int)
encoder = tf.keras.layers.TextVectorization(
    max_tokens=...,
    # n-grams could be used as well: set ngrams to the desired size (e.g. 3)
    ngrams=None, # 3
)

# TODO: compute the vocabulary using the encoder's adapt method
...

In [None]:
# TODO: get our vocabulary
vocab = ...

# Show the first 100 vocabulary entries, then the vocabulary size
display(vocab[:100])
print(len(vocab))

In [None]:
# Number of words in each input sequence
sequence_len = 100

In [None]:
# Hacking with sequences: inspect a single sample before building the dataset
# TODO: get first sequence
first_sequence = ...

display(first_sequence)

# TODO: convert first sequence (list[str]) into a single string
display(...)

# Encoding of that sequence: the words are joined with spaces and passed to the encoder
display(encoder(" ".join(first_sequence)))

In [None]:
# Generate our supervised dataset:
# every sample needs an input text and a label.
# The input text is a sentence of sequence_len words;
# the label (ground truth) is the word that comes next.
# Eg (with a 4-word sentence):
#    input text: Alice is taking a
#    label:      nap
dataset_x = []  # text inputs
dataset_y = []  # labels

# one sample per start position i
for i in range(len(words)-sequence_len):
    # TODO: get sequence
    seq_input = ...
    # TODO: get label
    seq_output = ...

    dataset_x.append(seq_input)
    dataset_y.append(seq_output)

# TODO: numpy conversion
dataset_x = ...
dataset_y = ...

In [None]:
# Shapes of the inputs and of the labels, one per line
print(dataset_x.shape, dataset_y.shape, sep="\n")

In [None]:
x = dataset_x.reshape(-1, 1) # single column: shape (n_samples, 1)

# Labels are one-hot encoded
# TODO: get y encoding
y = ...
# TODO: compute one-hot encoding
y = ...

In [None]:
# Shapes first, then (after an empty line) the content of inputs and labels
for arr in (x, y):
    print(arr.shape)
print()
for arr in (x, y):
    print(arr)

In [None]:
# TODO: create a TensorFlow Dataset from x and y
dataset = ...

# Show the dataset object and its number of elements
display(dataset)
print(len(dataset))

In [None]:
# Peek at the first (input, label) pair of the dataset
for text, label in dataset.take(1):
    # index of the largest entry of the one-hot label
    label_idx = np.argmax(label)
    # text holds a single byte string: decode it as utf-8 to display it properly
    print("IN:", text[0].numpy().decode('utf-8'))
    print("LABEL:", label, f"({label_idx} --> {vocab[label_idx]})")

In [None]:
# Train/test split sizes: 5% of the samples are kept for testing
dataset_size = len(dataset)
testing_len = int(dataset_size*0.05)
training_len = dataset_size - testing_len

print(training_len, testing_len, sep="\n")

In [None]:
# Generate the train/test TensorFlow Datasets:
# both are divided in batches and prefetched; only the training set is shuffled
# TODO: get training dataset, shuffle, divide in batch and apply prefetch
train_dataset = ...
# TODO: get testing dataset, divide in batch and apply prefetch
test_dataset = ...

In [None]:
# Shapes of the inputs and labels of the first training batch
first_batch = train_dataset.take(1)
for text, label in first_batch:
    print(text.shape, label.shape, sep="\n")

## LSTM Model

In [None]:
# Define our Keras model using an LSTM layer
model = tf.keras.Sequential([
    # TODO: Encoder layer
    ...,
    # Trainable Embedding layer
    tf.keras.layers.Embedding(
        input_dim=..., # TODO
        output_dim=256,
        mask_zero=True,  # remember padding and masking
    ),
    # TODO: LSTM layer with 256 units
    ...,
    # Dropout layer
    tf.keras.layers.Dropout(0.2),
    # TODO: Output layer, using softmax
    ...,
])

# categorical cross-entropy because the labels are one-hot encoded
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Training callbacks: both monitor the training loss (lower is better)

# stop the training once the loss has not improved for 5 epochs
es_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    mode="min",
    patience=5,
    verbose=1,
)

# save the model to disk only when the monitored loss improves
model_filepath = "lstm_alice-mytraining.keras"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Short training run (try with few epochs): a model pre-trained for
# 100 epochs is available as well
history = model.fit(
    x=train_dataset,
    epochs=5,
    callbacks=[es_callback, checkpoint_callback],
)

In [None]:
# Training curves: loss on the left panel, accuracy on the right one
fig, axis = plt.subplots(1, 2)
loss_ax, acc_ax = axis

loss_ax.plot(history.history['loss'])
loss_ax.legend(["training loss"])

acc_ax.plot(history.history['accuracy'], color="tab:orange")
acc_ax.legend(["training accuracy"])

fig.suptitle("Training phase")
plt.show()

In [None]:
def test_model_output(model, train_dataset):
    """Print predicted vs. true next word for the first batch of a dataset.

    Args:
        model: model whose ``predict`` method is called on the batch inputs.
        train_dataset: dataset yielding ``(text, label)`` batches; only the
            first batch is used.

    The global ``vocab`` is used to turn token indices back into words.
    """
    for text, label in train_dataset.take(1):
        res = model.predict(text)
        # TODO: convert one-hot encoding into token
        pred_tokens = ...

        print("Input Text:", text.shape)
        print("Model Preds:", res.shape)
        print("Pred Tokens:", pred_tokens.shape)
        print()

        # one report per sample: input text, predicted word, expected word
        for i, t in enumerate(text):
            print(t[0].numpy().decode('utf-8'))
            print("PRED:", vocab[pred_tokens[i]])
            print("TRUE:", vocab[np.argmax(label[i])])
            print()

In [None]:
# Compare predicted and true next words on one training batch
test_model_output(model=model, train_dataset=train_dataset)

In [None]:
# load pre-trained model
model_filepath="lstm_alice-pretrained-100e.keras"
# TODO: load pre-trained model
model_pretrained = ...
# summary() prints the table itself and returns None: wrapping it in
# display() would only add a stray "None" to the cell output
model_pretrained.summary()

In [None]:
# Same comparison on one training batch, using the pre-trained model
test_model_output(model=model_pretrained, train_dataset=train_dataset)

In [None]:
def test_model_random_texts(model):
    """Print three fixed prompts, each followed by the word the model ranks highest.

    The global ``vocab`` is used to turn the selected index into a word.
    """
    # every prompt is wrapped in its own list: one single-string row per sample
    in_text = [
        ["The unicorn is flying into the"],
        ["Monkey are very"],
        ["Alice is taking a"],
    ]
    batch = tf.convert_to_tensor(in_text)
    res = model.predict(batch)

    for (prompt,), scores in zip(in_text, res):
        best_idx = np.argmax(scores)
        print(prompt, end=" ")
        print(vocab[best_idx])

In [None]:
# Next-word prediction on the fixed prompts with our own model
test_model_random_texts(model=model)

In [None]:
# Next-word prediction on the fixed prompts with the pre-trained model
test_model_random_texts(model=model_pretrained)

In [None]:
def autoregressive(model, tokens=20, sentence=2):
    """Extend one of the built-in prompts word by word and print the result.

    At every step the whole text generated so far is fed back to the model;
    the argmax of its output is used as an index into the global ``vocab``
    and the corresponding word is appended to the text.

    Args:
        model: model whose ``predict`` method accepts a tensor holding one
            raw string.
        tokens: number of words to generate.
        sentence: index of the starting prompt in the built-in list (0 to 2).

    Raises:
        IndexError: if ``sentence`` is not a valid index of the prompt list.
    """
    sentences = [
        "Alice is taking a",
        "Monkey are very",
        "The unicorn is flying into the",
    ]
    in_text = sentences[sentence]

    for _ in range(tokens):
        # verbose=0: predict() runs once per generated word, and its default
        # progress bar would flood the cell output
        res = model.predict(tf.convert_to_tensor([[in_text]]), verbose=0)
        next_word = vocab[np.argmax(res)]
        in_text += f" {next_word}"

    print(in_text)

In [None]:
# Autoregressive generation with our own model, once per built-in prompt
for prompt_idx in range(3):
    autoregressive(model, sentence=prompt_idx)

In [None]:
# Autoregressive generation with the pre-trained model, once per built-in prompt
for prompt_idx in range(3):
    autoregressive(model_pretrained, sentence=prompt_idx)

## Transformers

In [None]:
from keras_hub.layers import TokenAndPositionEmbedding, TransformerEncoder

In [None]:
# Define our model using a Transformer layer
model_transformer = Sequential([
    # TODO: Encoder layer
    ...,
    # Embedding with PositionEncoding layer
    # This layer creates a keras.layers.Embedding token embedding and a
    # keras_hub.layers.PositionEmbedding position embedding and sums their output when called
    TokenAndPositionEmbedding(
        vocabulary_size=..., # TODO
        sequence_length=..., # TODO
        embedding_dim=256, # model dim (d)
        mask_zero=True,
    ),
    # TODO: TransformerEncoder layer
    #       feedforward network dim: 64
    #       MultiHead attention heads: 8
    TransformerEncoder(
        ...,
    ),
    # The TransformerEncoder layer gives us a tensor of shape
    # (batch_size, sequence_length, embedding_dim): one dimension more than we need,
    # because our final output shape must be (batch_size, vocab_length)
    # (vocab_length --> our label is one-hot encoded !!).
    # Computing the mean over the temporal dimension (sequence length) removes it:
    # the result has shape (batch_size, embedding_dim)
    tf.keras.layers.GlobalAveragePooling1D(),
    # Dropout layer
    tf.keras.layers.Dropout(0.2),
    # Output layer: one softmax score per vocabulary entry
    tf.keras.layers.Dense(len(vocab), activation='softmax'),
])

# categorical cross-entropy because the labels are one-hot encoded
model_transformer.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Want to check the tensor shapes by yourself?
# Uncomment the following line and try commenting out some of the model's layers
# model_transformer.predict(tf.convert_to_tensor([["im a test"]])).shape

In [None]:
# Checkpoint callback: save the model to disk only when the training loss improves
model_filepath = "transformers_alice-mytraining.keras"
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=model_filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Short training run (try using few epochs): a version pre-trained for
# 100 epochs is available as well
history = model_transformer.fit(
    x=train_dataset,
    epochs=5,
    callbacks=[checkpoint_callback],
)

In [None]:
# Training curves: loss on the left panel, accuracy on the right one
fig, axis = plt.subplots(1, 2)
loss_ax, acc_ax = axis

loss_ax.plot(history.history['loss'])
loss_ax.legend(["training loss"])

acc_ax.plot(history.history['accuracy'], color="tab:orange")
acc_ax.legend(["training accuracy"])

fig.suptitle("Training phase")
plt.show()

In [None]:
# load pre-trained model
model_filepath="transformers_alice-pretrained-100e.keras"
# TODO: load pre-trained model
model_transformer_pretrained = ...
# summary() prints the table itself and returns None: wrapping it in
# display() would only add a stray "None" to the cell output
model_transformer_pretrained.summary()

In [None]:
# Compare predicted and true next words on one training batch (our Transformer)
test_model_output(model=model_transformer, train_dataset=train_dataset)

In [None]:
# Same comparison on one training batch, using the pre-trained Transformer
test_model_output(model=model_transformer_pretrained, train_dataset=train_dataset)

In [None]:
# Next-word prediction on the fixed prompts: our Transformer first, then the pre-trained one
test_model_random_texts(model=model_transformer)
test_model_random_texts(model=model_transformer_pretrained)

In [None]:
# Autoregressive generation with our Transformer, once per built-in prompt
for prompt_idx in range(3):
    autoregressive(model_transformer, sentence=prompt_idx)

In [None]:
# Autoregressive generation with the pre-trained Transformer, once per built-in prompt
for prompt_idx in range(3):
    autoregressive(model_transformer_pretrained, sentence=prompt_idx)

## LSTM vs Transformers

In [None]:
# TODO: evaluate the performance of each of the four models on the testing dataset
eval_lstm = ...
eval_lstm_pretrained = ...
eval_transformers = ...
eval_transformers_pretrained = ...

In [None]:
# Evaluation results, one line per model
results = (
    ("LSTM (mytrain):", eval_lstm),
    ("LSTM (pre-trained):", eval_lstm_pretrained),
    ("Transformers (mytrain):", eval_transformers),
    ("Transformers (pre-trained):", eval_transformers_pretrained),
)
for title, metrics in results:
    print(title, metrics)