## Setup

This section installs required packages, and initializes some imports and helper functions to keep the notebook code below neater.

In [1]:
# Optional one-off environment setup (uncomment on a fresh machine).
# The version spec is quoted: unquoted, the shell parses ">=2.0" as an output
# redirection to a file named "=2.0" and silently drops the version constraint.
#%pip uninstall tensorflow -yq
#%pip install "tensorflow-gpu>=2.0" gpustat -Uq

# GPU selection
import os

# Enumerate CUDA devices by PCI bus ID so the index below is stable across runs
# (presumably matching the order printed by `gpustat` -- confirm on your machine).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Default to GPU 3, but keep a CUDA_VISIBLE_DEVICES value that was already set
# outside the notebook so the choice can be overridden without editing this cell.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "3")

In [2]:
from IPython.core.display import display, HTML

def export_html(result, max_activation):
    """Render (token, activation) pairs as colour-coded HTML ``<span>`` elements.

    Positive activations are shaded red and non-positive activations blue; the
    colour intensity grows with ``abs(activation) / max_activation``.

    Args:
        result: iterable of ``(word, activation)`` pairs.
        max_activation: value used to normalise the activations.

    Returns:
        A string of concatenated ``<span>`` tags (empty for an empty input).
    """
    # Function-scope import keeps this helper cell self-contained.
    from html import escape

    # A tiny epsilon avoids a division by zero when every activation is 0.
    scale = max_activation + 1e-8
    spans = []

    for word, activation in result:
        # Clamp at 1.0 so an activation larger than max_activation cannot
        # produce a negative (invalid) colour channel.
        strength = min(abs(activation) / scale, 1.0)
        colour = str(int(255 - strength * 255))

        if activation > 0:
            rgb = "255," + colour + "," + colour
        else:
            rgb = colour + "," + colour + ",255"

        tag_open = "<span style='background-color: rgb(" + rgb + ");'>"
        # Escape the token: characters such as '<' or '&' in the corpus would
        # otherwise be parsed as markup and corrupt the rendered output.
        spans.append(" ".join([tag_open, escape(str(word)), "</span>"]))

    return "".join(spans)

In [3]:
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = "retina"
import tensorflow.compat.v2 as tf
from tensorflow.keras import layers

In [4]:
def _run_training_stage(model, x_train, y_train, learning_rate, epochs,
                        batch_size, shuffle, patience=None, verbose=2):
    """Compile, fit and evaluate `model` for one training stage.

    A fresh Adam optimizer is created for every stage. When `patience` is
    given, training stops early on the *training* accuracy ('acc') and the
    best weights are restored. Returns the list of per-epoch losses.
    """
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=opt,
                  metrics=["acc"])

    callbacks = []
    if patience is not None:
        callbacks.append(
            tf.keras.callbacks.EarlyStopping(monitor='acc',
                                             restore_best_weights=True,
                                             patience=patience))

    history = model.fit(x_train, y_train, epochs=epochs,
                        batch_size=batch_size, shuffle=shuffle,
                        callbacks=callbacks, verbose=verbose)

    # Evaluation is on the training data itself; no held-out split is used here.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

    return history.history['loss']


def train_simple_lm(model, x_train, y_train, verbose=2, test=False):
    """Train a next-token model in several stages and plot the loss curve.

    Args:
        model: Keras model to train (re-compiled at every stage).
        x_train: input token windows.
        y_train: target token for each window.
        verbose: verbosity passed to `model.fit`.
        test: if True, run only the 3-epoch warm-up stage (quick smoke test).

    Returns:
        The trained model (the same object that was passed in).
    """
    start_time = time.time()
    losses = []

    print("[Phase 1/3] Warming up...")
    losses.extend(_run_training_stage(model, x_train, y_train,
                                      learning_rate=0.001, epochs=3,
                                      batch_size=128, shuffle=False,
                                      verbose=verbose))

    if not test:
        print("[Phase 2/3] Fast training...")
        losses.extend(_run_training_stage(model, x_train, y_train,
                                          learning_rate=0.01, epochs=100,
                                          batch_size=256, shuffle=False,
                                          patience=3, verbose=verbose))

        print("[Phase 3/3] Train to convergence...")
        losses.extend(_run_training_stage(model, x_train, y_train,
                                          learning_rate=0.001, epochs=200,
                                          batch_size=256, shuffle=True,
                                          patience=3, verbose=verbose))

        # Second half of phase 3: lower learning rate, smaller batches and
        # more patience for the final fine-tuning pass.
        losses.extend(_run_training_stage(model, x_train, y_train,
                                          learning_rate=0.0001, epochs=200,
                                          batch_size=128, shuffle=True,
                                          patience=10, verbose=verbose))

    # One curve covering every stage that was run.
    plt.plot(losses)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()

    end_time = time.time()

    print("Done! Training took", int(end_time-start_time), "seconds")

    return model

# Exploring RNNs

In this notebook, we will train an **LSTM** and a vanilla **RNN** (Keras `SimpleRNN`) on a small language modelling task and visualize how an LSTM or RNN works when learning how to model sequences.

We will visualize the **activations**, **hidden states** and **information dependency** inside these models.

In [5]:
!gpustat

[1m[37m20bcf3adfadd           [m  Fri Jan  3 08:28:39 2020  [1m[30m430.50[m
[36m[0][m [34mTesla V100-DGXS-16GB[m |[31m 44'C[m, [32m  0 %[m | [36m[1m[33m  138[m / [33m16155[m MB |
[36m[1][m [34mTesla V100-DGXS-16GB[m |[31m 44'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m16158[m MB |
[36m[2][m [34mTesla V100-DGXS-16GB[m |[31m 46'C[m, [32m  0 %[m | [36m[1m[33m  469[m / [33m16158[m MB |
[36m[3][m [34mTesla V100-DGXS-16GB[m |[31m 44'C[m, [32m  0 %[m | [36m[1m[33m    0[m / [33m16158[m MB |


In [6]:
# Number of tokens (characters) in each input window fed to the models.
seq_len = 64
# Width used for the embedding and for the recurrent / transformer layers.
model_dim = 128

## Load Text Data

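We will load a large Shakespeare corpus (about 5 million characters, downloaded from MIT OpenCourseWare). A short paragraph from Wikipedia about NVIDIA is kept, commented out, in the next cell as a much smaller alternative.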

The goal here is to train an LSTM and RNN to autocomplete the passage.

In [7]:
#text = "Nvidia Corporation is more commonly referred to as Nvidia. It was formerly stylized as nVidia on products from the mid 90s to early 2000s. Nvidia is an American technology company incorporated in Delaware and based in Santa Clara, California. Nvidia designs graphics processing units for the gaming and professional markets, as well as system on a chip units for the mobile computing and automotive market. Nvidia primary GPU product line, labeled GeForce, is in direct competition with Advanced Micro Devices Radeon products. Nvidia expanded its presence in the gaming industry with its handheld Shield Portable, Shield Tablet, and Shield Android TV. Since 2014, Nvidia has diversified its business focusing on four markets: gaming, professional visualization, data centers, and auto. Nvidia is also now focused on artificial intelligence. In addition to GPU manufacturing, Nvidia provides parallel processing capabilities to researchers and scientists that allow them to efficiently run high performance applications. They are deployed in supercomputing sites around the world. "

In [8]:
# Download the corpus (Keras caches it locally, so re-runs do not re-download).
text_url = "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt"
text_path = tf.keras.utils.get_file("shakespeare.txt", text_url)

# Read through a context manager so the file handle is closed deterministically.
# str.strip() already removes the trailing newline along with other whitespace.
with open(text_path) as text_file:
    text_lines = [line.strip() for line in text_file]

# Join all lines into a single space-separated string.
text = " ".join(text_lines)

In [9]:
# Normalise the corpus: lowercase, collapse each pair of spaces into one
# (a single pass, so longer runs are only shortened), then make spaces
# visible by replacing them with underscores.
text = text.lower().replace("  ", " ").replace(" ", "_")
text_len = len(text)
print("Text length:", text_len)

# Character-level vocabulary. The +1 presumably reserves index 0, which the
# tokenizer below is not expected to assign -- TODO confirm.
vocab = sorted(set(text))
vocab_size = len(vocab) + 1
print("Vocab size:", vocab_size)

# Character-level tokenizer fitted on the normalised corpus.
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True, char_level=True)
tokenizer.fit_on_texts([text])

# Integer id for every character of the corpus (assumed one id per character,
# since the loop below indexes `tokens` with offsets derived from `text_len`).
tokens = tokenizer.texts_to_sequences([text])[0]

x_train = []
y_train = []

# Slide a window of `seq_len` tokens over the corpus with a stride of half a
# window; the target is the single token that immediately follows the window.
for i in range(0, text_len-seq_len, int(seq_len/2)):
    x_train.append(tokens[i:i+seq_len])
    y_train.append(tokens[i+seq_len])
    
print("Training examples:", len(y_train))

# Convert the Python lists to NumPy arrays for Keras.
x_train, y_train = np.asarray(x_train), np.asarray(y_train)

Text length: 4960178
Vocab size: 64
Training examples: 155004


## Build LSTM model

In [10]:
# Set to True to train from scratch (weights saved to lstm.h5);
# False downloads pretrained weights instead.
TRAIN = False

# Reset Keras' global state before building a new model, and turn XLA JIT off.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(False)

# Single-layer LSTM language model: token ids -> embedding -> LSTM -> softmax
# over the vocabulary.
l_input = layers.Input(shape=(seq_len,))
l_embed = layers.Embedding(vocab_size, model_dim)(l_input)
# return_state=True also returns the final hidden and cell states; only the
# layer output (l_rnn_1) feeds the prediction head below.
l_rnn_1, state_h, state_c = layers.LSTM(model_dim,
                                        return_state=True,
                                        return_sequences=False)(l_embed)
preds = layers.Dense(vocab_size,
                     activation="softmax")(l_rnn_1)

model = tf.keras.models.Model(inputs=l_input, outputs=preds)
model.summary()

if TRAIN:
    model = train_simple_lm(model, x_train, y_train, verbose=1)
    model.save_weights("lstm.h5")
else:
    print("Loading pretrained LSTM model:")
    model_url = "https://github.com/OpenSUTD/machine-learning-workshop/releases/download/v0.0.03/lstm.h5"
    # Stored locally as "lstm_large.h5", a different name from the release
    # asset and from the file written by the TRAIN branch above.
    model_path = tf.keras.utils.get_file("lstm_large.h5", model_url)
    model.load_weights(model_path)
    # Compile so that evaluate() has a loss and an accuracy metric.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    # Scores are computed on the training windows, not on held-out data.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           8192      
_________________________________________________________________
lstm (LSTM)                  [(None, 128), (None, 128) 131584    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
Total params: 148,032
Trainable params: 148,032
Non-trainable params: 0
_________________________________________________________________
Loading pretrained LSTM model:
155004/1 - 4s - loss: 1.1887 - acc: 0.6372
 - Loss: 1.171395713586333
 - Acc:  0.6372158


In [11]:
def plot_dependency(n):
    """Display how strongly each input character influences the next-char loss.

    Takes the `seq_len` characters of the global `text` starting at offset `n`,
    runs them through the global `model`, and colours every input character by
    the summed absolute gradient of the loss with respect to the first
    trainable weight (assumed to be the embedding matrix, whose gradient is
    expected to hold one row per input position -- TODO confirm for models
    whose first weight is not a Keras Embedding).

    Args:
        n: start offset of the window in `text`; `n + seq_len` must be a valid
           index, because that character is used as the prediction target.
    """
    # Function-scope import keeps this cell self-contained.
    from html import escape

    input_text = text[n:n+seq_len]
    target_char = text[n+seq_len]

    input_tokens = tokenizer.texts_to_sequences([input_text])
    label = tokenizer.texts_to_sequences([[target_char]])
    loss = tf.keras.losses.SparseCategoricalCrossentropy()
    x = tf.convert_to_tensor(input_tokens, dtype=tf.float32)
    y_true = tf.convert_to_tensor(label, dtype=tf.float32)

    # Only the forward pass is recorded. Trainable weights are watched
    # automatically, so no explicit watch() is needed for these gradients.
    with tf.GradientTape() as g:
        y = model(x)
        loss_value = loss(y_true, y)
    # Computed outside the tape context so the gradient ops are not recorded.
    grads = g.gradient(loss_value, model.trainable_weights)

    input_grads = grads[0].values.numpy()
    # Collapse the embedding dimension into one magnitude per input position.
    input_grads = np.sum(np.abs(input_grads), axis=-1)

    result = zip(input_text, input_grads)
    output = export_html(result, input_grads.max())
    # Escape the target character so '<', '>' or '&' cannot break the markup.
    output = output + " &nbsp; -> &nbsp; " + escape(target_char)
    output = "<tt>" + output + "</tt>"
    display(HTML(output))

In [12]:
# Character offset into `text` where the visualised windows start.
s = 200000
# Render the dependency heat-map for 20 consecutive offsets; plot_dependency
# reads the global `model`, i.e. whichever model was built most recently.
for i in range(s,s+20):
    plot_dependency(i)

## Build RNN model

In [13]:
# Set to True to train from scratch (weights saved to rnn.h5);
# False downloads pretrained weights instead.
TRAIN = False

# improve vanilla RNN training speed
# LSTM doesn't need this since it has a cuDNN implementation
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True)
unroll = True

# Same architecture as the LSTM model, with a SimpleRNN as the recurrent layer.
# NOTE: this rebinds the global `model`, which plot_dependency reads.
l_input = layers.Input(shape=(seq_len,))
l_embed = layers.Embedding(vocab_size, model_dim)(l_input)
# return_state=True also returns the final hidden state `h`; only the layer
# output (l_rnn_1) feeds the prediction head below.
l_rnn_1, h = layers.SimpleRNN(model_dim,
                              unroll=unroll,
                              return_state=True,
                              return_sequences=False)(l_embed)
preds = layers.Dense(vocab_size,
                     activation="softmax")(l_rnn_1)

model = tf.keras.models.Model(inputs=l_input, outputs=preds)
model.summary()

if TRAIN:
    model = train_simple_lm(model, x_train, y_train, verbose=2)
    model.save_weights("rnn.h5")
else:
    print("Loading pretrained RNN model:")
    model_url = "https://github.com/OpenSUTD/machine-learning-workshop/releases/download/v0.0.03/rnn.h5"
    # Stored locally as "rnn_large.h5", a different name from the release asset.
    model_path = tf.keras.utils.get_file("rnn_large.h5", model_url)
    model.load_weights(model_path)
    # Compile so that evaluate() has a loss and an accuracy metric.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    # Scores are computed on the training windows, not on held-out data.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           8192      
_________________________________________________________________
simple_rnn (SimpleRNN)       [(None, 128), (None, 128) 32896     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
Total params: 49,344
Trainable params: 49,344
Non-trainable params: 0
_________________________________________________________________
Loading pretrained RNN model:
155004/1 - 9s - loss: 1.5443 - acc: 0.5454
 - Loss: 1.4836253579953498
 - Acc:  0.54537946


In [14]:
# Same 20 offsets as the first visualisation (reuses the global `s`), rendered
# with whichever model is currently bound to the global `model`.
for i in range(s,s+20):
    plot_dependency(i)

## Bi-LSTM

In [15]:
# Set to True to train from scratch (weights saved to bilstm.h5);
# False downloads pretrained weights instead.
TRAIN = False

# Reset Keras' global state before building a new model, and turn XLA JIT off.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(False)

# Bidirectional LSTM language model; the saved summary in this notebook shows
# the Bidirectional layer output as (None, 256), i.e. 2 * model_dim.
# NOTE: this rebinds the global `model`, which plot_dependency reads.
l_input = layers.Input(shape=(seq_len,))
l_embed = layers.Embedding(vocab_size, model_dim)(l_input)
l_rnn_1 = layers.Bidirectional(layers.LSTM(model_dim,
                                           return_sequences=False))(l_embed)
preds = layers.Dense(vocab_size,
                     activation="softmax")(l_rnn_1)

model = tf.keras.models.Model(inputs=l_input, outputs=preds)
model.summary()

if TRAIN:
    model = train_simple_lm(model, x_train, y_train, verbose=2)
    model.save_weights("bilstm.h5")
else:
    # The message says "LSTM"; the weights loaded here are for the Bi-LSTM.
    print("Loading pretrained LSTM model:")
    model_url = "https://github.com/OpenSUTD/machine-learning-workshop/releases/download/v0.0.03/bilstm.h5"
    # Stored locally as "bilstm_large.h5", a different name from the release asset.
    model_path = tf.keras.utils.get_file("bilstm_large.h5", model_url)
    model.load_weights(model_path)
    # Compile so that evaluate() has a loss and an accuracy metric.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    # Scores are computed on the training windows, not on held-out data.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           8192      
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               263168    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
Total params: 287,808
Trainable params: 287,808
Non-trainable params: 0
_________________________________________________________________
Loading pretrained LSTM model:
155004/1 - 4s - loss: 0.9645 - acc: 0.7105
 - Loss: 0.9497118522415118
 - Acc:  0.7105236


In [16]:
# Same 20 offsets as the first visualisation (reuses the global `s`), rendered
# with whichever model is currently bound to the global `model`.
for i in range(s,s+20):
    plot_dependency(i)

## Stacked LSTM

In [17]:
# Set to True to train from scratch (weights saved to deeplstm.h5);
# False downloads pretrained weights instead.
TRAIN = False

# Reset Keras' global state before building a new model, and turn XLA JIT off.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(False)

# Three stacked LSTM layers. The first two return the full sequence so the
# next LSTM receives one vector per time step; the last returns a single vector.
# NOTE: `l_embed` is reused for the intermediate LSTM outputs, so after this
# cell it no longer refers to the embedding output.
# NOTE: this rebinds the global `model`, which plot_dependency reads.
l_input = layers.Input(shape=(seq_len,))
l_embed = layers.Embedding(vocab_size, model_dim)(l_input)
l_embed = layers.LSTM(model_dim,
                      return_sequences=True)(l_embed)
l_embed = layers.LSTM(model_dim,
                      return_sequences=True)(l_embed)
l_rnn_1 = layers.LSTM(model_dim,
                      return_sequences=False)(l_embed)
preds = layers.Dense(vocab_size,
                     activation="softmax")(l_rnn_1)

model = tf.keras.models.Model(inputs=l_input, outputs=preds)
model.summary()

if TRAIN:
    model = train_simple_lm(model, x_train, y_train, verbose=2)
    model.save_weights("deeplstm.h5")
else:
    # The message says "LSTM"; the weights loaded here are for the stacked LSTM.
    print("Loading pretrained LSTM model:")
    model_url = "https://github.com/OpenSUTD/machine-learning-workshop/releases/download/v0.0.03/deeplstm.h5"
    # Stored locally as "deeplstm_large.h5", a different name from the release asset.
    model_path = tf.keras.utils.get_file("deeplstm_large.h5", model_url)
    model.load_weights(model_path)
    # Compile so that evaluate() has a loss and an accuracy metric.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    # Scores are computed on the training windows, not on held-out data.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 64)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           8192      
_________________________________________________________________
lstm (LSTM)                  (None, 64, 128)           131584    
_________________________________________________________________
lstm_1 (LSTM)                (None, 64, 128)           131584    
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
Total params: 411,200
Trainable params: 411,200
Non-trainable params: 0
_______________________________________________________

In [18]:
# Same 20 offsets as the first visualisation (reuses the global `s`), rendered
# with whichever model is currently bound to the global `model`.
for i in range(s,s+20):
    plot_dependency(i)

In [19]:
from xfmers import layers, utils, ops

In [20]:
# Set to True to train from scratch (weights saved to transformer_xs.h5);
# False downloads pretrained weights instead.
TRAIN = False

tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True)

# NOTE: `layers` and `ops` here are the xfmers modules imported in the cell
# above, which shadow `tensorflow.keras.layers`. Re-run the Keras import cell
# before re-running any of the earlier RNN/LSTM cells.
# NOTE: this rebinds the global `model`, which plot_dependency reads.
inputs = tf.keras.Input(shape=(None, ))
padding_mask = layers.PaddingMaskGenerator()(inputs)
embeddings = layers.TokenPosEmbedding(d_vocab=vocab_size, d_model=model_dim, pos_length=seq_len)(inputs)

decoder_block = layers.TransformerStack(layers=3,
                                        ff_units=model_dim*4,
                                        d_model=model_dim,
                                        num_heads=4,
                                        dropout=0.01,
                                        causal=True,
                                        layer_norm="double",
                                        activation=ops.gelu,
                                        weight_sharing=False,
                                        name="DecoderBlock")

dec_outputs = decoder_block({"token_inputs": embeddings,
                             "mask_inputs": padding_mask})

# Keep only the final position; the slice (-1:) preserves the sequence axis
# with length 1.
dec_outputs = dec_outputs[:, -1:]

preds = layers.LMHead(vocab_size=vocab_size)(dec_outputs)

model = tf.keras.Model(inputs=inputs, outputs=preds)

model.summary()

if TRAIN:
    verbose = 1

    # Stage 1: short warm-up without shuffling or early stopping.
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["acc"])
    history_1 = model.fit(x_train, y_train, epochs=3,
                          batch_size=128, shuffle=False,
                          callbacks=[], verbose=verbose)
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

    # Stage 2: train with shuffling until the training accuracy stops improving.
    opt = tf.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["acc"])
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='acc',
                                                  restore_best_weights=True,
                                                  patience=3)
    history_3 = model.fit(x_train, y_train, epochs=200,
                          batch_size=256, shuffle=True,
                          callbacks=[early_stop], verbose=verbose)
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

    # Loss curve across both stages.
    log_x = history_1.history['loss'] + history_3.history['loss']
    plt.plot(log_x)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()

    model.save_weights("transformer_xs.h5")
else:
    # Message corrected: this branch loads the Transformer weights, not an LSTM.
    print("Loading pretrained Transformer model:")
    model_url = "https://github.com/OpenSUTD/machine-learning-workshop/releases/download/v0.0.03/transformer_xs.h5"
    # Stored locally as "transformer_xs_large.h5", a different name from the release asset.
    model_path = tf.keras.utils.get_file("transformer_xs_large.h5", model_url)
    model.load_weights(model_path)
    # Compile so that evaluate() has a loss and an accuracy metric.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["acc"])
    # Scores are computed on the training windows, not on held-out data.
    scores = model.evaluate(x_train, y_train, batch_size=512, verbose=2)
    print(" - Loss:", scores[0])
    print(" - Acc: ", scores[1])

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
PaddingMaskGenerator (PaddingMa (None, 1, 1, None)   0           input_1[0][0]                    
__________________________________________________________________________________________________
TokenPosEmbedding (TokenPosEmbe (None, None, 128)    16384       input_1[0][0]                    
__________________________________________________________________________________________________
DecoderBlock (TransformerStack) (None, None, 128)    594816      PaddingMaskGenerator[0][0]       
                                                                 TokenPosEmbedding[0][0]      

In [21]:
# Same 20 offsets as the first visualisation (reuses the global `s`), rendered
# with whichever model is currently bound to the global `model`.
for i in range(s,s+20):
    plot_dependency(i)