In [2]:
from datasets import load_dataset_builder, load_dataset
import tensorflow as tf
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Preprocessing

In [3]:
# BillSum configuration of the lighteval legal-summarization dataset
# (rows expose "article" and "summary" columns; "train" and "test" splits are used below).
dataset = load_dataset(path="lighteval/legal_summarization", name="BillSum")

In [5]:
# Fit the tokenizer on articles and summaries together so that the encoder and
# the decoder share a single vocabulary.
combined_texts = list(dataset["train"]["article"]) + list(dataset["train"]["summary"])

vocab_size = 10000
max_length = 1024
max_lenght = max_length  # original (misspelled) name, kept as an alias

# Markers added around decoder sequences. They must not contain regex
# metacharacters because they are spliced into a pattern below.
special_tokens = ["<SOS>", "<EOS>"]

# Character class mirroring what the built-in "lower_and_strip_punctuation"
# standardization removes.
_PUNCTUATION = r'[!"#$%&()\*\+,-\./:;<=>?@\[\\\]^_`{|}~\']'
_SPECIAL_TOKEN_GROUP = "(" + "|".join(tok.lower() for tok in special_tokens) + ")"


def standardize_text(text):
    """Lower-case and strip punctuation while keeping the special tokens intact.

    The default standardization would turn "<SOS>" into "sos" (and "<EOS>" into
    "eos"), so the markers added to the decoder inputs/targets would never be
    mapped to the "<SOS>"/"<EOS>" vocabulary entries.

    Args:
        text: string tensor of raw text.

    Returns:
        String tensor, lower-cased, without punctuation, with every special
        token restored to its exact spelling from `special_tokens`.
    """
    text = tf.strings.lower(text)
    # Group 1 matches a (lower-cased) special token and is written back as-is;
    # any other punctuation matches the second alternative, where group 1 is
    # empty, so the character is deleted.
    text = tf.strings.regex_replace(
        text, _SPECIAL_TOKEN_GROUP + "|" + _PUNCTUATION, r"\1"
    )
    # Restore the original casing so the tokens match the vocabulary entries.
    for token in special_tokens:
        text = tf.strings.regex_replace(text, token.lower(), token)
    return text


# Create and adapt the TextVectorization layer
text_vectorization_layer = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=standardize_text,
    output_sequence_length=max_length,
)
text_vectorization_layer.adapt(combined_texts)

# Retrieve the adapted vocabulary (starts with the padding and OOV entries).
current_vocab = text_vectorization_layer.get_vocabulary()

# Reserve the last slots of the vocabulary for the special tokens. Drop them
# from the adapted vocabulary first so they can never appear twice.
max_vocab_size = vocab_size - len(special_tokens)
base_vocab = [tok for tok in current_vocab if tok not in special_tokens]
new_vocab = base_vocab[:max_vocab_size] + special_tokens

# Set the new vocabulary to the TextVectorization layer
text_vectorization_layer.set_vocabulary(new_vocab)

2023-11-26 21:25:09.692015: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [6]:
def _with_start_marker(summaries):
    """Prefix every summary with the start-of-sequence marker (decoder input)."""
    return [f"<SOS> {s}" for s in summaries]


def _with_end_marker(summaries):
    """Suffix every summary with the end-of-sequence marker (decoder target)."""
    return [f"{s} <EOS>" for s in summaries]


# Read each summary column once instead of once per derived tensor.
train_summaries = dataset["train"]["summary"]
valid_summaries = dataset["test"]["summary"]

# Encoder inputs: raw article strings.
X_train = tf.constant(dataset["train"]["article"])
X_valid = tf.constant(dataset["test"]["article"])

# Decoder inputs: raw summary strings prefixed with the start marker.
X_train_dec = tf.constant(_with_start_marker(train_summaries))
X_valid_dec = tf.constant(_with_start_marker(valid_summaries))

# Targets: token ids of the summaries suffixed with the end marker.
y_train = text_vectorization_layer(_with_end_marker(train_summaries))
y_valid = text_vectorization_layer(_with_end_marker(valid_summaries))

In [7]:
# Scratch checks of the tokenizer and of an embedding layer.

# Token ids of the first training article.
text = text_vectorization_layer(dataset["train"][0]["article"])

# Throw-away embedding layer used only for the shape check in this cell.
embed_layer = tf.keras.layers.Embedding(vocab_size, 500, mask_zero=True)

# Tokenize a tiny example, then embed it.
example_text = tf.constant(["testing this thing"])
tokenized_text = text_vectorization_layer(example_text)
embedded_text = embed_layer(tokenized_text)

# Confirm that both special tokens made it into the vocabulary.
vocabulary = text_vectorization_layer.get_vocabulary(include_special_tokens=True)
for token in ("<SOS>", "<EOS>"):
    print(token in vocabulary)
len(vocabulary)

True
True


10000

# Model

In [8]:
import keras_nlp

# Hyper-parameters shared by the encoder and decoder stacks.
embed_size = 500
num_stacks = 3
num_heads_per_stack = 6
dropout_rate = 0.1
n_units = embed_size


# Define the model inputs: one raw string per example for each side.
encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)
decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)


# Apply the TextVectorization layer
encoder_input_vector = text_vectorization_layer(encoder_inputs)
decoder_input_vector = text_vectorization_layer(decoder_inputs)

# Define the shared embedding layer
embed_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size, output_dim=embed_size, mask_zero=True
)

# Apply the shared embedding layer to both encoder and decoder inputs
encoder_embedding = embed_layer(encoder_input_vector)
decoder_embedding = embed_layer(decoder_input_vector)

# Define and apply the positional encoding layer.
# SinePositionEncoding returns only the positional signal (same shape as its
# input); it has to be added to the token embeddings, otherwise the network
# would receive positions but no token information.
positional_encoding_layer = keras_nlp.layers.SinePositionEncoding()

encoder_in = encoder_embedding + positional_encoding_layer(encoder_embedding)
decoder_in = decoder_embedding + positional_encoding_layer(decoder_embedding)

Using TensorFlow backend


# Encoder & Decoder

In [9]:
def _new_attention():
    """Create a fresh multi-head attention layer with the shared settings."""
    return tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads_per_stack, key_dim=embed_size, dropout=dropout_rate
    )


def _residual_norm(branch, shortcut):
    """Add the shortcut to the branch output and layer-normalize the sum."""
    return tf.keras.layers.LayerNormalization()(
        tf.keras.layers.Add()([branch, shortcut])
    )


def _feed_forward(inputs):
    """Two dense layers: ReLU with n_units, then a projection to embed_size."""
    hidden = tf.keras.layers.Dense(n_units, activation="relu")(inputs)
    return tf.keras.layers.Dense(embed_size)(hidden)


# ---- Encoder ----
# True where the encoder token id is non-zero; the extra axis lets the mask
# broadcast over the query positions.
encoder_pad_mask = tf.math.not_equal(encoder_input_vector, 0)[:, tf.newaxis]

Z = encoder_in
for _ in range(num_stacks):
    # Self-attention sub-layer.
    self_attention = _new_attention()
    attended = self_attention(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = _residual_norm(attended, Z)
    # Feed-forward sub-layer (with dropout).
    transformed = tf.keras.layers.Dropout(dropout_rate)(_feed_forward(Z))
    Z = _residual_norm(transformed, Z)

encoder_outputs = Z

# ---- Decoder ----
decoder_pad_mask = tf.math.not_equal(decoder_input_vector, 0)[:, tf.newaxis]
batch_max_len_dec = tf.shape(decoder_embedding)[1]
# Lower-triangular boolean matrix: position i may attend to positions <= i.
causal_mask = tf.linalg.band_part(
    tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0
)

Z = decoder_in
for _ in range(num_stacks):
    # Masked self-attention over the decoder sequence.
    self_attention = _new_attention()
    attended = self_attention(
        Z, value=Z, attention_mask=decoder_pad_mask & causal_mask
    )
    Z = _residual_norm(attended, Z)
    # Cross-attention: decoder queries against encoder keys/values.
    cross_attention_layer = _new_attention()
    attended = cross_attention_layer(
        Z, value=encoder_outputs, attention_mask=encoder_pad_mask
    )
    Z = _residual_norm(attended, Z)
    # Feed-forward sub-layer (no dropout here, as in the original).
    Z = _residual_norm(_feed_forward(Z), Z)

# Per-position probability distribution over the vocabulary.
Y_proba = tf.keras.layers.Dense(vocab_size, activation="softmax")(Z)

In [10]:
# Assemble the encoder-decoder model: two string inputs, per-token
# probabilities over the vocabulary as output.
model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=[Y_proba])
model.compile(
    loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"]
)

# Print the architecture before training: fit() is long-running, and a summary
# placed after it is never shown if training is interrupted.
model.summary()

# Keep the History object so the loss/accuracy curves can be inspected later.
history = model.fit(
    (X_train, X_train_dec),
    y_train,
    epochs=10,
    validation_data=((X_valid, X_valid_dec), y_valid),
)

Epoch 1/10


KeyboardInterrupt: 