# Tolstoy's Transformer

In this notebook first couple cells include same code as the given notebook. Afterwards, I added project gutenberg with tolstoys 4 books. Books first downloaded and first 10k char ignored as per done in reference web link. Then, they are filtered by the start/end markings. Afterwards I made tests with training multiple transformers and shared intuitions. I used Google colabs priced version with High Ram to have enough memory to experiment so it may be hard to rerun all.

After each model run I eloborated how i like or didnt liked it. In the last model I really liked it and stopped iteration. Also shared keras and h5 weight file of it over submission for you to be able to run. I also shared my txt files in case there is an path issue happens in the code to download them. Because i faced it couple times.

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random
import matplotlib.pyplot as plt

In [2]:
def attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [3]:
def create_model():
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam", loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model
def custom_standardization(input_string):
    """ Remove html line-break tags and handle punctuation """
    lowercased = tf.strings.lower(input_string)
    stripped_html = tf.strings.regex_replace(lowercased, "<br />", " ")
    return tf.strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1")
def prepare_lm_inputs_labels(text):
    """
    Shift word sequences by 1 position so that the target for position (i) is
    word at position (i+1). The model will use all words up till position (i)
    to predict the next word.
    """
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y

class TextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model.
    1. Feed some starting prompt to the model
    2. Predict probabilities for the next token
    3. Sample the next token and add it to the next input

    Arguments:
        max_tokens: Integer, the number of tokens to be generated after prompt.
        start_tokens: List of integers, the token indices for the starting prompt.
        index_to_word: List of strings, obtained from the TextVectorization layer.
        top_k: Integer, sample from the `top_k` token predictions.
        print_every: Integer, print after this many epochs.
    """

    def __init__(self,
                 max_tokens, start_tokens,
                 index_to_word, top_k=10,
                 print_every=1):
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.index_to_word = index_to_word
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        return self.index_to_word[number]

    def on_epoch_end(self, epoch, logs=None):
        start_tokens = [_ for _ in self.start_tokens]
        if (epoch + 1) % self.print_every != 0:
            return
        num_tokens_generated = 0
        tokens_generated = []
        while num_tokens_generated <= self.max_tokens:
            pad_len = maxlen - len(start_tokens)
            sample_index = len(start_tokens) - 1
            if pad_len < 0:
                x = start_tokens[:maxlen]
                sample_index = maxlen - 1
            elif pad_len > 0:
                x = start_tokens + [0] * pad_len
            else:
                x = start_tokens
            x = np.array([x])
            y, _ = self.model.predict(x)
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            num_tokens_generated = len(tokens_generated)
        txt = " ".join(
            [self.detokenize(_) for _ in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")


Download gutenbergs, There are 4 book to be implemented.

In [19]:
def download_gutenberg_books(url,name):
    import requests
    os.makedirs("data", exist_ok=True)
    response = requests.get(url)
    cleaned_text = response.text[10000:]
    # Write the cleaned text to the file
    filepath = os.path.join("data", name)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"Downloaded and cleaned file saved as: {filepath}")

In [22]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 128  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
#bugger size
batch_size = 128

# The dataset contains each review in a separate text file
# The text files are present in four different folders
# Create a list all files
files= []
filenames = [
    'war_and_peace.txt',
    'anna_karenina.txt',
    'kreutzer_sonata.txt',
    'resurrection.txt'
]
urls = [
    'https://www.gutenberg.org/files/2600/2600-0.txt',
    'https://www.gutenberg.org/files/1399/1399-0.txt',
    'https://www.gutenberg.org/files/689/689-0.txt',
    'https://www.gutenberg.org/files/1938/1938-0.txt',

]
for i in range(len(urls)):
  download_gutenberg_books(urls[i],filenames[i])

for f in os.listdir('data'):
    files.append(os.path.join('data', f))

print(f"{len(files)} files")

# Create a dataset from text files
random.shuffle(files)
text_ds = tf.data.TextLineDataset(files)
text_ds = text_ds.shuffle(buffer_size=1024)
text_ds = text_ds.batch(batch_size)


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
start_prompt = "It was a cold morning, "
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

Downloaded and cleaned file saved as: data/war_and_peace.txt
Downloaded and cleaned file saved as: data/anna_karenina.txt
Downloaded and cleaned file saved as: data/kreutzer_sonata.txt
Downloaded and cleaned file saved as: data/resurrection.txt
4 files


In [23]:
num_tokens_generated = 40

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)

In [15]:
model = create_model()
model.get_layer(index=2).ffn.summary()

model.fit(text_ds, verbose=2, epochs=10, callbacks=[text_gen_callback])

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7e391c289330>

There are lots of unknown tokens found. For that I will increase vocabulary but before that lets filter files in a different way like start/end markers. TO give better data to the model. As per language usage used words were either so common or basic. So there is no good creativity or understanding of language.

In [6]:
def download_gutenberg_books(url, name):
    import requests
    os.makedirs("data", exist_ok=True)
    response = requests.get(url)

    # Detect and extract text between the markers
    text = response.text
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK ***"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK ***"

    start_index = text.find(start_marker)
    end_index = text.find(end_marker)

    if start_index != -1:
        start_index += len(start_marker)
    else:
        start_index = 10000 # in 5 line gpt reference said to be its on evarage 10k char

    if end_index == -1:
        end_index = len(text)

    cleaned_text = text[start_index:end_index].strip()

    filepath = name#os.path.join("data", name)
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(cleaned_text)

    print(f"Downloaded and cleaned file saved as: {filepath}")


In [7]:
vocab_size = 40000  # Only consider the top 20k words
maxlen = 128  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
#bugger size
batch_size = 128

# The dataset contains each review in a separate text file
# The text files are present in four different folders
# Create a list all files
files= []
for i in range(len(urls)):
  download_gutenberg_books(urls[i],filenames[i])

for f in os.listdir('data'):
        files.append(os.path.join('data', f))

print(f"{len(files)} files")

# Create a dataset from text files
random.shuffle(files)
text_ds = tf.data.TextLineDataset(files)
text_ds = text_ds.shuffle(buffer_size=1024)
text_ds = text_ds.batch(batch_size)


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
start_prompt = "It was a cold morning, "
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

Downloaded and cleaned file saved as: war_and_peace.txt
Downloaded and cleaned file saved as: anna_karenina.txt
Downloaded and cleaned file saved as: kreutzer_sonata.txt
Downloaded and cleaned file saved as: resurrection.txt
4 files


In [8]:
num_tokens_generated = 40

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)
model = create_model()
model.get_layer(index=2).ffn.summary()

model.fit(text_ds, verbose=2, epochs=10, callbacks=[text_gen_callback])

Epoch 1/10


  self.gen.throw(typ, value, traceback)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms

<keras.src.callbacks.history.History at 0x7aafc853a140>

I think the reponses are pretty close with the previous one. Its inconsistent for deep understanding. Sometimes it feels like deep and creative but sometimes not. Lets increase vocab and try more. Compared to previous there were a bit more complex words but still not good enough.

In [26]:
vocab_size = 40000  # Only consider the top
maxlen = 128  # Max sequence size
embed_dim = 128  # Embedding size for each token
num_heads = 4  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
#bugger size
batch_size = 128

# The dataset contains each review in a separate text file
# The text files are present in four different folders
# Create a list all files


# Create a dataset from text files
random.shuffle(files)
text_ds = tf.data.TextLineDataset(files)
text_ds = text_ds.shuffle(buffer_size=1024)
text_ds = text_ds.batch(batch_size)


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
start_prompt = "I think"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

Downloaded and cleaned file saved as: war_and_peace.txt
Downloaded and cleaned file saved as: anna_karenina.txt
Downloaded and cleaned file saved as: kreutzer_sonata.txt
Downloaded and cleaned file saved as: resurrection.txt
5 files


In [27]:
num_tokens_generated = 50

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)
model = create_model()
model.get_layer(index=2).ffn.summary()

model.fit(text_ds, verbose=2, epochs=10, callbacks=[text_gen_callback])

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7e38b4c622f0>

Now its better unkowns are only in start. Texts are a bit short lets increase it and decrease vocab a bit to 30k see how it goes we increase it from 20k to 40k now back to 30k. I will increase attention heads to try, I want to try it with 6 to see how itreacts for non 2n numbers. Also to train more epochs to 20. As per deep understanding, I liked this its not best but still I can see it learned some good words and able to combine some of those.

In [28]:
vocab_size = 30000  # Only consider the top
maxlen = 128  # Max sequence size
embed_dim = 128  # Embedding size for each token
num_heads = 6  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer
#bugger size
batch_size = 128

random.shuffle(files)
text_ds = tf.data.TextLineDataset(files)
text_ds = text_ds.shuffle(buffer_size=1024)
text_ds = text_ds.batch(batch_size)


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
start_prompt = "I think"
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

In [29]:
num_tokens_generated = 200

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)
model = create_model()
model.get_layer(index=2).ffn.summary()

model.fit(text_ds, verbose=2, epochs=20, callbacks=[text_gen_callback])

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7e38fc169600>

It was bad at the end and in the middles not good. Unk tokens are not seen anymore but it overfitted with porject gutenberg word. Lets readuce heads. increase embed dimension to cactch more feature and increase length to catch more deeper. This will be hard to process.

In [9]:
vocab_size = 30000  # Only consider the top 20k words
maxlen = 512  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 4  # Number of attention heads
feed_forward_dim = 128  # Hidden layer size in feed forward network inside transformer
#bugger size
batch_size = 128

text_ds = tf.data.TextLineDataset(files)
text_ds = text_ds.shuffle(buffer_size=1024)
text_ds = text_ds.batch(batch_size)


# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices
text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Tokenize starting prompt
word_to_index = {}
for index, word in enumerate(vocab):
    word_to_index[word] = index
start_prompt = "It was a cloudy night "
start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()]

In [10]:
num_tokens_generated = 500

text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab)
model = create_model()
model.get_layer(index=2).ffn.summary()

model.fit(text_ds, verbose=2, epochs=15, callbacks=[text_gen_callback])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step

<keras.src.callbacks.history.History at 0x7aafc2a335b0>

This is amazing. it used complex words like "recalled", "absurdity", "extraordinary".
This output was also good "it was a cloudy night produced strong ; strong teeth flashing".

After 10th epoch almost all generations was using complex words in a meaninful and creative manner. this shows deeper understanding of the authors is gotten. Also this ran took 40minute with 33GB gpu ram with A100 gpu.

I was planning to add more layers to this to keep experimenting but I liked the result so much that I do not think its needed. I will be attaching keras and h5 file for you to also use model without training again.

In [18]:
model.save_weights('last.weights.h5')

In [17]:
model.save('all_model.keras')