## Let's do it again (but properly)

In [53]:
import os
import tensorflow as tf
from tensorflow import keras
import keras_nlp
import numpy as np
from keras.preprocessing.text import Tokenizer

# Echo the module object so the cell output shows which keras_nlp installation was imported.
keras_nlp

<module 'keras_nlp' from 'C:\\Users\\Vladko\\anaconda3\\lib\\site-packages\\keras_nlp\\__init__.py'>

In [54]:
# Data
BATCH_SIZE = 256  # Number of text lines per batch when batching the raw datasets.
SEQ_LEN = 256  # sequence_length given to the tokenizer, the start packer and the embedding.

# Model
EMBED_DIM = 256  # embedding_dim of the token-and-position embedding.
FEED_FORWARD_DIM = 2048  # intermediate_dim of every TransformerDecoder block.
NUM_HEADS = 4  # num_heads of every TransformerDecoder block.
MAX_VOCAB_SIZE = 5000  # Limits parameters in model.

# Training
EPOCHS = 1  # Not referenced by the cells shown here; presumably consumed by a training call elsewhere.

In [55]:
# Training lines: read one example per text line, group into batches, then
# shuffle the already-batched elements with a 256-element buffer.
train_lines = tf.data.TextLineDataset("data/v3/coupled_ngrams.txt")
raw_train_ds = train_lines.batch(BATCH_SIZE).shuffle(buffer_size=256)

# Validation lines: batched only, never shuffled.
val_lines = tf.data.TextLineDataset("data/validation/test_ngrams.txt")
raw_val_ds = val_lines.batch(BATCH_SIZE)

In [56]:
# Tokens handed to the vocabulary builder as reserved_tokens.
reserved_tokens = ["[PAD]", "[UNK]", "[BOS]", "nl", "$"]

# Learn a lower-cased WordPiece vocabulary from the batched training lines.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=MAX_VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=reserved_tokens,
)

In [57]:
# Number of entries in the vocabulary that was just computed from the training data.
len(vocab)

4757

In [58]:
def deserialize_vocab(filename="data/v3/data/V4-dict(v1).txt"):
    """Load a vocabulary from a UTF-8 text file with one token per line.

    Args:
        filename: Path of the vocabulary file. Defaults to the dictionary
            file this notebook was originally written against.

    Returns:
        A list with one entry per line of the file, in file order, with
        surrounding whitespace stripped. Blank lines are kept as empty
        strings so that every token keeps the index of its line.
    """
    with open(filename, encoding="utf8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        return [line.strip() for line in f]

# Rebind `vocab` to the vocabulary stored on disk; this replaces the one computed earlier.
vocab = deserialize_vocab()
# This expression is not the last one in the cell, so its value is never displayed.
len(vocab), vocab[15]

# WordPiece tokenizer over the loaded vocabulary, configured with lowercase=True
# and a fixed sequence_length of SEQ_LEN.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=SEQ_LEN,
    lowercase=True,
)
# Displayed as the cell output: size of the loaded vocabulary.
len(vocab)

4765

In [59]:
# Id the tokenizer assigns to the "[BOS]" token.
bos_token_id = tokenizer.token_to_id("[BOS]")

# Packer that uses that id as the start value and emits sequences of SEQ_LEN.
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=bos_token_id,
)

In [60]:
# packer adds a start token

def transform_tensor(tensor):
    """Zero out the last non-zero element of every row of ``tensor``.

    The input is viewed as a 2-D ``(rows, elements)`` tensor, where the first
    axis of ``tensor`` indexes the rows. In each row the right-most non-zero
    element is replaced by 0; rows that contain only zeros are left unchanged.

    The work is done with one vectorized mask rather than a per-row
    ``tf.map_fn``, which avoids the deprecated ``dtype=`` argument and a
    Python ``if`` on a tensor value.

    Args:
        tensor: Tensor with at least one axis (integer token ids in this
            notebook).

    Returns:
        A tensor with the same shape as ``tensor``.
    """
    original_shape = tf.shape(tensor)

    # Collapse everything after the first axis so each row is one sequence.
    rows = tf.reshape(tensor, [original_shape[0], -1])

    # Column index of every element; broadcasts against the rows.
    positions = tf.range(tf.shape(rows)[1])

    # Per row: largest column index holding a non-zero value, or -1 if the
    # row is all zeros (so it can never match a real column below).
    last_non_zero = tf.reduce_max(
        tf.where(rows != 0, positions, -1), axis=1, keepdims=True
    )

    # Clear exactly that one element in each row.
    is_last_non_zero = tf.equal(positions, last_non_zero)
    cleared = tf.where(is_last_non_zero, tf.zeros_like(rows), rows)

    # Restore the caller's original shape.
    return tf.reshape(cleared, original_shape)

def preprocess(inputs):
    """Turn a batch of raw text lines into a ``(features, labels)`` pair.

    The lines are tokenized and packed once (the packer prepends the start
    token). The labels are that packed tensor; the features are the same
    tensor with the last non-zero token of every row set to 0.

    NOTE(review): features and labels sit at the same positions (the labels
    are not shifted by one token relative to the features) — confirm this is
    the intended training target.

    Args:
        inputs: Batch of strings from the raw text dataset.

    Returns:
        Tuple ``(X_train, Y_train)`` of token-id tensors.
    """
    # Tokenize and pack a single time; both outputs derive from this tensor.
    packed = start_packer(tokenizer(inputs))

    X_train = transform_tensor(packed)
    Y_train = packed
    return X_train, Y_train


# Run `preprocess` over every batch in parallel, then prefetch so that input
# preparation overlaps with consumption. Both parallelism and prefetch depth
# are left to tf.data's autotuner.
train_tokenized = raw_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_tokenized.prefetch(tf.data.AUTOTUNE)

val_tokenized = raw_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_tokenized.prefetch(tf.data.AUTOTUNE)

In [61]:
def transformer_block(heads, ffdim, dropout_rate, name):
    """Create one ``keras_nlp.layers.TransformerDecoder`` layer.

    Args:
        heads: Passed as ``num_heads``.
        ffdim: Passed as ``intermediate_dim``.
        dropout_rate: Passed as ``dropout``.
        name: Layer name.

    Returns:
        The constructed (not yet called) decoder layer, using GELU activation.
    """
    decoder_config = dict(
        num_heads=heads,
        intermediate_dim=ffdim,
        dropout=dropout_rate,
        name=name,
        activation='gelu',  # Gaussian Error Linear Unit (as in GPT-2)
    )
    return keras_nlp.layers.TransformerDecoder(**decoder_config)

# Variable-length sequences of int32 token ids.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)

# Embedding.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=MAX_VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
    name="emb",
)
x = embedding_layer(inputs)

# Transformer decoders, stacked in order. Each decoder is called with a
# single argument, which skips cross-attention.
decoder_specs = (
    ("transformer-1", 0.6),
    ("transformer-2", 0.6),
    ("transformer-3", 0.7),
)
for block_name, block_dropout in decoder_specs:
    x = transformer_block(NUM_HEADS, FEED_FORWARD_DIM, block_dropout, block_name)(x)

# Output: one logit per vocabulary entry, capped at MAX_VOCAB_SIZE.
vocab_logits_dim = min(MAX_VOCAB_SIZE, len(vocab))
outputs = keras.layers.Dense(vocab_logits_dim, name="dense")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
# The model's Dense head outputs raw logits, hence from_logits=True for both.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Perplexity is constructed with mask_token_id=0.
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)
# Learning rate scheduling
learning_rate_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.92
)
# Hand the schedule to the optimizer. Without learning_rate=..., Adam keeps its
# constant default rate and the schedule defined above is never used.
optimizer = keras.optimizers.Adam(learning_rate=learning_rate_schedule)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[perplexity])

In [62]:
# Print the per-layer output shapes and parameter counts of the assembled model.
model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, None)]            0         
                                                                 
 emb (TokenAndPositionEmbedd  (None, None, 256)        1345536   
 ing)                                                            
                                                                 
 transformer-1 (TransformerD  (None, None, 256)        1315072   
 ecoder)                                                         
                                                                 
 transformer-2 (TransformerD  (None, None, 256)        1315072   
 ecoder)                                                         
                                                                 
 transformer-3 (TransformerD  (None, None, 256)        1315072   
 ecoder)                                                   

In [63]:
# Restore the model weights from a saved NumPy checkpoint file.
# NOTE(review): allow_pickle=True makes np.load unpickle the file's contents, which can
# execute arbitrary code — only load checkpoint files that come from a trusted source.
model.set_weights(np.load('data/v3/models/checkpoint-25-V5.npy', allow_pickle=True))

In [64]:
# Build the generation prompt: tokenize the single string "$" and run it through
# the start packer, which puts the [BOS] id first.
prompt_tokens = start_packer(tokenizer(["$"]))
# Display the packed prompt ids.
prompt_tokens

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>

In [65]:
def nxt(prompt, cache, index):
    """Produce next-token logits for a keras_nlp sampler.

    Args:
        prompt: Token ids of the sequence generated so far; fed to ``model``
            as-is.
        cache: Sampler cache; returned unchanged.
        index: Position of the token being generated. The logits are read at
            position ``index - 1``.

    Returns:
        Tuple ``(logits, hidden_states, cache)``. ``logits`` is the model
        output sliced at ``index - 1`` on the sequence axis;
        ``hidden_states`` is always ``None``.
    """
    # Return the logits tensor directly. A tensor -> NumPy -> tensor round-trip
    # would change no values and would only work under eager execution.
    logits = model(prompt)[:, index - 1, :]
    # Ignore hidden states for now; only needed for contrastive search.
    hidden_states = None
    return logits, hidden_states, cache

In [66]:
# Top-p sampler constructed with 0.80; no seed is given here.
sampler = keras_nlp.samplers.TopPSampler(0.80)

In [67]:
# Generate: the sampler fills the prompt starting at position 1, asking `nxt`
# for the logits of each new token.
output_tokens = sampler(next=nxt, prompt=prompt_tokens, index=1)
decoded_output = tokenizer.detokenize(output_tokens).numpy()

# Join the detokenized byte strings and decode them as UTF-8 text.
byte_string = b"".join(decoded_output)
decoded_string = byte_string.decode("utf-8")

# Keep only the text before the first "[PAD]" and break it on the "nl" marker.
text_before_padding = decoded_string.partition("[PAD]")[0]
lines = text_before_padding.split("nl")

# Spacing fixes around punctuation, applied to each line in this order.
punctuation_fixes = (
    (' , ', ', '),
    (' . ', '. '),
    (' ! ', '! '),
    (' ? ', '? '),
    (' - ', '-'),
)
for line in lines:
    cleaned = line
    for spaced, tight in punctuation_fixes:
        cleaned = cleaned.replace(spaced, tight)
    print(cleaned)

[BOS] 
 за от сега, ти за тази нощ. 
 ти си онази дето ужас, а си ужас. .. 
 не искам да си до мен, 
 да си тръгнеш цяла нощ и ден, 
 да не знам, че ти си любовта, 
 пак не си до мен. . 
 знам не знам какво ти е така. 
 знам, че за мене ти си любовта. 
 знам, че ти си любовта. 
 знам, че искаш ме, 
 няма да не е лъжа. . 
 но във всеки ден, лъжа няма вече. . 
 знам, че ти си любовта, 
 но във мен не можем да бъдем пак ни сами. 
 знам, че с мене ти си любовта, 
 че след ден, ще я кръсти, 
 но да си до мен. 
 аз избрах да бъда безсънието нощем. 
 зная, че ти си любовта, 
 не 
