In [1]:
import numpy as np

# Export location for plots, checkpoints and model files; a later cell turns it into an absolute path.
exports_dir = 'exports/en-tw-transformer-nmt'
# Directory holding the en/tw vocabulary JSON files and text corpora read by LanguageIndex.
asset_dir = '../../data/en-tw'
# Divided by batch_size in the training cell to derive steps_per_epoch.
training_sample = 1_000
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
num_attention_layers = 4
# Batch size of the training generator; the validation generator is created with half of it.
batch_size = 32

In [2]:
import os

# os.path.join returns an already-absolute second argument unchanged, so re-running
# this cell no longer prepends the working directory a second time.
exports_dir = os.path.join(os.getcwd(), exports_dir)

In [3]:
from datetime import date

# Output locations, all rooted at exports_dir.
model_plot = exports_dir + '/en-tw.png'
model_file = exports_dir + '/models.pkb'

# The doubled braces keep `{epoch:02d}` and `{val_loss:.2f}` as literal placeholders in the
# resulting string; only exports_dir and today's date are substituted here.
checkpoint_filepath = f'{exports_dir}/checkpoint.weights.{date.today()}.{{epoch:02d}}-{{val_loss:.2f}}.keras'
checkpoint_filepath_best = f'{exports_dir}/checkpoint.weights.best.keras'

In [4]:
import os

# Create the export directory (and any missing parents); exist_ok=True makes re-running this cell harmless.
os.makedirs(exports_dir, exist_ok=True)

In [5]:
from archive.lib.language_index import LanguageIndex

# Both sides currently use the same sequence-length limit; the values are also used
# as the Input shapes when the model is built.
input_max_timesteps = target_max_timesteps = 128

# One LanguageIndex per language: name, vocabulary JSON, text file, sequence-length limit.
inp_lang = LanguageIndex(
    'en',
    f'{asset_dir}/en.vocabs.json',
    f'{asset_dir}/en.txt',
    input_max_timesteps,
)
targ_lang = LanguageIndex(
    'tw',
    f'{asset_dir}/tw.vocabs.json',
    f'{asset_dir}/tw.txt',
    target_max_timesteps,
)
print(inp_lang, '\n', targ_lang)

LanguageIndex { name: en, tokenizer: <tokenizers.Tokenizer object at 0x15353be00>, vocab_size: 30000 } 
 LanguageIndex { name: tw, tokenizer: <tokenizers.Tokenizer object at 0x12324d800>, vocab_size: 15000 }


In [6]:
def generate_seq2seq_batches(batch_size=32):
    """
    Yield paired source/target batches for a sequence-to-sequence model.

    The source and destination batches come from the module-level `inp_lang`
    and `targ_lang` objects (they are not parameters of this function); both
    are asked for batches of the same size and consumed in lockstep.

    Args:
    - batch_size (int): The number of sequences per batch, forwarded to
      `inp_lang.data(...)` and `targ_lang.data(...)`.

    Yields:
    - A tuple of ((src_language_inputs, dest_language_inputs), dest_language_targets)
      for each batch. The targets are the destination inputs with the first
      column dropped and a column of zeros appended at the end.

    Iteration stops as soon as either underlying generator is exhausted.
    """
    paired_batches = zip(
        inp_lang.data(batch_size=batch_size),
        targ_lang.data(batch_size=batch_size),
    )

    for source_batch, target_batch in paired_batches:
        # Shift left by one position along axis 1, then pad so the width is unchanged.
        # NOTE(review): np.zeros defaults to float64, so integer inputs would be
        # promoted to float targets by hstack — confirm that is intended.
        trailing_pad = np.zeros((target_batch.shape[0], 1))
        shifted_targets = np.hstack((target_batch[:, 1:], trailing_pad))

        yield (source_batch, target_batch), shifted_targets

# Usage example
# `inp_lang` and `targ_lang` must already exist as LanguageIndex instances; the function reads them as globals.
# for (src_inputs, dest_inputs), dest_targets in generate_seq2seq_batches(batch_size=32):
#     # Process the batches here, e.g., feed them to a sequence-to-sequence model


In [7]:
from archive.lib.utils import tf_embedding_scale, create_positional_encoding
from archive.lib.layers.dense import FeedForward
from archive.lib.layers.attention import SelfAttention, MaskedSelfAttention, CrossAttention
from keras import layers
from archive.lib.models import graph

embed_dim = 512
# Hard-coded to the vocab_size values that the LanguageIndex printout reports for `en` and `tw`.
input_vocab_size = 30_000
target_vocab_size = 15_000

# The same positional table is added to both the encoder and the decoder embeddings,
# so it has to cover the longer of the two sequence lengths (they are equal today).
position_vectors = create_positional_encoding(
    max_seq_len=max(input_max_timesteps, target_max_timesteps), embed_dim=embed_dim)


def add_position_vectors(x):
    """Add the shared positional table to `x`, cropped to `x`'s static shape.

    A `None` dimension in `x.shape` turns the corresponding slice into `[:None]`,
    i.e. that axis of the table is taken whole.
    """
    return x + position_vectors[:x.shape[0], :x.shape[1], :x.shape[2]]


model = graph.Graph(
    inputs=[
        layers.Input(shape=(input_max_timesteps,), name='english_input'),
        layers.Input(shape=(target_max_timesteps,), name='twi_input')
    ],
    layers=[
        # Encoder Positional Embedding
        layers.Embedding(name='encoder_embedding', input_dim=input_vocab_size, output_dim=embed_dim, mask_zero=True),
        layers.Lambda(name='encoder_embeddings_scaled', function=tf_embedding_scale(embed_dim)),
        layers.Lambda(name='encoder_positions', function=add_position_vectors),

        layers.Dropout(name='encoder_dropout', rate=0.1),

        SelfAttention(name="encoder_attention", num_heads=8, key_dim=embed_dim, dropout=0.1),
        FeedForward(name="encoder_feedforward", d_model=embed_dim, dff=2048),

        # Decoder Positional Embedding
        layers.Embedding(name='decoder_embedding', input_dim=target_vocab_size, output_dim=embed_dim,
                         mask_zero=True),
        layers.Lambda(name='decoder_embeddings_scaled', function=tf_embedding_scale(embed_dim)),
        layers.Lambda(name='decoder_positions', function=add_position_vectors),

        layers.Dropout(name='decoder_dropout', rate=0.1),

        MaskedSelfAttention(name="decoder_masked_attention", num_heads=8, key_dim=embed_dim, dropout=0.1),
        CrossAttention(name="decoder_cross_attention", num_heads=8, key_dim=embed_dim, dropout=0.1),
        FeedForward(name="decoder_feedforward", d_model=embed_dim, dff=2048),

        # Final projection to one unit per target-vocabulary entry
        layers.Dense(name='output', units=target_vocab_size)
    ],
    # Each pair is (source layer name, destination layer name); a `[...]` suffix on the
    # destination selects which named input of that layer receives the tensor.
    connections=[
        # Encoder
        ('english_input', 'encoder_embedding'),
        ('encoder_embedding', 'encoder_embeddings_scaled'),
        ('encoder_embeddings_scaled', 'encoder_positions'),

        ('encoder_positions', 'encoder_dropout'),
        ('encoder_dropout', 'encoder_attention'),
        ('encoder_attention', 'encoder_feedforward'),

        # Bridge - Encoder's output goes to decoder as context
        ('encoder_feedforward', 'decoder_cross_attention[context]'),

        # Decoder
        ('twi_input', 'decoder_embedding'),
        ('decoder_embedding', 'decoder_embeddings_scaled'),
        ('decoder_embeddings_scaled', 'decoder_positions'),

        ('decoder_positions', 'decoder_dropout'),
        ('decoder_dropout', 'decoder_masked_attention'),
        ('decoder_masked_attention', 'decoder_cross_attention[x]'),
        ('decoder_cross_attention', 'decoder_feedforward'),

        ('decoder_feedforward', 'output')
    ]
)

In [8]:
# Print the per-layer table (output shape, parameter count, connections) to sanity-check the wiring.
model.summary()

Model: "graph"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english_input (InputLayer)  [(None, 128)]                0         []                            
                                                                                                  
 twi_input (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, 128, 512)             1536000   ['english_input[0][0]']       
 ng)                                                      0                                       
                                                                                                  
 decoder_embedding (Embeddi  (None, 128, 512)             7680000   ['twi_input[0][0]']       

In [9]:
# Resume from the "best" weights file when an earlier run left one behind;
# otherwise just report that training starts without loaded weights.
if not os.path.exists(checkpoint_filepath_best):
    print("No checkpoint at", checkpoint_filepath_best)
else:
    model.load_weights(checkpoint_filepath_best)
    print("Loaded checkpoint from", checkpoint_filepath_best)

Loaded checkpoint from /Users/emmanuelsarpong/Projects/Extra/DataScience/fcc-llm/projects/en-tw-transformer-nmt/exports/en-tw-transformer-nmt/checkpoint.weights.best.keras


In [10]:
from archive.lib.utils import masked_loss, TransformerLearningRateSchedule
from keras import optimizers

# The learning rate is a schedule object built from embed_dim rather than a fixed number.
learning_rate_schedule = TransformerLearningRateSchedule(embed_dim)
adam_optimizer = optimizers.legacy.Adam(learning_rate_schedule, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

model.compile(loss=masked_loss, optimizer=adam_optimizer)

In [11]:
# from keras import utils
# 
# utils.plot_model(model, to_file=f'{exports_dir}/en-tw.png', show_shapes=True, show_layer_names=True,
#                  expand_nested=True)

In [12]:
from keras import callbacks

# Both callbacks use the same options and differ only in where they write:
# a dated, per-epoch file name versus the single fixed "best" file.
checkpoint_options = {'save_best_only': True, 'save_weights_only': True}

checkpoint_callback = callbacks.ModelCheckpoint(filepath=checkpoint_filepath, **checkpoint_options)
checkpoint_callback_best = callbacks.ModelCheckpoint(filepath=checkpoint_filepath_best, **checkpoint_options)

In [13]:
import itertools

train_generator = generate_seq2seq_batches(batch_size)
# Floor division keeps the validation batch size an integer (batch_size / 2 would be 16.0).
validation_generator = generate_seq2seq_batches(batch_size // 2)

# itertools.islice is lazy, so the slice has to be iterated for the generator to advance.
# This discards the first 20 training batches before fitting starts.
for _skipped_batch in itertools.islice(train_generator, 20):
    pass

model.fit(
    train_generator,
    epochs=10,
    validation_data=validation_generator,
    # Whole number of steps per epoch (1_000 // 32 == 31) instead of the float 31.25.
    steps_per_epoch=training_sample // batch_size,
    validation_steps=10,
    callbacks=[checkpoint_callback_best, checkpoint_callback]
)

Epoch 1/10
 1/31 [..............................] - ETA: 2:26 - loss: 9.0019

KeyboardInterrupt: 