In [12]:
import os
import random

import numpy as np
import tensorflow as tf

In [13]:
# Root that every data and checkpoint path below is built from.
# It is a relative path, so it resolves against the kernel's working directory.
BASE_DIR = "../"

In [14]:
# Directory holding the sampled "toy" corpus, followed by one path per
# split (train / val / test) and language (en / de).
TOY_DATA_DIR = f"{BASE_DIR}data/wmt14_de_en/toy_data/"
TRAIN_EN_TOY = f"{TOY_DATA_DIR}train.toy.en"
TRAIN_DE_TOY = f"{TOY_DATA_DIR}train.toy.de"
VAL_EN_TOY = f"{TOY_DATA_DIR}val.toy.en"
VAL_DE_TOY = f"{TOY_DATA_DIR}val.toy.de"
TEST_EN_TOY = f"{TOY_DATA_DIR}test.toy.en"
TEST_DE_TOY = f"{TOY_DATA_DIR}test.toy.de"

In [15]:
def create_toy_dataset(src_file, tgt_file, train_ratio=0.8, val_ratio=0.1, sample_fraction=0.01, seed=None):
    """Sample a small parallel corpus and write train/val/test splits to disk.

    A random subset of aligned line pairs is drawn from ``src_file`` and
    ``tgt_file``, split in order into train / validation / test parts, and
    written to the module-level ``*_TOY`` paths inside ``TOY_DATA_DIR``
    (existing files are overwritten).

    Args:
        src_file: Path to the source-language text file, one sentence per line.
        tgt_file: Path to the target-language text file, line-aligned with
            ``src_file``.
        train_ratio: Fraction of the sampled pairs written to the train split.
        val_ratio: Fraction of the sampled pairs written to the validation
            split; whatever remains goes to the test split.
        sample_fraction: Fraction of the full corpus to sample (0 to 1).
        seed: Optional seed for reproducible sampling. When ``None`` the
            global ``random`` state is used, exactly as before.

    Raises:
        ValueError: If ``sample_fraction`` or the split ratios are out of range.
        AssertionError: If the two files have a different number of lines.
    """
    if not 0 <= sample_fraction <= 1:
        raise ValueError("sample_fraction must be between 0 and 1.")
    # Small tolerance so ratio pairs such as 0.7 + 0.3 are not rejected
    # because of floating-point rounding.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError("train_ratio and val_ratio must be non-negative and sum to at most 1.")

    # Ensure output directory exists
    os.makedirs(TOY_DATA_DIR, exist_ok=True)

    # Load both files. Every line is normalised to end with exactly one "\n":
    # a final line without a trailing newline would otherwise be glued to the
    # following sentence once it is shuffled into the middle of a split,
    # silently misaligning the source and target files.
    with open(src_file, "r", encoding="utf-8") as src_f, open(tgt_file, "r", encoding="utf-8") as tgt_f:
        src_lines = [line.rstrip("\n") + "\n" for line in src_f]
        tgt_lines = [line.rstrip("\n") + "\n" for line in tgt_f]

    # Ensure both files have the same length
    assert len(src_lines) == len(tgt_lines), "Source and target files must have the same number of lines."

    # Sample a subset of line indices (shared by both languages so pairs stay aligned)
    sample_size = int(len(src_lines) * sample_fraction)
    rng = random if seed is None else random.Random(seed)
    sampled_indices = rng.sample(range(len(src_lines)), sample_size)
    sampled_src = [src_lines[i] for i in sampled_indices]
    sampled_tgt = [tgt_lines[i] for i in sampled_indices]

    # Split into train, val, test
    train_size = int(len(sampled_src) * train_ratio)
    val_size = int(len(sampled_src) * val_ratio)
    val_end = train_size + val_size

    train_src, val_src, test_src = sampled_src[:train_size], sampled_src[train_size:val_end], sampled_src[val_end:]
    train_tgt, val_tgt, test_tgt = sampled_tgt[:train_size], sampled_tgt[train_size:val_end], sampled_tgt[val_end:]

    # Save toy datasets
    outputs = (
        (TRAIN_EN_TOY, train_src),
        (TRAIN_DE_TOY, train_tgt),
        (VAL_EN_TOY, val_src),
        (VAL_DE_TOY, val_tgt),
        (TEST_EN_TOY, test_src),
        (TEST_DE_TOY, test_tgt),
    )
    for path, lines in outputs:
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(lines)

    print(f"Toy dataset created with {sample_size} samples (train: {train_size}, val: {val_size}, test: {len(test_src)})")

In [16]:
# Full BPE-tokenised corpus files and the vocabulary file.
TRAIN_EN_FILE = f"{BASE_DIR}data/wmt14_de_en/train.tok.clean.bpe.32000.en"
TRAIN_DE_FILE = f"{BASE_DIR}data/wmt14_de_en/train.tok.clean.bpe.32000.de"
VOCAB_FILE = f"{BASE_DIR}data/wmt14_de_en/vocab.bpe.32000"
VOCAB_SIZE = 32000

# Model and training hyper-parameters.
EMBEDDING_DIM = 256
HIDDEN_UNITS = 512
BATCH_SIZE = 1
EPOCHS = 10

In [17]:
# Draw 0.1% of the full corpus and split it 80/10/10 into the toy train/val/test files.
# The toy files are opened in write mode, so re-running this cell overwrites them.
create_toy_dataset(TRAIN_EN_FILE, TRAIN_DE_FILE, train_ratio=0.8, val_ratio=0.1, sample_fraction=0.001)

Toy dataset created with 4500 samples (train: 3600, val: 450, test: 450)


In [18]:
# Step 1: Load the vocabulary and add <sos>, <eos> and <unk> tokens if not present
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into a token -> ID mapping.

    IDs are assigned densely (0, 1, 2, ...) in order of first appearance, so
    ``len(vocab)`` is always one more than the largest ID and a repeated line
    can never make a special token share an ID with a regular token. For a
    file without repeated lines the IDs equal the line indices.

    Args:
        vocab_file: Path to a UTF-8 text file with one token per line.

    Returns:
        dict mapping each token (surrounding whitespace stripped) to its
        integer ID, including ``<sos>``, ``<eos>`` and ``<unk>``.
    """
    vocab = {}
    with open(vocab_file, "r", encoding="utf-8") as f:
        for line in f:
            token = line.strip()
            # Keep the first occurrence only; assigning the raw line index
            # would leave gaps that the special tokens below could land on.
            if token not in vocab:
                vocab[token] = len(vocab)
    # Add special tokens if they are not in the vocabulary
    for special in ("<sos>", "<eos>", "<unk>"):
        if special not in vocab:
            vocab[special] = len(vocab)
    # NOTE(review): the first token in the file receives ID 0, which is also
    # the value pad_sequences pads with in load_bpe_data and the value that
    # loss_function masks out -- confirm this overlap is intended.
    return vocab

# Load the token -> ID mapping; its size (special tokens included) is what
# the Encoder/Decoder embedding and output layers are built with below.
vocab = load_vocab(VOCAB_FILE)
vocab_size = len(vocab)

# ID of <sos>, fed to the decoder as its first input in train_step
SOS_TOKEN_ID = vocab["<sos>"]

# Step 2: Load and process BPE tokenized data
def load_bpe_data(file_path, vocab):
    """Encode a whitespace-tokenised text file as a padded matrix of token IDs.

    Each line is split on whitespace and every token is looked up in
    ``vocab``; tokens that are missing map to the ID of ``<unk>``. The
    resulting sequences are padded at the end to a common length by
    ``pad_sequences``.

    Args:
        file_path: Path to a UTF-8 text file, one sentence per line.
        vocab: Mapping from token to integer ID; must contain ``<unk>``.

    Returns:
        The array returned by ``pad_sequences``, one row per input line.
    """
    encoded_lines = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            ids = []
            for piece in raw_line.strip().split():
                # Out-of-vocabulary tokens fall back to the <unk> ID
                ids.append(vocab.get(piece, vocab["<unk>"]))
            encoded_lines.append(ids)
    # Post-pad so every row has the length of the longest sequence
    return tf.keras.preprocessing.sequence.pad_sequences(encoded_lines, padding="post")

# Load English (input) and German (target) sequences as padded ID matrices.
# NOTE(review): these are the TEST toy files, yet `dataset` is what the
# training loop below iterates over -- confirm this is a deliberate
# small-data shortcut and not meant to be TRAIN_EN_TOY / TRAIN_DE_TOY.
input_sequences = load_bpe_data(TEST_EN_TOY, vocab)
target_sequences = load_bpe_data(TEST_DE_TOY, vocab)

# Step 3: Create a TensorFlow dataset with batching
# The shuffle buffer spans the whole dataset; drop_remainder=True discards a
# final batch smaller than BATCH_SIZE.
dataset = tf.data.Dataset.from_tensor_slices((input_sequences, target_sequences))
dataset = dataset.shuffle(buffer_size=len(input_sequences)).batch(BATCH_SIZE, drop_remainder=True)

print(f"Loaded dataset with {len(input_sequences)} samples.")


Loaded dataset with 450 samples.


In [19]:
# Where training checkpoints are written, and the file prefix passed to checkpoint.save().
CHECKPOINT_DIR = f"{BASE_DIR}checkpoints"
CHECKPOINT_FILEPATH = f"{CHECKPOINT_DIR}/seq2seq_weights"

import os

# Create the checkpoint directory up front; a no-op when it already exists.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

In [20]:
class Encoder(tf.keras.layers.Layer):
    """Embeds source token IDs and runs them through a single GRU layer."""

    def __init__(self, vocab_size, embedding_dim, enc_units):
        """Build the embedding table and the GRU.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector.
            enc_units: Number of GRU units.
        """
        super().__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        # return_sequences gives the per-timestep outputs (used by attention),
        # return_state additionally gives the final hidden state.
        self.gru = tf.keras.layers.GRU(units=enc_units, return_sequences=True, return_state=True)

    def call(self, x):
        """Encode a batch of token IDs.

        Args:
            x: Integer token IDs; presumably shaped (batch, seq_len) as
                produced by the batched dataset -- TODO confirm.

        Returns:
            Tuple ``(output, state)``: the GRU's per-timestep outputs and its
            final hidden state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded)
        return sequence_output, final_state

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(query) + W2(values)))."""

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: Width of the W1 and W2 projections.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute a context vector for ``query`` over ``values``.

        Args:
            query: Decoder hidden state; a time axis is inserted at position 1
                so it broadcasts against ``values``.
            values: Encoder outputs, with the time axis at position 1.

        Returns:
            Tuple ``(context_vector, attention_weights)``: ``values`` weighted
            by the attention weights and summed over axis 1, and the weights
            themselves (softmax over axis 1).
        """
        projected_query = self.W1(tf.expand_dims(query, 1))
        projected_values = self.W2(values)
        score = self.V(tf.nn.tanh(projected_query + projected_values))

        # Normalise the scores across the time axis
        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)
        return context_vector, attention_weights

class Decoder(tf.keras.layers.Layer):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, dec_units):
        """Build the embedding, GRU, output projection and attention layers.

        Args:
            vocab_size: Size of the embedding table and of the output logits.
            embedding_dim: Size of each embedding vector.
            dec_units: Number of GRU units; also the attention projection width.
        """
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, enc_output, hidden):
        """Run one decoding step.

        Args:
            x: Token IDs of the previous target position; the concat below
                expects a single time step on axis 1.
            enc_output: Encoder per-timestep outputs attended over.
            hidden: Previous decoder hidden state. It is used as the attention
                query and as the GRU's initial state, so its last dimension
                must equal ``dec_units``.

        Returns:
            Tuple ``(logits, state, attention_weights)``: vocabulary logits for
            this step, the new GRU hidden state, and the attention weights.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        # Prepend the context vector to the embedded input along the feature axis
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # Carry the recurrent state across decoding steps. Without
        # initial_state the GRU restarts from zeros on every call and the
        # previous hidden state only reaches the model through attention.
        output, state = self.gru(x, initial_state=hidden)
        x = self.fc(output)
        return x, state, attention_weights

# Instantiate encoder, decoder, and optimizer
# Both networks share vocab_size, EMBEDDING_DIM and HIDDEN_UNITS; the encoder's
# final state is used as the decoder's first hidden state in train_step.
encoder = Encoder(vocab_size, EMBEDDING_DIM, HIDDEN_UNITS)
decoder = Decoder(vocab_size, EMBEDDING_DIM, HIDDEN_UNITS)
optimizer = tf.keras.optimizers.Adam()

# Tracks optimizer, encoder and decoder state so the training loop can save them together
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

# Define the loss function
# from_logits=True: the decoder outputs raw logits (no softmax).
# reduction='none': keep per-example losses so loss_function can mask them.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step, with positions whose target is 0 zeroed out.

    Args:
        real: Target token IDs for the step; ID 0 is treated as padding.
        pred: Logits for the step.

    Returns:
        Mean of the masked per-example losses. Masked positions contribute 0
        to the sum but still count in the mean's denominator.
    """
    per_example = loss_object(real, pred)
    # 1.0 where the target is a real token, 0.0 where it is padding
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

# Step 5: One teacher-forced training step (encoder pass, step-by-step decoding, gradient update)
@tf.function
def train_step(input_seq, target_seq):
    """Run one optimisation step on a batch and return its averaged loss.

    The decoder is first fed ``<sos>`` and, at every later position, the true
    target token of the previous position (teacher forcing). Every target
    position, including position 0, contributes to the loss.

    Args:
        input_seq: Batch of source token IDs passed to the encoder.
        target_seq: Batch of target token IDs; axis 1 is the time axis and must
            have a static length because the decoding loop is a Python loop.

    Returns:
        The summed per-step loss divided by the target sequence length.
    """
    loss = 0
    # Python side effect: executes while tf.function traces the graph, not on every call.
    print("Running train step...")
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(input_seq)
        dec_hidden = enc_hidden
        # Start token for every example; sized from the actual batch rather
        # than the global BATCH_SIZE so any batch size works.
        batch_size = tf.shape(target_seq)[0]
        dec_input = tf.fill([batch_size, 1], tf.cast(SOS_TOKEN_ID, target_seq.dtype))

        # Start at position 0: the target sequences carry no leading <sos>, so
        # starting at 1 would leave the first target token neither predicted
        # nor fed to the decoder.
        for t in range(target_seq.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, enc_output, dec_hidden)

            # Remove the extra time dimension from predictions to match target shape
            predictions = tf.squeeze(predictions, axis=1)  # Shape: (batch_size, vocab_size)

            # Calculate the loss
            loss += loss_function(target_seq[:, t], predictions)

            # Use the true target as the next input to the decoder
            dec_input = tf.expand_dims(target_seq[:, t], 1)

    batch_loss = (loss / int(target_seq.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss

In [21]:
import tensorflow as tf

# Report how many GPUs TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


In [None]:
EPOCHS = 10  # Set the number of epochs as needed

for epoch in range(EPOCHS):
    total_loss = 0
    num_batches = 0

    for input_seq, target_seq in dataset:
        batch_loss = train_step(input_seq, target_seq)
        total_loss += batch_loss
        num_batches += 1

    # With no batches, total_loss is still the int 0 and has no .numpy();
    # fail with a clear message instead.
    if num_batches == 0:
        raise ValueError("Training dataset is empty; nothing to train on.")

    print(f"Epoch {epoch + 1}, Loss: {total_loss.numpy() / num_batches}")

    # Save weights at the end of each epoch
    checkpoint.save(file_prefix=CHECKPOINT_FILEPATH)
    print(f"Model weights saved for epoch {epoch + 1}")

Running train step...


KeyboardInterrupt: 

: 