In [2]:
from keras import layers
from datasets import load_dataset
import tensorflow as tf

def standardization(input_string):
    """Normalize raw sentences before they are tokenized by TextVectorization.

    Used as the ``standardize`` callable of both the Turkish and the English
    vectorizer, so the kept character set covers both languages.

    Steps: lowercase, drop every character that is not a space, a kept letter
    or one of ``. ? ! ,``, put spaces around that punctuation so each mark
    becomes its own token, strip outer whitespace, and wrap the result in
    ``[START]`` / ``[END]`` markers.

    Args:
        input_string: string tensor of sentences.

    Returns:
        String tensor with the normalized, marker-wrapped sentences.
    """
    # The default encoding='' lowercases ASCII only; UTF-8 is required so that
    # non-ASCII capitals (e.g. 'Ö', 'Ş') are lowercased as well.
    text = tf.strings.lower(input_string, encoding='utf-8')
    # Keep space, a to z, the Turkish letters and select punctuation. Without
    # the Turkish letters a word such as "ölüm" would be reduced to "lm".
    text = tf.strings.regex_replace(text, '[^ a-zçğıöşü.?!,]', '')
    # Add spaces around punctuation.
    text = tf.strings.regex_replace(text, '[.?!,]', r' \0 ')
    # Strip whitespace.
    text = tf.strings.strip(text)

    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')

    return text

def process_text(inputs):
    """Turn a batch of raw sentence pairs into model inputs and labels.

    Args:
        inputs: mapping with a "tr" (source) and an "en" (target) string batch.

    Returns:
        ``((context, targ_in), targ_out)`` where ``context`` holds the Turkish
        token ids, ``targ_in`` the English ids without the last token and
        ``targ_out`` the English ids without the first token, i.e. the
        decoder input shifted by one position. All three are dense tensors.
    """
    # Source side: ragged token ids converted to a dense tensor.
    source_ids = tr_vectorizer(inputs["tr"]).to_tensor()

    # Target side: slice while still ragged so each row loses its own
    # last / first token, then densify.
    target_ids = en_vectorizer(inputs["en"])
    decoder_in = target_ids[:, :-1].to_tensor()
    decoder_out = target_ids[:, 1:].to_tensor()

    return (source_ids, decoder_in), decoder_out


BATCH_SIZE = 64


def _load_pairs(split):
    """Load one opus100 en-tr split as batches with flat "tr" / "en" columns."""
    pairs = load_dataset("opus100", "en-tr", split=split)
    # Flatten the nested "translation" record into two top-level columns.
    pairs = pairs.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
    pairs = pairs.remove_columns(["translation"])
    return pairs.to_tf_dataset(BATCH_SIZE, columns=["tr", "en"])


def _build_vectorizer(batches, column):
    """Create a ragged TextVectorization layer and fit it on one text column."""
    vectorizer = layers.TextVectorization(max_tokens=10000, standardize=standardization, ragged=True)
    vectorizer.adapt(batches.map(lambda x: x[column], num_parallel_calls=tf.data.AUTOTUNE))
    # Show the first vocabulary entries as a sanity check.
    print(vectorizer.get_vocabulary()[:10])
    return vectorizer


# Training data: vocabularies are fitted on the raw text before it is tokenized.
train_pairs = _load_pairs("train[:500000]")

tr_vectorizer = _build_vectorizer(train_pairs, "tr")

en_vectorizer = _build_vectorizer(train_pairs, "en")

train_dataset = train_pairs.map(process_text, num_parallel_calls=tf.data.AUTOTUNE)


# Held-out data, tokenized with the vocabularies fitted above.
test_dataset = _load_pairs("test").map(process_text, num_parallel_calls=tf.data.AUTOTUNE)

2023-09-12 17:53:28.672771: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-12 17:53:32.291135: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-09-12 17:53:32.313838: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-09-12 17:53:32.313885: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2d:00.0/numa_node
Your kernel may have been bu

['', '[UNK]', '[START]', '[END]', '.', ',', '?', 'bir', '!', 'bu']
['', '[UNK]', '[START]', '[END]', '.', ',', 'you', 'the', '?', 'i']


In [3]:
class Encoder(layers.Layer):
    """Bidirectional GRU encoder.

    The forward and backward GRU outputs are summed, and the full output
    sequence (one vector per input position) is returned.
    """

    def __init__(self, enc_units:int):
        """Create the recurrent stack; ``enc_units`` is the GRU width."""
        super().__init__()
        recurrent = layers.GRU(enc_units, return_sequences=True)
        self.gru = layers.Bidirectional(layer=recurrent, merge_mode="sum")

    def call(self, inputs, training=None, mask=None):
        """Run the bidirectional GRU over ``inputs``, honouring ``mask``."""
        return self.gru(inputs, mask=mask)
    
    


class Attention(layers.Layer):
    """Single-head attention over the context, with a residual connection
    followed by layer normalization."""

    def __init__(self, units):
        """Create the sub-layers; ``units`` is the attention key dimension."""
        super().__init__()
        self.mha = layers.MultiHeadAttention(num_heads=1, key_dim=units)
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()

    def call(self, inputs, *args, **kwargs):
        """Attend from the query sequence to the context sequence.

        Args:
            inputs: pair ``(x, context_sequences)``; ``x`` is the query and
                ``context_sequences`` serves as both key and value.

        Returns:
            Layer-normalized sum of the attention output and ``x``.
        """
        query, context_sequences = inputs
        attended = self.mha(query=query, value=context_sequences, key=context_sequences)
        return self.layernorm(self.add([attended, query]))


class Decoder(layers.Layer):
    """GRU decoder that attends over the encoder output.

    The target sequence goes through a GRU, the GRU output attends to the
    context sequences, and a dense layer projects the result to
    ``vocab_size`` logits per position.
    """

    def __init__(self, vocab_size, dec_units):
        """Create the sub-layers.

        Args:
            vocab_size: size of the output projection (number of logits).
            dec_units: width of the GRU and key dimension of the attention.
        """
        super(Decoder, self).__init__()
        self.gru = layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.fc = layers.Dense(vocab_size)
        self.attention = Attention(dec_units)

    def call(self, inputs, training=None, mask=None, state=None, return_state=False):
        """Decode one target sequence (or a single step) against the context.

        Args:
            inputs: pair ``(y, context_sequences)`` of embedded target tokens
                and encoder outputs.
            training: unused here; kept for the Keras call signature.
            mask: mask for ``y``, or a list/tuple with one mask per element
                of ``inputs`` (the form Keras supplies implicitly for nested
                inputs).
            state: optional initial GRU state, e.g. the state returned by a
                previous step during incremental decoding.
            return_state: when True, also return the final GRU state.

        Returns:
            Logits, or ``(logits, state)`` when ``return_state`` is True.
        """
        y, context_sequences = inputs
        # An implicit Keras mask mirrors the nesting of `inputs`, i.e. one
        # entry per input tensor. The GRU only consumes `y`, so hand it the
        # mask that belongs to `y` instead of the whole structure.
        if isinstance(mask, (list, tuple)):
            mask = mask[0]
        output, state = self.gru(y, initial_state=state, mask=mask)

        output = self.attention([output, context_sequences])
        x = self.fc(output)
        if return_state:
            return x, state
        else:
            return x


class Seq2Seq(tf.keras.Model):
    """Encoder-decoder translation model with its own token embeddings."""

    def __init__(self, encoder, decoder,enc_vocab_size,dec_vocab_size, embedding_dim):
        """Store the encoder/decoder and create one embedding per language.

        Both embeddings use ``mask_zero=True``, so token id 0 is treated as
        padding.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_embedding = layers.Embedding(enc_vocab_size, embedding_dim, mask_zero=True)
        self.decoder_embedding = layers.Embedding(dec_vocab_size, embedding_dim, mask_zero=True)

    def call(self, inputs, training=None, mask=None):
        """Map ``(source_ids, target_ids)`` to the decoder's output.

        Both id tensors are embedded, the source is encoded, and the decoder
        is run on the embedded target together with the encoder output.
        """
        source_ids, target_ids = inputs
        source_emb = self.encoder_embedding(source_ids)
        target_emb = self.decoder_embedding(target_ids)

        context_sequences = self.encoder(source_emb)
        return self.decoder((target_emb, context_sequences))

# Adam with its default hyperparameters; handed to model.compile further down.
optimizer = tf.keras.optimizers.Adam()

def masked_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over non-padding positions.

    Args:
        y_true: integer token ids; id 0 marks padding.
        y_pred: unnormalized logits.

    Returns:
        Scalar: summed loss of the non-padding positions divided by their count.
    """
    # Unreduced loss: one value per position.
    crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    per_token = crossentropy(y_true, y_pred)

    # 1.0 where a real token is expected, 0.0 on padding.
    keep = tf.cast(y_true != 0, per_token.dtype)

    return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)

def masked_acc(y_true, y_pred):
    """Token-level accuracy over non-padding positions.

    Args:
        y_true: integer token ids; id 0 marks padding.
        y_pred: per-token scores over the vocabulary (last axis).

    Returns:
        Scalar: number of correctly predicted non-padding tokens divided by
        the number of non-padding tokens.
    """
    # Most likely token id at every position.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)
    # Count matches on real tokens only. Without the mask, a padded position
    # where the model happens to predict id 0 would count as a hit and
    # inflate the accuracy.
    match = tf.cast(y_true == y_pred, tf.float32) * mask

    return tf.reduce_sum(match) / tf.reduce_sum(mask)

# Shared width for the embeddings, the encoder GRU and the decoder GRU.
DIM = 256

tr_vocab_size = tr_vectorizer.vocabulary_size()
en_vocab_size = en_vectorizer.vocabulary_size()

enc = Encoder(enc_units=DIM)
dec = Decoder(vocab_size=en_vocab_size, dec_units=DIM)


model = Seq2Seq(
    encoder=enc,
    decoder=dec,
    enc_vocab_size=tr_vocab_size,
    dec_vocab_size=en_vocab_size,
    embedding_dim=DIM,
)
model.compile(
    optimizer=optimizer,
    loss=masked_loss,
    metrics=[masked_acc],
    run_eagerly=False,
)

In [4]:
# Train for 15 epochs, using the opus100 test split as validation data.
model.fit(train_dataset, epochs=15, validation_data=test_dataset)

Epoch 1/15


2023-09-12 17:53:53.361117: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_14' with dtype string
	 [[{{node Placeholder/_14}}]]
2023-09-12 17:53:53.361385: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_17' with dtype int64
	 [[{{node Placeholder/_17}}]]
2023-09-12 17:53:55.518685: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/ReverseV2_grad/ReverseV2/ReverseV2/axis' with dtype int32 and shape [1




KeyboardInterrupt



In [None]:
def translate(input_text, max_length=50):
    """Greedily translate one Turkish sentence into English with ``model``.

    The sentence is tokenized with ``tr_vectorizer``, embedded and encoded
    once. The decoder is then run one token at a time, feeding back its own
    most likely token and GRU state, until it emits ``[END]`` or
    ``max_length`` tokens have been produced.

    Args:
        input_text: the source sentence as a Python string.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces. The ``[END]`` marker is
        not included.
    """
    # Look the marker ids up instead of hard-coding their vocabulary positions.
    vocab = en_vectorizer.get_vocabulary()
    start_id = vocab.index('[START]')
    end_id = vocab.index('[END]')

    source_ids = tr_vectorizer(tf.convert_to_tensor([input_text])).to_tensor()
    # The encoder and decoder layers consume embedded vectors, not token ids,
    # so the ids must go through the model's embedding layers first (exactly
    # as Seq2Seq.call does).
    context_sequences = model.encoder(model.encoder_embedding(source_ids))

    decoder_input = tf.constant([[start_id]])
    state = None
    results = []
    while len(results) < max_length:
        logits, state = model.decoder(
            [model.decoder_embedding(decoder_input), context_sequences],
            return_state=True,
            state=state,
        )
        # Greedy choice; the chosen token is the next decoder input.
        decoder_input = tf.argmax(logits, -1)
        token_id = int(decoder_input.numpy()[0, 0])
        if token_id == end_id:
            break
        results.append(token_id)

    return " ".join(vocab[i] for i in results)

print(translate("Ve ben ölüm oldum"))

In [None]:
# Check whether the word "student" is part of the fitted English vocabulary.
"student" in en_vectorizer.get_vocabulary()

In [None]:
# IDEA: Train word2vec on both languages, then feed the word2vec embeddings to the encoder and decoder instead of randomly initialized embeddings.
# IDEA: Use a better tokenization method, such as BPE (byte-pair encoding).