In [None]:
from keras import layers
from datasets import load_dataset
import tensorflow as tf
from tqdm import tqdm

def standardization(input_string):
    """Normalize raw sentences before ``TextVectorization`` splits them into tokens.

    The text is lowercased, reduced to spaces, ASCII letters, the Turkish
    letters ``ç ğ ı ö ş ü`` and the punctuation ``. ? ! ,``, the punctuation is
    separated from neighbouring words, and the result is wrapped in
    ``[START]`` / ``[END]`` markers.

    Args:
        input_string: A string tensor holding one or more sentences.

    Returns:
        A string tensor of the same shape with the normalized sentences.
    """
    # Lowercase with Unicode awareness; the default mode only touches ASCII and
    # would leave uppercase letters such as "Ö" or "Ş" unchanged.
    text = tf.strings.lower(input_string, encoding='utf-8')
    # Keep space, a to z, the Turkish letters, and select punctuation.
    # Restricting the set to a-z would delete ç/ğ/ı/ö/ş/ü from every Turkish word.
    text = tf.strings.regex_replace(text, '[^ a-zçğıöşü.?!,]', '')
    # Add spaces around punctuation so each mark becomes its own token.
    text = tf.strings.regex_replace(text, '[.?!,]', r' \0 ')
    # Strip leading/trailing whitespace.
    text = tf.strings.strip(text)

    # The markers are added after lowercasing so they keep their upper-case form.
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')

    return text

def process_text(inputs):
    """Turn a batch of raw sentence pairs into ``((context, targ_in), targ_out)``.

    Args:
        inputs: Mapping with a ``"tr"`` entry (source sentences) and an ``"en"``
            entry (target sentences).

    Returns:
        A ``(features, labels)`` pair where ``features`` is ``(context, targ_in)``
        and ``labels`` is ``targ_out``; all three are dense token-id tensors.
    """
    source_ids = tr_vectorizer(inputs["tr"])
    target_ids = en_vectorizer(inputs["en"])

    # The vectorizers return ragged rows; to_tensor() makes them rectangular.
    context = source_ids.to_tensor()
    # Decoder input: every target token except the last one ([START] onwards).
    targ_in = target_ids[:, :-1].to_tensor()
    # Expected output: the same sequence shifted left by one (ends with [END]).
    targ_out = target_ids[:, 1:].to_tensor()

    return (context, targ_in), targ_out


BATCH_SIZE = 64


def _load_translation_split(split):
    """Load one OPUS-100 en-tr split as a batched ``tf.data.Dataset`` of raw strings.

    Args:
        split: Split expression passed straight to ``load_dataset``
            (for example ``"test"`` or ``"train[:500000]"``).

    Returns:
        A ``tf.data.Dataset`` yielding dicts with ``"tr"`` and ``"en"`` string
        batches of size ``BATCH_SIZE``.
    """
    pairs = load_dataset("opus100", "en-tr", split=split)
    # Lift the two languages out of the nested "translation" record into flat columns.
    pairs = pairs.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
    pairs = pairs.remove_columns(["translation"])
    # Keyword arguments keep this call independent of the parameter order of to_tf_dataset.
    return pairs.to_tf_dataset(batch_size=BATCH_SIZE, columns=["tr", "en"])


train_text = _load_translation_split("train[:500000]")

# Both vocabularies are learned from the training sentences only.
tr_vectorizer = layers.TextVectorization(max_tokens=30000, standardize=standardization, ragged=True)
tr_vectorizer.adapt(train_text.map(lambda x: x["tr"]))
print(tr_vectorizer.get_vocabulary()[:10])

en_vectorizer = layers.TextVectorization(max_tokens=30000, standardize=standardization, ragged=True)
en_vectorizer.adapt(train_text.map(lambda x: x["en"]))
print(en_vectorizer.get_vocabulary()[:10])

# Tokenize in parallel and prefetch so input preparation overlaps with training.
train_dataset = train_text.map(process_text, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)


test_text = _load_translation_split("test")
test_dataset = test_text.map(process_text, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)


In [None]:
class Encoder(layers.Layer):
    """Bidirectional GRU that summarizes an embedded source sequence.

    The forward and backward GRU results are added together
    (``merge_mode="sum"``), so the output width equals ``enc_units``.
    """

    def __init__(self, enc_units:int):
        """Create the encoder.

        Args:
            enc_units: Number of units of the wrapped GRU.
        """
        super().__init__()
        recurrent = layers.GRU(enc_units)
        self.gru = layers.Bidirectional(layer=recurrent, merge_mode="sum")

    def call(self, inputs, training=None, mask=None):
        """Run the bidirectional GRU over ``inputs``, honouring ``mask`` if given."""
        return self.gru(inputs, mask=mask)


class Decoder(layers.Layer):
    """GRU decoder followed by a dense projection onto the target vocabulary."""

    def __init__(self, vocab_size:int, dec_units:int):
        """Create the decoder.

        Args:
            vocab_size: Size of the output projection (one score per target token).
            dec_units: Number of GRU units.
        """
        super(Decoder, self).__init__()
        self.gru = layers.GRU(dec_units, return_sequences=True)
        self.fc = layers.Dense(vocab_size)

    def call(self, inputs, training=None, mask=None):
        """Score the next token at every position of the target sequence.

        Args:
            inputs: Pair ``(x, state)`` where ``x`` is the embedded target
                sequence and ``state`` is used as the GRU's initial state.
            training: Forwarded to the GRU.
            mask: Either ``None``, a single mask for ``x``, or a list/tuple of
                masks parallel to ``inputs`` (only the first entry is used).

        Returns:
            Unnormalized scores over the vocabulary for every time step.
        """
        x, state = inputs
        # When masks arrive as a list/tuple parallel to `inputs`, only the one
        # belonging to the target sequence matters. Indexing unconditionally
        # would raise on `None` and slice a row out of a single mask tensor.
        if isinstance(mask, (list, tuple)):
            sequence_mask = mask[0]
        else:
            sequence_mask = mask
        output = self.gru(x, initial_state=state, mask=sequence_mask, training=training)
        return self.fc(output)


class Seq2Seq(tf.keras.Model):
    """Encoder-decoder translation model with separate source/target embeddings."""

    def __init__(self, encoder, decoder,enc_vocab_size,dec_vocab_size, embedding_dim):
        """Wire the given encoder and decoder together.

        Args:
            encoder: Layer applied to the embedded source sequence.
            decoder: Layer called with ``(embedded target, encoder result)``.
            enc_vocab_size: Number of rows of the source embedding table.
            dec_vocab_size: Number of rows of the target embedding table.
            embedding_dim: Width of both embedding tables.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # mask_zero=True marks token id 0 as padding for the downstream layers.
        self.encoder_embedding = layers.Embedding(
            enc_vocab_size, embedding_dim, mask_zero=True)
        self.decoder_embedding = layers.Embedding(
            dec_vocab_size, embedding_dim, mask_zero=True)

    def call(self, inputs, training=None, mask=None):
        """Map ``(source ids, target ids)`` to the decoder's output."""
        source_ids, target_ids = inputs
        source_emb = self.encoder_embedding(source_ids)
        target_emb = self.decoder_embedding(target_ids)

        encoded = self.encoder(source_emb)
        return self.decoder((target_emb, encoded))


def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding tokens.

    Args:
        y_true: Integer target token ids; id 0 is treated as padding.
        y_pred: Logits over the vocabulary for every position.

    Returns:
        Sum of the per-token losses at non-padding positions divided by the
        number of such positions.
    """
    # reduction='none' keeps one loss value per token so padding can be zeroed out.
    per_token = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)

    keep = tf.cast(y_true != 0, per_token.dtype)
    return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)


def masked_acc(y_true, y_pred):
    """Token-level accuracy that ignores padding positions.

    Args:
        y_true: Integer target token ids; id 0 is treated as padding.
        y_pred: Scores over the vocabulary (last axis) for every position.

    Returns:
        Number of correctly predicted non-padding tokens divided by the number
        of non-padding tokens.
    """
    # Pick the highest-scoring token id at every position.
    predicted = tf.argmax(y_pred, axis=-1)
    predicted = tf.cast(predicted, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)
    # Count matches at real tokens only. Without the mask, a predicted 0 on a
    # padded position is counted as a hit although the denominator excludes
    # padding, which inflates the metric.
    match = tf.cast(y_true == predicted, tf.float32) * mask

    return tf.reduce_sum(match) / tf.reduce_sum(mask)

# One width is shared by the embeddings, the encoder and the decoder; the
# encoder result is used as the decoder GRU's initial state, so the two unit
# counts have to agree.
DIM = 256

# Vocabulary sizes come from the adapted vectorizers.
tr_vocab_size = tr_vectorizer.vocabulary_size()
en_vocab_size = en_vectorizer.vocabulary_size()

enc = Encoder(enc_units=DIM)
dec = Decoder(vocab_size=en_vocab_size, dec_units=DIM)
optimizer = tf.keras.optimizers.Adam()

model = Seq2Seq(
    encoder=enc,
    decoder=dec,
    enc_vocab_size=tr_vocab_size,
    dec_vocab_size=en_vocab_size,
    embedding_dim=DIM,
)
model.compile(
    optimizer=optimizer,
    loss=masked_loss,
    metrics=[masked_acc],
    run_eagerly=False,
)
model.fit(train_dataset, validation_data=test_dataset, epochs=10)


In [None]:
# Second fit call on the same `model` object: runs 10 further epochs on top of
# whatever training has already been done, with the same train/validation data.
model.fit(train_dataset, validation_data=test_dataset, epochs=10)

In [None]:
def translate(input_text, max_length=50):
    """Greedily translate one Turkish sentence into English with the trained ``model``.

    Decoding starts from ``[START]`` and repeatedly appends the highest-scoring
    next token until ``[END]`` is produced or ``max_length`` tokens have been
    generated. The whole prefix is fed through ``model`` at every step, so the
    embeddings and padding masks are applied exactly as during training.

    Args:
        input_text: The source sentence as a Python string.
        max_length: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by single spaces (without ``[END]``).

    Raises:
        ValueError: If ``[START]`` or ``[END]`` is missing from the English vocabulary.
    """
    # Fetch the vocabulary once; it is needed for the marker ids and the final lookup.
    en_vocab = en_vectorizer.get_vocabulary()
    # Look the marker ids up instead of assuming fixed positions in the vocabulary.
    start_id = en_vocab.index('[START]')
    end_id = en_vocab.index('[END]')

    encoder_input = tr_vectorizer(tf.convert_to_tensor([input_text])).to_tensor()
    # Batch of one sequence that so far only contains [START].
    decoder_input = tf.constant([[start_id]], dtype=tf.int64)

    results = []
    while len(results) < max_length:
        logits = model((encoder_input, decoder_input), training=False)
        # Only the scores at the last position predict the not-yet-generated token.
        next_id = tf.argmax(logits[:, -1, :], axis=-1)
        token_id = int(next_id.numpy()[0])
        if token_id == end_id:
            break
        results.append(token_id)
        # Extend the prefix with the chosen token for the next step.
        decoder_input = tf.concat([decoder_input, next_id[:, tf.newaxis]], axis=-1)

    return " ".join(en_vocab[i] for i in results)

print(translate("Öğrenci olmak istiyorum."))

In [None]:
# Check whether the token "student" is part of the English vocabulary.
"student" in en_vectorizer.get_vocabulary()

In [None]:
# Hyperparameters and special tokens. Apart from BATCH_SIZE, none of these
# names is read by the cells visible above — presumably they configure a later
# experiment; confirm against the cells that follow.
BATCH_SIZE = 64  # same value as the one assigned in the data-loading cell
EPOCHS = 5  # This should be at least 10 for convergence
MAX_SEQUENCE_LENGTH = 40
TR_VOCAB_SIZE = 15000
EN_VOCAB_SIZE = 15000

EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

# NOTE(review): looks like the special tokens for a separately built
# vocabulary/tokenizer — verify where this list is consumed.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]