In [None]:
!pip install datasets keras-nlp

In [1]:
from datasets import load_dataset
import tensorflow as tf
from tensorflow import keras
from keras import layers
import keras_nlp

Using TensorFlow backend


In [2]:
BATCH_SIZE = 64
EPOCHS = 10  # This should be at least 10 for convergence
MAX_SEQUENCE_LENGTH = 40
TR_VOCAB_SIZE = 15000
EN_VOCAB_SIZE = 15000

EMBED_DIM = 256
INTERMEDIATE_DIM = 2048

reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

In [3]:
train_data = load_dataset("opus100", "en-tr", split="train[:500000]")
train_dataset = train_data.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
train_dataset = train_dataset.to_tf_dataset(50000, columns=["tr", "en"])

In [4]:
# Load the vocabulary
tr_vocab = []
en_vocab = []

with open("tr_vocab.txt",'r') as f:
  for word in f.readlines():
    tr_vocab.append(word.strip())

with open("en_vocab.txt",'r') as f:
  for word in f.readlines():
    en_vocab.append(word.strip())

In [5]:
eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=set(en_vocab), lowercase=True
)
tr_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=set(tr_vocab), lowercase=True
)

In [6]:
# Pad `tr` to `MAX_SEQUENCE_LENGTH`.
tr_start_end_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=MAX_SEQUENCE_LENGTH,
    pad_value=tr_tokenizer.token_to_id("[PAD]"),
)

# Add special tokens (`"[START]"` and `"[END]"`) to `en` and pad it as well.
en_start_end_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=MAX_SEQUENCE_LENGTH + 1,
    start_value=eng_tokenizer.token_to_id("[START]"),
    end_value=eng_tokenizer.token_to_id("[END]"),
    pad_value=eng_tokenizer.token_to_id("[PAD]"),
)

def preprocess_batch(tr, en):
    batch_size = tf.shape(en)[0]

    tr = tr_tokenizer(tr)
    tr = tr_start_end_packer(tr)


    en = eng_tokenizer(en)
    en = en_start_end_packer(en)

    return (
        {
            "encoder_inputs": tr,
            "decoder_inputs": en[:, :-1],
        },
        en[:, 1:],
    )


def make_dataset(dataset):
    dataset = dataset.map(lambda x: (x['tr'],x['en'])).unbatch()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16)


train_ds = make_dataset(train_dataset)

test_dataset = load_dataset("opus100", "en-tr", split="test")
test_dataset = test_dataset.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
test_dataset = test_dataset.remove_columns(["translation"])
test_dataset = test_dataset.to_tf_dataset(BATCH_SIZE, columns=["tr", "en"])
val_ds = make_dataset(test_dataset)

In [None]:
for x in train_ds.take(1):
  print(x)

In [7]:
class Encoder(layers.Layer):
    def __init__(self, enc_units:int):
        super(Encoder, self).__init__()
        self.gru = layers.Bidirectional(
            merge_mode="sum",
            layer=layers.GRU(enc_units)
        )

    def call(self, inputs, training=None, mask=None):
        output = self.gru(inputs,mask=mask)
        return output


class Decoder(layers.Layer):
    def __init__(self, vocab_size:int, dec_units:int):
        super(Decoder, self).__init__()
        self.gru = layers.GRU(dec_units, return_sequences=True)
        self.fc = layers.Dense(vocab_size)

    def call(self, inputs, training=None, mask=None):
        # X is the target sentence, y is the encoder output
        x, state = inputs
        output = self.gru(x, initial_state=state,mask=mask[0])
        x = self.fc(output)
        return x


class Seq2Seq(tf.keras.Model):
    def __init__(self, encoder, decoder,enc_vocab_size,dec_vocab_size, embedding_dim):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_embedding = layers.Embedding(enc_vocab_size, embedding_dim, mask_zero=True)
        self.decoder_embedding = layers.Embedding(dec_vocab_size, embedding_dim, mask_zero=True)

    def call(self, inputs, training=None, mask=None):
        x, y = inputs['encoder_inputs'], inputs['decoder_inputs']
        x = self.encoder_embedding(x)
        y = self.decoder_embedding(y)

        state = self.encoder(x)
        y = self.decoder((y, state))
        return y


def masked_loss(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)

    # Mask off the losses on padding.
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    # Return the total.
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)


def masked_acc(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)

    return tf.reduce_sum(match) / tf.reduce_sum(mask)

enc = Encoder(enc_units=INTERMEDIATE_DIM)
dec = Decoder(dec_units=INTERMEDIATE_DIM,vocab_size=eng_tokenizer.vocabulary_size())
optimizer = tf.keras.optimizers.Adam()

model = Seq2Seq(encoder=enc, decoder=dec,enc_vocab_size=tr_tokenizer.vocabulary_size(),dec_vocab_size=eng_tokenizer.vocabulary_size(),embedding_dim=EMBED_DIM)
model.compile(optimizer=optimizer, loss=masked_loss, metrics=[masked_acc], run_eagerly=False)

In [None]:
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

In [None]:
model.summary()

In [27]:
def translate(input_text, max_length=50):

    input_text = tf.convert_to_tensor([input_text])
        # Tokenize the encoder input.
    encoder_input = tr_tokenizer(input_text).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )
    decoder_input = tf.expand_dims([eng_tokenizer.token_to_id("[START]")], 0)


    end = False
    results = []
    while not end:
        output = model({"encoder_inputs": encoder_input, "decoder_inputs":decoder_input})
        output = tf.argmax(output[:,-1,:], -1,output_type=tf.int32)
        results.append(output.numpy()[0])

        if output.numpy()[0] == eng_tokenizer.token_to_id("[END]") or len(results) >= max_length:
            end = True
        output = tf.reshape(output,[-1,1])
        decoder_input = tf.concat([decoder_input,output],axis=-1)

    return eng_tokenizer.detokenize(results)

print(translate("Merhaba ben nusret"))
print(translate("Ben öğrenci olmak istiyorum"))
print(translate("Ve ben ölüm oldum"))

tf.Tensor(b"hey , what ' s the fuck ? [END]", shape=(), dtype=string)
tf.Tensor(b"timrist , i ' m a little bit of the time . [END]", shape=(), dtype=string)
tf.Tensor(b"and i ' m not a couple of the world . [END]", shape=(), dtype=string)
