In [2]:
!pip install datasets keras-nlp

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting keras-nlp
  Downloading keras_nlp-0.6.1-py3-none-any.whl (573 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m573.5/573.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━

In [3]:
from datasets import load_dataset
import tensorflow as tf
from tensorflow import keras
from keras import layers
import keras_nlp

Using TensorFlow backend


In [4]:
BATCH_SIZE = 64
EPOCHS = 10  # This should be at least 10 for convergence
MAX_SEQUENCE_LENGTH = 40
TR_VOCAB_SIZE = 15000
EN_VOCAB_SIZE = 15000

EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 8

reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

In [5]:
train_data = load_dataset("opus100", "en-tr", split="train[:500000]")
train_dataset = train_data.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
train_dataset = train_dataset.to_tf_dataset(50000, columns=["tr", "en"])

Downloading builder script:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/192k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/46.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.9M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

In [6]:
for x in train_dataset.take(1):
  print(x)

{'tr': <tf.Tensor: shape=(50000,), dtype=string, numpy=
array([b'Bir \xc5\x9fey buldum.', b'Hadi ama, ne s\xc3\xb6yledim ki?',
       b'Son sekiz saat i\xc3\xa7erisinde bloga neredeyse 2000 ki\xc5\x9fi bakm\xc4\xb1\xc5\x9f.',
       ...,
       b'\xc5\x9eu anda bir sald\xc4\xb1r\xc4\xb1 ger\xc3\xa7ekle\xc5\x9fiyor.',
       b'- Hi\xc3\xa7 arkada\xc5\x9f\xc4\xb1n yok,di mi?',
       b'Annem sanki...'], dtype=object)>, 'en': <tf.Tensor: shape=(50000,), dtype=string, numpy=
array([b'I got something.', b'Come on, what the hell did I say?',
       b'This blog has had nearly 2,000 hits in the last eight hours.',
       ..., b"That's right. There's an attack in progress.",
       b"- You don't have any friends, do you?", b'My mother was...'],
      dtype=object)>}


In [None]:
# tr_vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
#         train_dataset.map(lambda x: x['tr']).prefetch(2),
#         vocabulary_size=TR_VOCAB_SIZE,
#         reserved_tokens=reserved_tokens,
#     )
# 
# en_vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
#         train_dataset.map(lambda x: x['en']).prefetch(2),
#         vocabulary_size=EN_VOCAB_SIZE,
#         reserved_tokens=reserved_tokens,
#     )

In [None]:
# Save the vocabulary
# with open("tr_vocab.txt",'w') as f:
#   for word in tr_vocab:
#     f.write(f"{word}\n")
# 
# with open("en_vocab.txt",'w') as f:
#   for word in en_vocab:
#     f.write(f"{word}\n")

In [8]:
# Load the vocabulary
tr_vocab = []
en_vocab = []

with open("vocab/tr_vocab.txt",'r') as f:
  for word in f.readlines():
    if word.strip():
      tr_vocab.append(word.strip())

with open("vocab/en_vocab.txt",'r') as f:
  for word in f.readlines():
    if word.strip():
      en_vocab.append(word.strip())

In [9]:
print("English Tokens: ", en_vocab[0:11])
print("Turkish Tokens: ", tr_vocab[0:11])

English Tokens:  ['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&', "'"]
Turkish Tokens:  ['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&', "'"]


In [10]:
len(en_vocab)

14763

In [11]:
eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=set(en_vocab), lowercase=True
)
tr_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=set(tr_vocab), lowercase=True
)

In [12]:
eng_tokenizer.vocabulary_size()

14761

In [13]:
eng_input_ex = train_data['translation'][0]['en']
eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex)
print("English sentence: ", eng_input_ex)
print("Tokens: ", eng_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    eng_tokenizer.detokenize(eng_tokens_ex),
)

print()

tr_input_ex = train_data['translation'][0]['tr']
tr_tokens_ex = tr_tokenizer.tokenize(tr_input_ex)
print("Turkish sentence: ", tr_input_ex)
print("Tokens: ", tr_tokens_ex)
print(
    "Recovered text after detokenizing: ",
    tr_tokenizer.detokenize(tr_tokens_ex),
)

English sentence:  I got something.
Tokens:  tf.Tensor([9141  970 3072 9474], shape=(4,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'i got something .', shape=(), dtype=string)

Turkish sentence:  Bir şey buldum.
Tokens:  tf.Tensor([ 539  353 7524 9481], shape=(4,), dtype=int32)
Recovered text after detokenizing:  tf.Tensor(b'bir \xc5\x9fey buldum .', shape=(), dtype=string)


In [14]:
# Pad `tr` to `MAX_SEQUENCE_LENGTH`.
tr_start_end_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=MAX_SEQUENCE_LENGTH,
    pad_value=tr_tokenizer.token_to_id("[PAD]"),
)

# Add special tokens (`"[START]"` and `"[END]"`) to `en` and pad it as well.
en_start_end_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=MAX_SEQUENCE_LENGTH + 1,
    start_value=eng_tokenizer.token_to_id("[START]"),
    end_value=eng_tokenizer.token_to_id("[END]"),
    pad_value=eng_tokenizer.token_to_id("[PAD]"),
)

def preprocess_batch(tr, en):

    tr = tr_tokenizer(tr)
    tr = tr_start_end_packer(tr)


    en = eng_tokenizer(en)
    en = en_start_end_packer(en)

    return (
        {
            "encoder_inputs": tr,
            "decoder_inputs": en[:, :-1],
        },
        en[:, 1:],
    )


def make_dataset(dataset):
    dataset = dataset.map(lambda x: (x['tr'],x['en'])).unbatch()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16)


train_ds = make_dataset(train_dataset)

test_dataset = load_dataset("opus100", "en-tr", split="test")
test_dataset = test_dataset.map(lambda x: {"tr": x["translation"]['tr'], "en": x["translation"]['en']})
test_dataset = test_dataset.remove_columns(["translation"])
test_dataset = test_dataset.to_tf_dataset(BATCH_SIZE, columns=["tr", "en"])
val_ds = make_dataset(test_dataset)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (64, 40)
inputs["decoder_inputs"].shape: (64, 40)
targets.shape: (64, 40)


In [16]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=tr_tokenizer.vocabulary_size(),
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(encoder_inputs)

encoder_outputs = keras_nlp.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)

encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=EN_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(decoder_inputs)

x = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(eng_tokenizer.vocabulary_size(), activation="softmax")(x)

decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)


In [17]:
transformer.summary()
transformer.compile(
    "adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"],jit_compile=True
)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 token_and_position_embeddi  (None, None, 256)            3783936   ['encoder_inputs[0][0]']      
 ng (TokenAndPositionEmbedd                                                                       
 ing)                                                                                             
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                  

In [18]:
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7882b13e1de0>

In [21]:
import random
def decode_sequences(input_sentences):
    batch_size = tf.shape(input_sentences)[0]

    # Tokenize the encoder input.
    encoder_input_tokens = tr_tokenizer(input_sentences).to_tensor(
        shape=(None, MAX_SEQUENCE_LENGTH)
    )

    # Define a function that outputs the next token's probability given the
    # input sequence.
    def next(prompt, cache, index):
        logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        # Ignore hidden states for now; only needed for contrastive search.
        hidden_states = None
        return logits, hidden_states, cache

    # Build a prompt of length 40 with a start token and padding tokens.
    length = 40
    start = tf.fill((batch_size, 1), eng_tokenizer.token_to_id("[START]"))
    pad = tf.fill((batch_size, length - 1), eng_tokenizer.token_to_id("[PAD]"))
    prompt = tf.concat((start, pad), axis=-1)

    generated_tokens = keras_nlp.samplers.GreedySampler()(
        next,
        prompt,
        end_token_id=eng_tokenizer.token_to_id("[END]"),
        index=1,  # Start sampling after start token.
    )
    generated_sentences = eng_tokenizer.detokenize(generated_tokens)
    return generated_sentences


test_eng_texts = ["Merhaba ben nusret","Ben öğrenci olmak istiyorum","Ve ben ölüm oldum"]
for i, text in enumerate(test_eng_texts):
    translated = decode_sequences(tf.constant([text]))
    translated = translated.numpy()[0].decode("utf-8")
    translated = (
        translated.replace("[PAD]", "")
        .replace("[START]", "")
        .replace("[END]", "")
        .strip()
    )
    print(f"** Example {i} **")
    print(text)
    print(translated)
    print()

** Example 0 **
Merhaba ben nusret
i say hello to the nas i . i . i ' m not a little .

** Example 1 **
Ben öğrenci olmak istiyorum
i want to be a student i want to be a student , i ' m a student , i ' m a student , i ' m a student .

** Example 2 **
Ve ben ölüm oldum
and i ' m death . . . i ' m a death .


In [22]:
transformer.save("transformer_model")

In [28]:
!zip -r transformer_model.zip transformer_model/

  adding: transformer_model/ (stored 0%)
  adding: transformer_model/fingerprint.pb (stored 0%)
  adding: transformer_model/saved_model.pb (deflated 89%)
  adding: transformer_model/assets/ (stored 0%)
  adding: transformer_model/keras_metadata.pb (deflated 93%)
  adding: transformer_model/variables/ (stored 0%)
  adding: transformer_model/variables/variables.data-00000-of-00001 (deflated 15%)
  adding: transformer_model/variables/variables.index (deflated 75%)
