Įdiegiame reikalingas bibliotekas

In [1]:
!pip install -q --upgrade rouge-score
!pip install -q --upgrade keras-hub
!pip install -q --upgrade keras  # Upgrade to Keras 3.

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.1/792.1 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras-nlp 0.18.1 requires keras-hub==0.18.1, but you have keras-hub 0.20.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
keras-nlp 0.18.1 requires keras-hub==0.18.1, but you have keras-hub 0.20.0 which is incompatible.[0m[31m
[0m

Importuojame bibliotekas

In [2]:
import keras_hub
import pathlib
import random
import numpy
import keras
from keras import ops

import tensorflow.data as tf_data
from tensorflow_text.tools.wordpiece_vocab import (
    bert_vocab_from_dataset as bert_vocab,
)

Apsirašome hyperparametrus

In [None]:
ENG_VOCAB_SIZE = 40000
TGT_VOCAB_SIZE = 40000
MAX_SEQUENCE_LENGTH = 25
EPOCHS = 10
EMBED_DIM = 256
INTERMEDIATE_DIM = 2048
NUM_HEADS = 12
BATCH_SIZE = 64


Duomenų failai

In [4]:
text_file = "/content/spa.txt"
text_file_en = "/content/QED.en-lt.en"
text_file_lt = "/content/QED.en-lt.lt"

Duomenų paruošimas. Viršuje kodas skirtas lietuviškam rinkiniui, apačioje - ispaniškui.

In [5]:
# with open(text_file) as f:
#     lines = f.read().split("\n")[:-1]
# text_pairs = []
# for line in lines:
#     eng, spa = line.split("\t")
#     eng = eng.lower()
#     spa = spa.lower()
#     text_pairs.append((eng, spa))

with open(text_file_en) as f:
    lines_en = f.read().split("\n")[:-1]
with open(text_file_lt) as f:
    lines_lt = f.read().split("\n")[:-1]
text_pairs = []
for line_en, line_lt in zip(lines_en, lines_lt):
    line_en = line_en.lower()
    line_lt = line_lt.lower()
    text_pairs.append((line_en, line_lt))

Pažiūrime kelias poras

In [6]:
for _ in range(5):
    print(random.choice(text_pairs))

('then i got a call from new york city asking if i could adapt these concepts to times square or the high line.', 'po to aš sulaukiau skambučio iš new york\'o; manęs klausė, ar galėčiau pritaikyti savo kūrybą "times" aikštei ar "high line" parkui.')
('how much money does it take to do this?', 'kiek reikia pinigų šiam reikalui?')
("the man's business was a small one, and there was nothing in his house which could account for such elaborate preparations, and such an expenditure as they were at. it must, then, be something out of the house.", 'žmogaus veikla buvo maža ir nebuvo nieko jo namuose, kurie galėtų sudaryti, pavyzdžiui parengti preparatai, ir tokios išlaidos, kaip jie buvo. , tada jis turi būti kažkas iš namo.')
('when they were quite out of sight, phineas began to bestir himself.', 'kai jie buvo visiškai iš akių, phineas pradėjo sukrusti pats.')
('let me concentrate, close my eyes. come, come.', 'leiskite man susikaupti, aš užsimerkiu.')


Išskiriame į treniravimo, validacijos ir testų poras.

In [7]:
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples :]

print(f"{len(text_pairs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")


85435 total pairs
59805 training pairs
12815 validation pairs
12815 test pairs


Paruošiame tokenizer'ius su keras_hub biblioteka. Tokenizuoja "sub-words"

In [8]:

def train_word_piece(text_samples, vocab_size, reserved_tokens):
    word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)
    vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
        word_piece_ds.batch(1000).prefetch(2),
        vocabulary_size=vocab_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab


Specialūs tokenai: [PAD], [UNK], [START], [END].
Ištreniruojame žodynus.

In [None]:
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

eng_samples = [text_pair[0] for text_pair in train_pairs]
eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens)

tgt_samples = [text_pair[1] for text_pair in train_pairs]
tgt_vocab = train_word_piece(tgt_samples, TGT_VOCAB_SIZE, reserved_tokens)

Apsirašome tokenizerius su ištreniruotais žodynais.

In [None]:
eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=eng_vocab, lowercase=False
)
tgt_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=tgt_vocab, lowercase=False
)

Suformatuojame duomenų rinkinius.
Pridedame [PAD], bei [START] ir [END].
"make_dataset" - sukuria TensorFlow Dataset objektą iš sąrašų.

In [None]:

def preprocess_batch(eng, tgt):
    batch_size = ops.shape(tgt)[0]

    eng = eng_tokenizer(eng)
    tgt = tgt_tokenizer(tgt)

    eng = eng[:, :MAX_SEQUENCE_LENGTH]
    tgt = tgt[:, :MAX_SEQUENCE_LENGTH + 1]

    eng_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH,
        pad_value=eng_tokenizer.token_to_id("[PAD]"),
    )
    eng = eng_start_end_packer(eng)

    tgt_start_end_packer = keras_hub.layers.StartEndPacker(
        sequence_length=MAX_SEQUENCE_LENGTH + 1,
        start_value=tgt_tokenizer.token_to_id("[START]"),
        end_value=tgt_tokenizer.token_to_id("[END]"),
        pad_value=tgt_tokenizer.token_to_id("[PAD]"),
    )
    tgt = tgt_start_end_packer(tgt)

    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": tgt[:, :-1],
        },
        tgt[:, 1:],
    )


def make_dataset(pairs):
    eng_texts, tgt_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    tgt_texts = list(tgt_texts)
    dataset = tf_data.Dataset.from_tensor_slices((eng_texts, tgt_texts))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE)
    return dataset.shuffle(2048).prefetch(16).cache()


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

Pažiūrime į gautus shape

In [12]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")


inputs["encoder_inputs"].shape: (64, 25)
inputs["decoder_inputs"].shape: (64, 25)
targets.shape: (64, 25)


Paruošiamas modelis naudojantis keras_hub bibliotekas.

Encoder:

įvestis, embedding sluoksnis, encoder išvestis su attention.

Decoder:

įvestys, embedding sluoksnis, decoder sluoksnis su attention, dropout, dense išvestis.

In [None]:
# Encoder
encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=ENG_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
)(encoder_inputs)

encoder_outputs = keras_hub.layers.TransformerEncoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(inputs=x)
encoder = keras.Model(encoder_inputs, encoder_outputs)


# Decoder
decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs")

x = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=TGT_VOCAB_SIZE,
    sequence_length=MAX_SEQUENCE_LENGTH + 1, # Changed here
    embedding_dim=EMBED_DIM,
)(decoder_inputs)

x = keras_hub.layers.TransformerDecoder(
    intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS
)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)
x = keras.layers.Dropout(0.5)(x)
decoder_outputs = keras.layers.Dense(TGT_VOCAB_SIZE, activation="softmax")(x)
decoder = keras.Model(
    [
        decoder_inputs,
        encoded_seq_inputs,
    ],
    decoder_outputs,
)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])

transformer = keras.Model(
    [encoder_inputs, decoder_inputs],
    decoder_outputs,
    name="transformer",
)

Modelio treniravimas.

In [14]:
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)

Epoch 1/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 103ms/step - accuracy: 0.4615 - loss: 4.5031 - val_accuracy: 0.5150 - val_loss: 3.4608
Epoch 2/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 86ms/step - accuracy: 0.5211 - loss: 3.4285 - val_accuracy: 0.5370 - val_loss: 3.1623
Epoch 3/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 85ms/step - accuracy: 0.5392 - loss: 3.1678 - val_accuracy: 0.5482 - val_loss: 3.0067
Epoch 4/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 85ms/step - accuracy: 0.5517 - loss: 3.0025 - val_accuracy: 0.5575 - val_loss: 2.9244
Epoch 5/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 85ms/step - accuracy: 0.5640 - loss: 2.8818 - val_accuracy: 0.5616 - val_loss: 2.9162
Epoch 6/10
[1m935/935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 87ms/step - accuracy: 0.5751 - loss: 2.7855 - val_accuracy: 0.5685 - val_loss: 2.8665
Epoch 7/10
[

<keras.src.callbacks.history.History at 0x7c429c21f690>

Paduodame sakinį vertimui, gauname rezultatą

In [None]:
def decode_sequences(input_sentences):
    batch_size = 1

    encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
    encoder_input_tokens = encoder_input_tokens[:, :MAX_SEQUENCE_LENGTH]
    if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
        pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)
        encoder_input_tokens = ops.concatenate(
            [encoder_input_tokens, pads], 1
        )

    def next(prompt, cache, index):

        logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :]
        hidden_states = None
        return logits, hidden_states, cache


    length = MAX_SEQUENCE_LENGTH + 1
    start = ops.full((batch_size, 1), tgt_tokenizer.token_to_id("[START]"))
    pad = ops.full((batch_size, length - 1), tgt_tokenizer.token_to_id("[PAD]"))
    prompt = ops.concatenate((start, pad), axis=-1)

    generated_tokens = keras_hub.samplers.GreedySampler()(
        next,
        prompt,
        stop_token_ids=[tgt_tokenizer.token_to_id("[END]")],
        index=1,  # Start sampling after start token.
    )
    generated_sentences = tgt_tokenizer.detokenize(generated_tokens)
    return generated_sentences

test_eng_texts = [pair[0] for pair in test_pairs]
for i in range(20):
    input_sentence = random.choice(test_eng_texts)
    translated = decode_sequences([input_sentence])[0]
    translated = translated.replace("[PAD]", "").replace("[START]", "").replace("[END]", "").strip()
    print(f"** Example {i} **")
    print(input_sentence)
    print(translated)
    print()

** Example 0 **
mercutlo come, sir, your passado.
mercutio ateik , pone , pone .

** Example 1 **
the truth may lie between these extremes.
tiesa yra melaptimi tarp šių kraštutinis .

** Example 2 **
the reason for this is both that freedom is in and of itself good, valuable, worthwhile, essential to being human.
dėl to , dėl to , kad tiek daug laiko ir vertinga , yra tokia svarbu , kad būtų svarbu būti svarbu būti

** Example 3 **
everybody would like to make people happier.
visi žmonės , kaip žmonės daro laimingesni .

** Example 4 **
"i knocked, but seemingly--"
" aš pasirkas , bet aš "

** Example 5 **
the first signal was for everybody to hold up these four-foot tall letters that spelled out "look up more," the name of the project.
pirmasis pasistogrįžtu , kad visi šie keturi atsirkas , kurie iš karto " ra

** Example 6 **
and in many places they are worth less than goats and cows.
ir daugelis yra tiek daug vietos , nei karbes ir karūrybims .

** Example 7 **
he is not supported b

Atliekame matavimus

In [17]:
rouge_1 = keras_hub.metrics.RougeN(order=1)
rouge_2 = keras_hub.metrics.RougeN(order=2)

from nltk.translate.bleu_score import sentence_bleu

bleu_scores = []

for test_pair in test_pairs[:30]:
    input_sentence = test_pair[0]
    reference_sentence = test_pair[1]

    translated_sentence = decode_sequences([input_sentence])[0]
    translated_sentence = translated_sentence.replace("[PAD]", "").replace("[START]", "").replace("[END]", "").strip()

    rouge_1(reference_sentence, translated_sentence)
    rouge_2(reference_sentence, translated_sentence)


    reference = [reference_sentence.split()]
    candidate = translated_sentence.split()
    bleu_scores.append(sentence_bleu(reference, candidate))

print("ROUGE-1 Score: ", rouge_1.result())
print("ROUGE-2 Score: ", rouge_2.result())


print("Average BLEU Score: ", sum(bleu_scores) / len(bleu_scores))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ROUGE-1 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.28708896040916443>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.2636148929595947>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.2651635706424713>}
ROUGE-2 Score:  {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.10509468615055084>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.10363973677158356>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.10097070783376694>}
Average BLEU Score:  7.617146588014677e-80


Išsaugome modelį kad galėtume naudoti vėliau

In [None]:
# transformer.save("english_to_lt_translatorhub4.keras")
# with open("englt_vocab4.txt", "w") as f:
#         for token in eng_vocab:
#             f.write(f"{token}\n")

# with open("ltu_vocab4.txt", "w") as f:
#         for token in tgt_vocab:
#             f.write(f"{token}\n")