# In [3]:
# -*- coding: utf-8 -*-
"""
Transformer encoder-decoder (sửa lỗi, chú thích tiếng Việt).
Dùng với file input_texts.txt và label_texts.txt (mỗi dòng 1 câu).
Chạy được trên môi trường local (Colab/Local Jupyter).
"""

import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import numpy as np
import os

# ---------------------------
# 1) Hàm tiền xử lý văn bản (chuẩn hoá)
# ---------------------------
def custom_standardization(text):
    """Normalize raw text before it is tokenized.

    Three steps are applied in order: lowercase the text, delete every
    character that is not ``a-z``, ``0-9``, whitespace or a square bracket
    (brackets are kept so the ``[sos]`` / ``[eos]`` markers survive), and
    collapse each run of whitespace into a single space.

    Args:
        text: String tensor (or value accepted by ``tf.strings`` ops).

    Returns:
        The normalized string tensor.
    """
    # NOTE(review): the character class is ASCII-only, so accented letters
    # are removed as well -- confirm this is intended for the training data.
    lowered = tf.strings.lower(text)
    cleaned = tf.strings.regex_replace(lowered, r'[^a-z0-9\s\[\]]', '')
    return tf.strings.regex_replace(cleaned, r'\s+', ' ')

# ---------------------------
# 2) Đọc dữ liệu từ file (local)
# ---------------------------
def load_and_prepare_data(input_file, label_file, max_samples=25000):
    """Load line-aligned input/label sentences and wrap them in markers.

    Line ``i`` of ``input_file`` is treated as the partner of line ``i`` of
    ``label_file``. A pair is dropped when either side is blank after
    stripping, so a blank line in only one file can no longer shift every
    following pair out of alignment. Lines without a partner (one file
    longer than the other) are ignored.

    Args:
        input_file: Path to the UTF-8 text file with one source sentence
            per line.
        label_file: Path to the UTF-8 text file with one target sentence
            per line.
        max_samples: Maximum number of pairs to keep.

    Returns:
        A tuple ``(input_texts, label_texts)`` of equally long lists whose
        items have the form ``"[sos] <sentence> [eos]"``.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        raw_inputs = f.read().splitlines()
    with open(label_file, 'r', encoding='utf-8') as f:
        raw_labels = f.read().splitlines()

    # Filter pair-wise (not per file) so the two lists stay aligned.
    pairs = []
    for source, target in zip(raw_inputs, raw_labels):
        source, target = source.strip(), target.strip()
        if source and target:
            pairs.append((source, target))

    pairs = pairs[:max_samples]
    n = len(pairs)

    # Add the start/end markers expected by the vectorizers and the decoder.
    input_texts = [f"[sos] {source} [eos]" for source, _ in pairs]
    label_texts = [f"[sos] {target} [eos]" for _, target in pairs]

    print(f"Đã nạp {n} mẫu.")
    return input_texts, label_texts

# ---------------------------
# 3) Tạo TextVectorization (vectorizer)
# ---------------------------
def create_text_vectorizer(texts, max_tokens=10000, max_len=40):
    """Create a ``TextVectorization`` layer and adapt it to ``texts``.

    Args:
        texts: Collection of strings passed to ``adapt``.
        max_tokens: Passed through as the layer's ``max_tokens``.
        max_len: Passed through as ``output_sequence_length``.

    Returns:
        The adapted ``TextVectorization`` layer, which standardizes text
        with ``custom_standardization``.
    """
    layer_config = dict(
        max_tokens=max_tokens,
        output_sequence_length=max_len,
        standardize=custom_standardization,
    )
    text_vectorizer = TextVectorization(**layer_config)
    text_vectorizer.adapt(texts)
    return text_vectorizer

# ---------------------------
# 4) Positional Encoding
# ---------------------------
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal position table to its input.

    The table is computed once in the constructor with shape
    ``(1, max_len, d_model)``: even feature columns hold sines and odd
    feature columns hold cosines of the position-dependent angles.
    """

    def __init__(self, max_len, d_model):
        """Precompute the table for ``max_len`` positions and ``d_model`` features."""
        super().__init__()
        positions = np.arange(max_len)[:, np.newaxis]   # (max_len, 1)
        dims = np.arange(d_model)[np.newaxis, :]        # (1, d_model)
        # Columns 2k and 2k+1 share the same frequency.
        exponents = (2 * (dims // 2)) / np.float32(d_model)
        table = positions * (1 / np.power(10000, exponents))
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])
        self.pos_encoding = tf.cast(table[np.newaxis, ...], tf.float32)

    def call(self, x):
        """Return ``x`` plus the first ``tf.shape(x)[1]`` rows of the table."""
        length = tf.shape(x)[1]
        return x + self.pos_encoding[:, :length, :]

# ---------------------------
# 5) Multi-Head Attention Layer
# ---------------------------
class MultiHeadAttentionLayer(layers.Layer):
    """Multi-head scaled dot-product attention built from four Dense layers.

    Queries, keys and values are each projected to ``d_model`` features,
    split into ``num_heads`` heads of ``d_model // num_heads`` features,
    attended per head, merged back and passed through an output projection.
    """

    def __init__(self, d_model, num_heads):
        """Create the projections; ``d_model`` must be divisible by ``num_heads``."""
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x):
        """Reshape to (batch, -1, heads, depth) and return it as (batch, heads, seq_len, depth)."""
        per_head = tf.reshape(x, (tf.shape(x)[0], -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def scaled_dot_product_attention(self, q, k, v, mask):
        """Compute softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v.

        Args:
            q, k, v: Head-split query, key and value tensors.
            mask: Tensor added to the logits after scaling by ``-1e9``
                (entries equal to 1 are suppressed), or ``None``.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
        logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)
        if mask is not None:
            logits = logits + (mask * -1e9)
        weights = tf.nn.softmax(logits, axis=-1)
        return tf.matmul(weights, v), weights

    def call(self, v, k, q, mask):
        """Run attention; note the argument order is (value, key, query, mask).

        Returns:
            Tuple ``(output, attention_weights)`` where ``output`` has
            ``d_model`` features in its last axis.
        """
        q_heads = self.split_heads(self.wq(q))
        k_heads = self.split_heads(self.wk(k))
        v_heads = self.split_heads(self.wv(v))

        context, attention_weights = self.scaled_dot_product_attention(
            q_heads, k_heads, v_heads, mask
        )
        # Back to (batch, seq_len, heads, depth), then merge the heads.
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (tf.shape(q_heads)[0], -1, self.d_model))
        return self.dense(merged), attention_weights

# ---------------------------
# 6) EncoderLayer & DecoderLayer
# ---------------------------
class EncoderLayer(layers.Layer):
    """One encoder block: self-attention and a feed-forward net, each followed
    by dropout, a residual connection and layer normalization."""

    def __init__(self, d_model, num_heads, ff_dim, dropout_rate=0.1):
        """Create the attention, feed-forward, normalization and dropout sub-layers."""
        super().__init__()
        self.mha = MultiHeadAttentionLayer(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, mask, training=False):
        """Apply the block to ``x`` using ``mask`` for self-attention.

        Args:
            x: Input sequence tensor.
            mask: Attention mask forwarded to the attention layer.
            training: Forwarded to the dropout layers.

        Returns:
            The transformed sequence tensor.
        """
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed_attention = self.layernorm1(x + attended)

        fed_forward = self.dropout2(self.ffn(normed_attention), training=training)
        return self.layernorm2(normed_attention + fed_forward)

class DecoderLayer(layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output, and a feed-forward net, each followed by dropout, a residual
    connection and layer normalization."""

    def __init__(self, d_model, num_heads, ff_dim, dropout_rate=0.1):
        """Create the two attention layers, the feed-forward net, norms and dropouts."""
        super().__init__()
        self.mha1 = MultiHeadAttentionLayer(d_model, num_heads)
        self.mha2 = MultiHeadAttentionLayer(d_model, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation='relu'),
            Dense(d_model),
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, look_ahead_mask, padding_mask, training=False):
        """Apply the block.

        Args:
            x: Decoder input sequence tensor.
            enc_output: Encoder output used as keys and values of the
                second attention.
            look_ahead_mask: Mask for the decoder self-attention.
            padding_mask: Mask for the attention over ``enc_output``.
            training: Forwarded to the dropout layers.

        Returns:
            Tuple ``(output, self_attention_weights, cross_attention_weights)``.
        """
        # 1) Self-attention over the decoder input.
        self_attn, self_attn_weights = self.mha1(x, x, x, look_ahead_mask)
        self_attn = self.dropout1(self_attn, training=training)
        after_self = self.layernorm1(self_attn + x)

        # 2) Attention with decoder queries and encoder keys/values.
        cross_attn, cross_attn_weights = self.mha2(
            enc_output, enc_output, after_self, padding_mask
        )
        cross_attn = self.dropout2(cross_attn, training=training)
        after_cross = self.layernorm2(cross_attn + after_self)

        # 3) Position-wise feed-forward network.
        fed_forward = self.dropout3(self.ffn(after_cross), training=training)
        result = self.layernorm3(fed_forward + after_cross)
        return result, self_attn_weights, cross_attn_weights

# ---------------------------
# 7) Mask helper (sinh mask với shape broadcast được)
# ---------------------------
def create_padding_mask(seq):
    """Return a float mask that is 1.0 where ``seq`` equals 0, else 0.0.

    Two singleton axes are inserted after the first axis so the result,
    shaped (batch, 1, 1, seq_len), broadcasts against attention logits.
    """
    is_pad = tf.math.equal(seq, 0)
    return tf.cast(is_pad, tf.float32)[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a (1, 1, size, size) float mask with 1.0 strictly above the diagonal.

    Entry ``[i, j]`` is 1.0 when ``j > i`` and 0.0 otherwise.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    future_positions = tf.cast(1 - lower_triangle, tf.float32)
    return future_positions[tf.newaxis, tf.newaxis, :, :]

# ---------------------------
# 8) Transformer (encoder-decoder)
# ---------------------------
class Transformer(Model):
    """Encoder-decoder Transformer that returns unnormalized logits over the
    target vocabulary.

    Token id 0 is treated as padding when the attention masks are built.
    """

    def __init__(self, num_layers, d_model, num_heads, ff_dim,
                 input_vocab_size, target_vocab_size, max_len, dropout_rate=0.1):
        """Create embeddings, positional encoding, layer stacks and output head.

        Args:
            num_layers: Number of encoder layers and of decoder layers.
            d_model: Embedding / model feature size.
            num_heads: Attention heads per attention layer.
            ff_dim: Hidden size of each feed-forward network.
            input_vocab_size: Number of rows of the encoder embedding.
            target_vocab_size: Number of rows of the decoder embedding and
                size of the output logits.
            max_len: Number of positions in the positional-encoding table.
            dropout_rate: Rate used by every dropout layer.
        """
        super().__init__()
        self.d_model = d_model
        self.max_len = max_len

        self.encoder_embedding = Embedding(input_vocab_size, d_model)
        self.decoder_embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(max_len, d_model)

        self.encoder_layers = [EncoderLayer(d_model, num_heads, ff_dim, dropout_rate) for _ in range(num_layers)]
        self.decoder_layers = [DecoderLayer(d_model, num_heads, ff_dim, dropout_rate) for _ in range(num_layers)]
        self.dropout = Dropout(dropout_rate)
        self.final_layer = Dense(target_vocab_size)  # logits

    def call(self, inputs, training=False):
        """Run the encoder on the source ids and the decoder on the target ids.

        Args:
            inputs: Pair ``(input_seq, target_seq)`` of integer id tensors.
            training: Forwarded to every dropout layer.

        Returns:
            Logits produced by the final Dense layer, one vector per
            position of ``target_seq``.
        """
        input_seq, target_seq = inputs

        # The encoder self-attention and the decoder's attention over the
        # encoder output hide the same source padding, so one mask serves both.
        enc_padding_mask = create_padding_mask(input_seq)
        dec_padding_mask = enc_padding_mask

        # Decoder self-attention mask: a key position is hidden when it lies
        # in the future OR is padding. tf.maximum broadcasts (1,1,S,S) with
        # (batch,1,1,S) to (batch,1,S,S). The mask is identical for every
        # decoder layer, so it is built once here.
        look_ahead_mask = create_look_ahead_mask(tf.shape(target_seq)[1])
        dec_target_padding_mask = create_padding_mask(target_seq)
        combined_look_ahead_mask = tf.maximum(look_ahead_mask, dec_target_padding_mask)

        embedding_scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Encoder
        enc_emb = self.encoder_embedding(input_seq) * embedding_scale
        enc_emb = self.pos_encoding(enc_emb)
        enc_output = self.dropout(enc_emb, training=training)
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, enc_padding_mask, training=training)

        # Decoder
        dec_emb = self.decoder_embedding(target_seq) * embedding_scale
        dec_emb = self.pos_encoding(dec_emb)
        dec_output = self.dropout(dec_emb, training=training)
        for dec_layer in self.decoder_layers:
            dec_output, _, _ = dec_layer(
                dec_output, enc_output, combined_look_ahead_mask, dec_padding_mask,
                training=training,
            )

        return self.final_layer(dec_output)  # logits

# ---------------------------
# 9) Hàm decode (inference) dùng vectorizer
# ---------------------------
def decode_sequence(model, input_vectorizer, label_vectorizer, input_text, max_len=40):
    """Greedily decode a response for ``input_text``.

    The input is wrapped in ``[sos]`` / ``[eos]``, vectorized, and the model
    is called repeatedly; at each step the arg-max token of the last
    position is appended until ``[eos]`` is produced or the step limit is
    reached.

    Args:
        model: Callable model taking ``inputs=[input_seq, output_seq]`` and
            ``training`` and returning logits per output position.
        input_vectorizer: Vectorizer applied to the wrapped input text.
        label_vectorizer: Vectorizer whose vocabulary maps ids to tokens.
        input_text: Raw input sentence.
        max_len: Maximum number of decoding steps. When the model exposes a
            ``max_len`` attribute, the smaller of the two is used so the
            decoder input never outgrows the positional-encoding table.

    Returns:
        The decoded tokens joined by spaces, without the marker tokens.
    """
    vocab = label_vectorizer.get_vocabulary()
    try:
        start_token = vocab.index('[sos]')
        end_token = vocab.index('[eos]')
    except ValueError:
        # Best-effort fallback when the markers are missing from the vocabulary.
        start_token = 1
        end_token = 2

    # At step k the decoder input holds k + 1 tokens; more steps than the
    # model has positions would fail inside the positional encoding.
    model_limit = getattr(model, 'max_len', None)
    steps = max_len if model_limit is None else min(max_len, model_limit)

    input_seq = input_vectorizer([f"[sos] {input_text.strip()} [eos]"])
    input_seq = tf.cast(input_seq, tf.int32)

    output_seq = tf.expand_dims([start_token], 0)  # (1,1)
    for _ in range(steps):
        predictions = model(inputs=[input_seq, output_seq], training=False)  # (1, seq_len, vocab)
        predictions = predictions[:, -1:, :]  # keep only the newest position: (1,1,vocab)
        predicted_id = tf.argmax(predictions, axis=-1)
        pid = int(predicted_id.numpy()[0][0])
        if pid == end_token:
            break
        output_seq = tf.concat([output_seq, tf.cast(predicted_id, dtype=output_seq.dtype)], axis=-1)

    tokens = [vocab[i] if i < len(vocab) else '' for i in output_seq.numpy()[0]]
    text = ' '.join(tokens).replace('[sos]', '').replace('[eos]', '').strip()
    return text

# ---------------------------
# 10) Pipeline chính: nạp dữ liệu, vectorize, train
# ---------------------------
if __name__ == "__main__":
    # Script entry point: load the data, vectorize it, train the model and
    # print greedy decodes for a few sample sentences.

    # Paths to the data files (local)
    input_file = "input_texts.txt"
    label_file = "label_texts.txt"

    # Hyperparameters (kept small so the run is quick)
    max_samples = 5000
    max_length = 40
    d_model = 128
    num_heads = 4
    ff_dim = 512
    num_layers = 2
    dropout_rate = 0.1
    epochs = 4     # quick test: 4 epochs
    batch_size = 64

    # Load the data
    input_texts, label_texts = load_and_prepare_data(input_file, label_file, max_samples)

    # Build the vectorizers
    input_vectorizer = create_text_vectorizer(input_texts, max_tokens=10000, max_len=max_length)
    label_vectorizer = create_text_vectorizer(label_texts, max_tokens=10000, max_len=max_length)

    # Vectorize
    input_data = input_vectorizer(input_texts)
    input_data = tf.cast(input_data, tf.int32)
    label_data = label_vectorizer(label_texts)
    label_data = tf.cast(label_data, tf.int32)

    # Decoder input and target are the label ids shifted by one position
    label_input_data = label_data[:, :-1]
    label_target_data = label_data[:, 1:]

    # Vocabulary sizes
    input_vocab_size = len(input_vectorizer.get_vocabulary())
    target_vocab_size = len(label_vectorizer.get_vocabulary())
    print(f"Số mẫu: {input_data.shape[0]} | Vocab input: {input_vocab_size} | Vocab target: {target_vocab_size}")

    # Build the tf.data pipeline
    train_dataset = tf.data.Dataset.from_tensor_slices(((input_data, label_input_data), label_target_data))
    train_dataset = train_dataset.shuffle(1024).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Build the model
    transformer = Transformer(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        ff_dim=ff_dim,
        input_vocab_size=input_vocab_size,
        target_vocab_size=target_vocab_size,
        max_len=max_length,
        dropout_rate=dropout_rate
    )

    # Loss and metric: sparse categorical cross-entropy on logits, with
    # padding positions (id 0) excluded from the average.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def masked_loss(y_true, y_pred):
        """Mean cross-entropy over the positions where ``y_true`` is not 0."""
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        loss = loss_fn(y_true, y_pred)
        loss *= mask
        # The mask sum is a whole number, so clamping to 1 only changes the
        # all-padding case (0 / 0 -> NaN becomes 0 / 1 -> 0).
        return tf.reduce_sum(loss) / tf.maximum(tf.reduce_sum(mask), 1.0)

    def masked_accuracy(y_true, y_pred):
        """Fraction of non-padding positions whose arg-max id equals ``y_true``."""
        y_pred_ids = tf.argmax(y_pred, axis=-1, output_type=tf.int32)
        # Depending on the Keras version, the metric wrapper may hand y_true
        # over as a float tensor; tf.equal needs both operands in one dtype.
        y_true = tf.cast(y_true, tf.int32)
        matches = tf.cast(tf.equal(y_true, y_pred_ids), tf.float32)
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        matches *= mask
        return tf.reduce_sum(matches) / tf.maximum(tf.reduce_sum(mask), 1.0)

    transformer.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss=masked_loss,
        metrics=[masked_accuracy]
    )

    # Train
    transformer.fit(train_dataset, epochs=epochs)

    # Save the weights if desired:
    # transformer.save_weights("transformer_weights.h5")

    # Try inference on a few sentences
    test_sentences = ["hello", "how are you", "what is your name", "tell me a joke"]
    for s in test_sentences:
        print("Input:", s)
        # Decode with the same length limit the model was built with.
        resp = decode_sequence(transformer, input_vectorizer, label_vectorizer, s, max_len=max_length)
        print("Response:", resp)
        print("-"*40)


# Output: FileNotFoundError: [Errno 2] No such file or directory: 'input_texts.txt'