In [None]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Embedding, Dense, Bidirectional, Attention, AdditiveAttention, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.saving import register_keras_serializable
from tensorflow.keras.mixed_precision import set_global_policy
import json
from sklearn.model_selection import train_test_split

In [None]:
%%time
# Report whether TensorFlow can see a GPU
gpu_devices = tf.config.list_physical_devices('GPU')
device_message = "✅ Rodando na GPU" if gpu_devices else "⚠️ Rodando na CPU"
print(device_message)


# Load the dataset and discard every row that has a missing value
df = pd.read_csv("/kaggle/input/database-v2/words_and_frases_database_v8.csv").dropna()


# Data preprocessing

# Special tokens that delimit the decoder sequences
special_tokens = ["<start>", "<end>"]

# Every distinct character that occurs in the "english" and "phonetic" columns
base_vocab = {
    symbol
    for column in ("english", "phonetic")
    for entry in df[column]
    for symbol in entry
}
# Characters in sorted order first, special tokens appended at the end
char_vocab = sorted(base_vocab) + special_tokens

# Index <-> character lookups; numbering starts at 1 because 0 is reserved for padding
idx2char = dict(enumerate(char_vocab, start=1))
char2idx = {char: idx for idx, char in idx2char.items()}


def text_to_seq(text):
    """Encode ``text`` character by character using ``char2idx`` (used for "english").

    Returns a list with one integer id per character. A character missing from
    ``char2idx`` raises ``KeyError``.
    """
    encoded = []
    for symbol in text:
        encoded.append(char2idx[symbol])
    return encoded

def text_to_seq_phonetic(text):
    """Encode ``text`` with ``char2idx`` and wrap it in the special tokens (used for "phonetic").

    The result starts with the id of ``"<start>"``, continues with one id per
    character of ``text`` and ends with the id of ``"<end>"``. A character
    missing from ``char2idx`` raises ``KeyError``.
    """
    encoded = [char2idx["<start>"]]
    encoded.extend(char2idx[symbol] for symbol in text)
    encoded.append(char2idx["<end>"])
    return encoded

# Encode both text columns as integer sequences
df["english_seq"] = df["english"].map(text_to_seq)
df["phonetic_seq"] = df["phonetic"].map(text_to_seq_phonetic)

# Teacher-forcing pair derived from the phonetic sequence:
# - decoder_input drops the last token
# - decoder_target drops the first token
df["decoder_input"] = df["phonetic_seq"].map(lambda tokens: tokens[:-1])
df["decoder_target"] = df["phonetic_seq"].map(lambda tokens: tokens[1:])

# Longest sequence on each side determines the padded width
max_length_encoder = df["english_seq"].map(len).max()
max_length_decoder = df["decoder_input"].map(len).max()


def _pad_post(sequences, width):
    """Pad ``sequences`` at the end up to ``width`` and return them as plain lists."""
    return pad_sequences(sequences, maxlen=width, padding="post").tolist()


df["english_seq"] = _pad_post(df["english_seq"], max_length_encoder)
df["decoder_input"] = _pad_post(df["decoder_input"], max_length_decoder)
df["decoder_target"] = _pad_post(df["decoder_target"], max_length_decoder)


# 1. Bucket the phrases by length
def categorizar_tamanho(frase):
    """Return the length bucket of ``frase`` based on its word count.

    Words are counted by splitting on whitespace.

    Args:
        frase: The phrase to classify.

    Returns:
        One of ``"Única palavra"`` (0-1 words), ``"Curta (2–3 palavras)"``,
        ``"Média (4–6 palavras)"`` or ``"Longa (7+ palavras)"``.
    """
    n = len(frase.split())
    # An empty or whitespace-only phrase has zero words. It belongs in the
    # smallest bucket rather than falling through to the "long" one.
    if n <= 1:
        return "Única palavra"
    if n <= 3:
        return "Curta (2–3 palavras)"
    if n <= 6:
        return "Média (4–6 palavras)"
    return "Longa (7+ palavras)"

df["categoria_tamanho"] = df["english"].map(categorizar_tamanho)

# 2. Stratified split: both parts keep the same mix of length buckets,
#    and each part gets a fresh 0..n-1 index.
train_data, test_data = (
    subset.reset_index(drop=True)
    for subset in train_test_split(
        df,
        test_size=0.2,
        stratify=df["categoria_tamanho"],
        random_state=40,
    )
)


# Build the tf.data.Datasets from the two splits
def _frame_to_dataset(frame):
    """Wrap one split as a dataset of ((encoder_input, decoder_input), decoder_target)."""
    model_inputs = (frame["english_seq"].tolist(), frame["decoder_input"].tolist())
    targets = frame["decoder_target"].tolist()
    return tf.data.Dataset.from_tensor_slices((model_inputs, targets))


# Same nesting as the dataset elements; every sequence axis is padded per batch
_padded_shapes = (([None], [None]), [None])

# Only the training split is shuffled
train_dataset = _frame_to_dataset(train_data).shuffle(10000)
train_dataset = train_dataset.padded_batch(256, padded_shapes=_padded_shapes)

test_dataset = _frame_to_dataset(test_data)
test_dataset = test_dataset.padded_batch(256, padded_shapes=_padded_shapes)

In [None]:

# Definição do Modelo Seq2Seq com Atenção e Teacher Forcing

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Attention
import tensorflow as tf

# Registered so that a saved ".keras" file can be loaded back without passing
# custom_objects: Keras resolves the class from its registry and rebuilds it
# through from_config().
@register_keras_serializable()
class Seq2Seq(Model):
    """Encoder-decoder model with attention, driven with teacher forcing.

    One embedding table is shared by the encoder and the decoder. The encoder
    is a bidirectional LSTM; its forward and backward final states are
    concatenated and used as the initial state of a decoder LSTM of width
    ``units * 2``. The decoder outputs attend over the encoder outputs, and the
    decoder output concatenated with the attention context is projected to a
    softmax over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output softmax.
        embedding_dim: Size of the embedding vectors.
        units: Width of each encoder direction (the decoder uses ``units * 2``).
    """

    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        # Kept so get_config() can serialise the constructor arguments.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.units = units

        # mask_zero=True: id 0 is treated as padding by the embedding.
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.encoder = Bidirectional(LSTM(units, return_sequences=True, return_state=True, dropout=0.4))
        # Twice the encoder width so it can take the concatenated forward/backward states.
        self.decoder = LSTM(units * 2, return_sequences=True, return_state=True, dropout=0.4)
        self.attention = Attention()
        self.fc = Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Pair ``(encoder_input, decoder_input)`` of integer id
                tensors in which 0 marks padding (assumed to be shaped
                (batch, time) - confirm against the caller).

        Returns:
            The softmax output of the final dense layer, one distribution over
            the vocabulary per decoder position.
        """
        encoder_input, decoder_input = inputs
        enc_emb = self.embedding(encoder_input)
        dec_emb = self.embedding(decoder_input)

        enc_output, forward_h, forward_c, backward_h, backward_c = self.encoder(enc_emb)
        # Merge both directions into a single state for the decoder.
        state_h = tf.concat([forward_h, backward_h], axis=-1)
        state_c = tf.concat([forward_c, backward_c], axis=-1)

        dec_output, _, _ = self.decoder(dec_emb, initial_state=[state_h, state_c])

        # Value mask for the attention layer: 1.0 where the encoder input is a
        # real token, 0.0 where it is padding, with an extra axis inserted at
        # position 1.
        # NOTE(review): Keras documents the value mask as a boolean tensor of
        # shape (batch, Tv); this pre-expanded float form depends on how the
        # installed Keras version applies the mask - re-check when upgrading.
        encoder_mask = tf.cast(tf.math.not_equal(encoder_input, 0), tf.float32)
        encoder_mask = tf.expand_dims(encoder_mask, axis=1)

        # Query = decoder outputs, value = encoder outputs; no query mask.
        context_vector = self.attention([dec_output, enc_output], mask=[None, encoder_mask])
        combined = tf.concat([dec_output, context_vector], axis=-1)
        output = self.fc(combined)
        return output

    def get_config(self):
        """Return the base config extended with the three constructor arguments."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "units": self.units
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from ``config``, using only the constructor arguments."""
        return cls(
            vocab_size=config["vocab_size"],
            embedding_dim=config["embedding_dim"],
            units=config["units"]
        )



# Configuration and training

embedding_dim = 256
units = 256
vocab_size = len(char2idx) + 1  # +1 for the padding id (index 0)


model = Seq2Seq(vocab_size, embedding_dim, units)
optimizer = AdamW(learning_rate=0.0005, weight_decay=1e-4)
# Id 0 only ever appears in the targets as padding (real characters start at 1).
# Without ignore_class the loss is averaged over padded positions too, which
# skews loss/val_loss and therefore early stopping and the LR schedule.
# Loss values are not comparable with runs that did not ignore padding.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, ignore_class=0)
model.compile(optimizer=optimizer, loss=loss_fn)
model.summary()


# Stop once val_loss has not improved for 3 epochs and keep the best weights;
# halve the learning rate after 2 epochs without improvement, down to 1e-5.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)


# Upper bound on epochs; early stopping is expected to end training sooner.
# NOTE(review): test_dataset is used as the validation set that drives both callbacks.
epochs = 100
model.fit(train_dataset,
          validation_data=test_dataset,
          callbacks=[early_stopping, lr_scheduler],
          epochs=epochs)

In [None]:
# Save the trained model to a ".keras" file (relative path).
# NOTE(review): loading this file back requires the custom Seq2Seq class to be
# known to Keras (registered as serializable or passed via custom_objects).
model.save("phonetic_transcriber_20.keras")