In [226]:
import tensorflow as tf
import json
import pandas as pd
import numpy as np
import unicodedata
import regex
from rouge_score import rouge_scorer
from tensorflow import keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Bidirectional, Attention
from tensorflow.keras.preprocessing.sequence import pad_sequences
import editdistance
from IPython.display import display, HTML
import string
import eng_to_ipa as ipa
import pronouncing

In [3]:
# Vocabulary and model loading

# The JSON file holds the character <-> id mappings under "char2idx" / "idx2char".
with open("model20/char_mappings_20.json", mode="r", encoding="utf-8") as f:
    metadata = json.load(f)

# JSON object keys are always strings, so only idx2char needs its keys cast back to int.
char2idx = dict(metadata["char2idx"])
idx2char = {int(key): char for key, char in metadata["idx2char"].items()}

class Seq2Seq(Model):
    """Encoder-decoder Keras model with attention over the encoder outputs.

    One embedding layer is shared by both sides. The encoder is a bidirectional
    LSTM; the decoder is a single LSTM initialised with the concatenated
    forward/backward encoder states. Decoder outputs attend over the encoder
    outputs and the result is projected to a softmax over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, units):
        """Create the layers.

        Args:
            vocab_size: Size of the token vocabulary (embedding rows and output classes).
            embedding_dim: Dimension of the shared embedding.
            units: Hidden size of each encoder LSTM direction; the decoder uses ``units * 2``.
        """
        super().__init__()
        # Kept as attributes so get_config() can serialise the constructor arguments.
        self.vocab_size = vocab_size  
        self.embedding_dim = embedding_dim
        self.units = units

        # mask_zero=True: token id 0 is reserved for padding.
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.encoder = Bidirectional(LSTM(units, return_sequences=True, return_state=True, dropout=0.4))
        # units * 2 so the decoder state matches the concatenated forward+backward encoder states.
        self.decoder = LSTM(units * 2, return_sequences=True, return_state=True, dropout=0.4)
        self.attention = Attention()
        self.fc = Dense(vocab_size, activation='softmax')

    def call(self, inputs):
        """Run the encoder and the decoder and return per-step vocabulary probabilities.

        Args:
            inputs: A pair ``(encoder_input, decoder_input)`` of token-id tensors
                (presumably shaped (batch, time) — TODO confirm against the training code).

        Returns:
            The output of the final softmax ``Dense`` layer, one distribution over
            ``vocab_size`` per decoder time step.
        """
        encoder_input, decoder_input = inputs
        enc_emb = self.embedding(encoder_input)
        dec_emb = self.embedding(decoder_input)

        # The bidirectional wrapper returns the sequence output plus (h, c) for each direction.
        enc_output, forward_h, forward_c, backward_h, backward_c = self.encoder(enc_emb)
        state_h = tf.concat([forward_h, backward_h], axis=-1)
        state_c = tf.concat([forward_c, backward_c], axis=-1)

        dec_output, _, _ = self.decoder(dec_emb, initial_state=[state_h, state_c])

        # Explicit padding mask for the encoder positions (1.0 where the token id is not 0),
        # with an extra axis inserted at position 1 before it is handed to the attention layer.
        encoder_mask = tf.cast(tf.math.not_equal(encoder_input, 0), tf.float32)
        encoder_mask = tf.expand_dims(encoder_mask, axis=1)

        # Query = decoder outputs, value = encoder outputs; only the value side is masked.
        context_vector = self.attention([dec_output, enc_output], mask=[None, encoder_mask])
        combined = tf.concat([dec_output, context_vector], axis=-1)
        output = self.fc(combined)
        return output

    def get_config(self):
        """Return the base config extended with the three constructor arguments."""
        config = super().get_config()
        config.update({
            "vocab_size": self.vocab_size,
            "embedding_dim": self.embedding_dim,
            "units": self.units
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the model from a config dict, reading only the constructor arguments."""
        return cls(
            vocab_size=config["vocab_size"],
            embedding_dim=config["embedding_dim"],
            units=config["units"]
        )

# The custom subclass must be registered so Keras can rebuild it from the archive;
# compile=False because the model is loaded without compiling it.
model = keras.models.load_model(
    "model20/phonetic_transcriber_20.keras",
    custom_objects={"Seq2Seq": Seq2Seq},
    compile=False,
)




In [256]:
#  Funções suporte

# Maps a combining diacritic (single Unicode code point) to the CSS colour name
# used when highlighting the base letter that carries it.
DIACRITICOS_CUSTOMIZADOS = {
    "\u0317": "red",    # U+0317 COMBINING ACUTE ACCENT BELOW
    "\u0333": "gray",   # U+0333 COMBINING DOUBLE LOW LINE
    "\u0330": "blue",   # U+0330 COMBINING TILDE BELOW
    "\u031C": "green",  # U+031C COMBINING LEFT HALF RING BELOW
}

def letter_coloring(text):
    """Render `text` as HTML, colouring letters that carry a custom diacritic.

    The text is split into grapheme clusters (regex ``\\X``). Each cluster is
    NFD-decomposed; if any code point after the first is a key of
    ``DIACRITICOS_CUSTOMIZADOS``, only the first code point is emitted, wrapped
    in a bold ``<span>`` with the mapped colour (every combining mark of that
    cluster is dropped). Clusters without a custom diacritic are emitted as-is.

    Returns:
        The concatenated HTML string.
    """
    pieces = []
    for cluster in regex.findall(r"\X", text):
        nfd = unicodedata.normalize("NFD", cluster)
        # The first matching mark wins when a cluster carries several custom diacritics.
        marker = next((mark for mark in nfd[1:] if mark in DIACRITICOS_CUSTOMIZADOS), None)
        if not marker:
            pieces.append(cluster)
            continue
        color = DIACRITICOS_CUSTOMIZADOS[marker]
        base = nfd[0]
        pieces.append(f'<span style="color:{color}; font-weight:bold">{base}</span>')
    return ''.join(pieces)

def cer(reference, hypothesis):
    """Character error rate: edit distance divided by the reference length.

    For an empty reference the result is 0.0 when the hypothesis is empty too,
    and 1.0 otherwise (division by zero is avoided).
    """
    ref_len = len(reference)
    if ref_len:
        return editdistance.eval(reference, hypothesis) / ref_len
    return 0.0 if len(hypothesis) == 0 else 1.0

def calculate_rouge(reference, prediction):
    """Score `prediction` against `reference` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Returns whatever ``RougeScorer.score`` returns, keyed by "rouge1", "rouge2"
    and "rougeL".
    """
    # NOTE(review): the default rouge_score tokenizer is believed to lowercase and
    # keep only ASCII alphanumerics, and stemming is enabled here — confirm these
    # scores are meaningful for phonetic strings with non-ASCII symbols/diacritics.
    rouge_types = ["rouge1", "rouge2", "rougeL"]
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, prediction)

def text_to_seq(text):
    """Convert `text` to a list of ids by looking up each character in `char2idx`.

    Raises:
        KeyError: If a character is not present in `char2idx`.
    """
    indices = []
    for char in text:
        indices.append(char2idx[char])
    return indices

def text_to_seq_phonetic(text):
    """Convert `text` to ids wrapped in the "<start>" and "<end>" token ids.

    Raises:
        KeyError: If a character (or a special token) is not present in `char2idx`.
    """
    sequence = [char2idx["<start>"]]
    sequence.extend(char2idx[char] for char in text)
    sequence.append(char2idx["<end>"])
    return sequence

def beam_search_decode(model, input_seq, beam_width=5, max_length_decoder=150):
    """Decode one input sequence with beam search and return the best token ids.

    The input is encoded once; the decoder is then advanced one token at a time
    for each live beam, feeding back the previously chosen token and the
    decoder LSTM state.

    Args:
        model: Object exposing ``embedding``, ``encoder``, ``decoder``,
            ``attention`` and ``fc`` layers (as ``Seq2Seq`` does).
        input_seq: Padded token-id array for a single example; only batch
            element 0 of the prediction is read, and id 0 is treated as padding.
        beam_width: Number of hypotheses kept after each step (must be >= 1).
        max_length_decoder: Maximum number of decoding steps.

    Returns:
        list[int]: Token ids of the highest-scoring hypothesis. The "<start>"
        id is never included; the "<end>" id is included as the last element
        when the hypothesis finished within ``max_length_decoder`` steps.

    Raises:
        ValueError: If ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError(f"beam_width must be >= 1, got {beam_width}")

    start_token = char2idx["<start>"]
    end_token = char2idx["<end>"]

    enc_emb = model.embedding(input_seq)
    enc_output, forward_h, forward_c, backward_h, backward_c = model.encoder(enc_emb)
    state_h = tf.concat([forward_h, backward_h], axis=-1)
    state_c = tf.concat([forward_c, backward_c], axis=-1)

    # The padding mask depends only on the input, so build it once instead of
    # once per beam per step. Same construction as in Seq2Seq.call.
    encoder_mask = tf.cast(tf.math.not_equal(input_seq, 0), tf.float32)
    encoder_mask = tf.expand_dims(encoder_mask, axis=1)

    # Each beam is (token ids, cumulative log-probability, decoder h, decoder c).
    # Log-probabilities are summed instead of multiplying raw probabilities:
    # the ranking is the same, but long hypotheses no longer underflow to 0.
    sequences = [([], 0.0, state_h, state_c)]

    for _ in range(max_length_decoder):
        # Once every beam has emitted "<end>" further steps cannot change anything.
        if all(beam[0] and beam[0][-1] == end_token for beam in sequences):
            break

        all_candidates = []
        for seq, score, h, c in sequences:
            if seq and seq[-1] == end_token:
                # Finished hypotheses are carried over unchanged.
                all_candidates.append((seq, score, h, c))
                continue

            decoder_input = np.array([[seq[-1]]] if seq else [[start_token]])
            dec_emb = model.embedding(decoder_input)
            dec_output, new_h, new_c = model.decoder(dec_emb, initial_state=[h, c])

            context_vector = model.attention([dec_output, enc_output], mask=[None, encoder_mask])

            combined = tf.concat([dec_output, context_vector], axis=-1)
            pred = model.fc(combined).numpy()[0, -1, :]

            top_k_idx = np.argsort(pred)[-beam_width:]
            # A probability of exactly 0 maps to -inf, which simply sorts last.
            with np.errstate(divide="ignore"):
                top_k_log_probs = np.log(pred[top_k_idx].astype(np.float64))
            for idx, log_prob in zip(top_k_idx, top_k_log_probs):
                # Plain ints keep the result independent of NumPy scalar types.
                all_candidates.append((seq + [int(idx)], score + float(log_prob), new_h, new_c))

        sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]

    best_seq = sequences[0][0]  # list of token ids
    return best_seq

In [254]:
#  Avaliação de Métricas

def evaluate_transcription(model, text, expected_transcription, beam_width=5, max_length_decoder=150,
                           max_input_len=100):
    """Transcribe `text` with the model and score it against the expected output.

    Args:
        model: Model passed through to ``beam_search_decode``.
        text: Input string; every character must be a key of ``char2idx``.
        expected_transcription: Reference transcription.
        beam_width: Beam width passed to ``beam_search_decode``.
        max_length_decoder: Maximum decoding steps passed to ``beam_search_decode``.
        max_input_len: Length the encoded input is padded/truncated to
            (defaults to 100, the value that used to be hard-coded).

    Returns:
        dict with the input, predicted and expected strings, ROUGE-1/2/L F1,
        CER and exact-match accuracy (1.0 or 0.0).
    """
    # NOTE: pad_sequences truncates from the *front* by default (truncating="pre"),
    # so inputs longer than max_input_len silently lose their first characters.
    input_seq = pad_sequences([text_to_seq(text)], maxlen=max_input_len, padding="post")
    best_seq = beam_search_decode(model, input_seq, beam_width, max_length_decoder)

    # Built once instead of once per decoded id.
    special_token_ids = {char2idx["<start>"], char2idx["<end>"]}
    predicted_transcription = "".join(
        idx2char.get(idx, "?") for idx in best_seq if idx not in special_token_ids
    )

    rouge_scores = calculate_rouge(expected_transcription, predicted_transcription)
    cer_score = cer(expected_transcription, predicted_transcription)
    accuracy = 1.0 if predicted_transcription == expected_transcription else 0.0

    return {
        "Entrada": text,
        "Saída Prevista": predicted_transcription,
        "Saída Esperada": expected_transcription,
        "ROUGE-1 F1": rouge_scores["rouge1"].fmeasure,
        "ROUGE-2 F1": rouge_scores["rouge2"].fmeasure,
        "ROUGE-L F1": rouge_scores["rougeL"].fmeasure,
        "CER": cer_score,
        "Acurácia": accuracy
    }

In [113]:
#  Avaliando a Base

# Load the dataset and discard every row that has a missing value.
df = pd.read_csv("words_and_frases_database_v8.csv").dropna()

def categorizar_tamanho(frase):
    """Return the length-category label for `frase` based on its word count.

    Words are whitespace-separated tokens. One word, 2-3 words and 4-6 words
    each get their own label; every other count (7 or more, and also zero
    words for an empty/blank string) gets the "Longa (7+ palavras)" label.
    """
    total_palavras = len(frase.split())
    if total_palavras == 1:
        return "Única palavra"
    if total_palavras in (2, 3):
        return "Curta (2–3 palavras)"
    if total_palavras in (4, 5, 6):
        return "Média (4–6 palavras)"
    return "Longa (7+ palavras)"

# Tag every row with its length category.
df["categoria_tamanho"] = df["english"].apply(categorizar_tamanho)

n_amostras_por_categoria = 25


def _amostrar_categoria(grupo):
    """Draw the fixed-size, fixed-seed sample for one category group."""
    return grupo.sample(n=n_amostras_por_categoria, random_state=42)


# Stratified sample: the same number of rows from each length category.
amostra = df.groupby("categoria_tamanho", group_keys=False).apply(_amostrar_categoria)

# Normalised phrase -> reference transcription (a repeated phrase keeps its last reference).
expected_transcriptions = dict(
    (phrase.strip().lower(), phonetic.strip())
    for phrase, phonetic in zip(amostra["english"], amostra["phonetic"])
)

results = []
for phrase, expected_transcription in expected_transcriptions.items():
    metrics = evaluate_transcription(model, phrase, expected_transcription)
    results.append(metrics)

rouge_df = pd.DataFrame(results)
metric_columns = ["ROUGE-1 F1", "ROUGE-2 F1", "ROUGE-L F1", "CER", "Acurácia"]
rouge_df[metric_columns].mean()

ROUGE-1 F1    0.985238
ROUGE-2 F1    0.946667
ROUGE-L F1    0.985238
CER           0.011069
Acurácia      0.910000
dtype: float64

In [115]:
#  Avaliação por Tamanho de Frase

def avaliar_acuracia_por_tamanho(model, df, beam_width=5, max_length_decoder=50,
                                 n_amostras=100, random_state=42):
    """Compute mean exact-match accuracy for a random sample of each length category.

    Args:
        model: Model passed through to ``evaluate_transcription``.
        df: DataFrame with "english", "phonetic" and "categoria_tamanho" columns.
        beam_width: Beam width used for decoding.
        max_length_decoder: Maximum decoding steps. NOTE: the default here (50)
            differs from ``evaluate_transcription``'s own default (150).
        n_amostras: Rows sampled per category (defaults to 100, the value that
            used to be hard-coded).
        random_state: Seed used for every category sample (defaults to 42).

    Returns:
        dict mapping each category label to its mean accuracy rounded to 4 decimals.

    Raises:
        KeyError: If ``df`` has no "categoria_tamanho" column.
        ValueError: Raised by ``DataFrame.sample`` when a category has fewer
            than ``n_amostras`` rows.
    """
    if "categoria_tamanho" not in df.columns:
        raise KeyError(
            "df must contain a 'categoria_tamanho' column "
            "(e.g. df['english'].apply(categorizar_tamanho))"
        )

    def avaliar_amostra(amostra):
        """Mean accuracy over the rows of one sample."""
        resultados = [
            evaluate_transcription(
                model, frase.strip().lower(), esperado.strip(),
                beam_width=beam_width, max_length_decoder=max_length_decoder
            )["Acurácia"]
            for frase, esperado in zip(amostra["english"], amostra["phonetic"])
        ]
        return round(np.mean(resultados), 4)

    # Output label -> value stored in the "categoria_tamanho" column.
    categorias = {
        "Única palavra": "Única palavra",
        "Curta (2-3)": "Curta (2–3 palavras)",
        "Média (4-6)": "Média (4–6 palavras)",
        "Longa (7+)": "Longa (7+ palavras)",
    }

    # Draw every sample before evaluating so an undersized category fails fast,
    # before any (slow) decoding has been done.
    faixas = {
        faixa: df[df["categoria_tamanho"] == categoria].sample(n=n_amostras, random_state=random_state)
        for faixa, categoria in categorias.items()
    }

    return {faixa: avaliar_amostra(amostra) for faixa, amostra in faixas.items()}

# Run the per-length evaluation; the bare name on the last line displays the result.
resultados_por_tamanho = avaliar_acuracia_por_tamanho(model, df)
resultados_por_tamanho

{'Única palavra': 0.86,
 'Curta (2-3)': 0.89,
 'Média (4-6)': 0.89,
 'Longa (7+)': 0.96}

In [208]:
def substituir_apostrofo_para_inferencia(texto):
    """Replace the ASCII apostrophe with the acute accent before inference.

    Every ' (U+0027) becomes ´ (U+00B4), the symbol the model's character
    mapping is said to contain (index 48 according to the original note —
    verify against the mapping file). ``None`` is returned unchanged.
    """
    return None if texto is None else texto.replace("'", "´")


In [249]:
def frase_para_arpabet(frase):
    """Transcribe `frase` word by word using ``pronouncing.phones_for_word``.

    Each whitespace-separated word is lower-cased and looked up; the first
    pronunciation returned is used, or "[UNK]" when the lookup returns nothing.
    The per-word results are joined with " - ".
    """
    transcricoes = []
    for token in frase.split():
        candidatos = pronouncing.phones_for_word(token.lower())
        transcricoes.append(candidatos[0] if candidatos else "[UNK]")
    return ' - '.join(transcricoes)

In [272]:
test_phrases = [
    # Short (2-3 words)
    "think fast",                        # /θ/, strong T
    "red bus",                           # /ɹ/
    "what's next",                       # aspirated T, flap T

    # Medium (4–7 words)
    "take the first right",              # /tʰ/, /ð/, /ɹ/
    "where's the nearest pharmacy",      # /ɹ/, /ð/, nasal
    "please turn on the lights",         # /ɹ/, attenuation
    "this is my third attempt",          # /θ/, /ð/, final nasal
    "he's thinking about leaving soon",  # /θ/, final nasal

    # Long (8+ words)
    "could you tell me where the station is",          # /tʰ/, /ð/, /ɹ/, attenuation
    "i thought that taking the shortcut was smarter",  # /θ/, /ð/, /ɹ/, aspirated T
    "they're planning to throw a surprise party soon", # /θ/, /ɹ/, nasal
    "what time should we meet at the cafe today",       # /tʰ/, /ɹ/, /ð/, final nasal
    "the train arrives earlier than expected sometimes" # /tʰ/, /ɹ/, nasal
]

# "<start>"/"<end>" are control tokens, not part of the transcription; they are
# filtered out here the same way evaluate_transcription does it.
special_token_ids = {char2idx["<start>"], char2idx["<end>"]}

test_results = []

for frase in test_phrases:
    # The model's vocabulary uses ´ instead of ', so normalise before encoding.
    entrada_preparada = substituir_apostrofo_para_inferencia(frase)
    input_seq = text_to_seq(entrada_preparada)
    input_seq = pad_sequences([input_seq], maxlen=100, padding='post')

    predicted_indices = beam_search_decode(model, input_seq, beam_width=5)

    raw_transcription = ''.join(
        idx2char.get(idx, '') for idx in predicted_indices if idx not in special_token_ids
    )

    transcription_colored = letter_coloring(raw_transcription)

    display(HTML(f"<b>Entrada do Modelo:</b> {frase}"))
    display(HTML(f"<b>Saída do Modelo:</b> {transcription_colored}"))
    display(HTML(f"<b>Transcrição fonética IPA:</b> {ipa.convert(frase)}"))
    display(HTML(f"<b>Transcrição fonética ARPAbet:</b> {frase_para_arpabet(frase)}"))
    print('')

    test_results.append({
        "Frase ": frase,
        # Fixed: this field used to store the prepared *input* instead of the model output.
        "Saída Prevista (Raw)": raw_transcription,
        "Saída Prevista (Colorida)": transcription_colored
    })






































