In [1]:
import re

words = []
phonemes = []

# Datei lesen
file_path = './cmudict.txt'

with open(file_path, 'r') as file:
    for line in file:
        # Trennen des Wortes und der Phoneme beim ersten Leerzeichen
        word, phoneme_string = line.strip().split(' ', 1)
        word = re.sub(r'\(.*?\)', '', word)
        
        # Überprüfen, ob das Wort unerwünschte Zeichen enthält
        if re.search(r'[\d_\-ÀÉ]', word):
            continue
        
        phoneme_list = phoneme_string.split()
        # Entfernen der Betonungszahlen aus den Phonemen
        cleaned_phonemes = [re.sub(r'\d', '', phoneme) for phoneme in phoneme_list]
        words.append(word)
        phonemes.append(' '.join(cleaned_phonemes).split(' '))



In [16]:
all_chars = list(set(''.join(words)))
all_phonemes = list(set(' '.join([' '.join(phoneme) for phoneme in phonemes]).split()))


In [21]:
def encode_phonemes(phoneme_list, encoder):
    # Encode each list of phonemes
    encoded_phonemes = [encoder.transform(phoneme) for phoneme in phoneme_list]
    return encoded_phonemes

In [18]:
from sklearn.preprocessing import LabelEncoder

# Tokenisierung der Wörter und Phoneme
word_encoder = LabelEncoder()
phoneme_encoder = LabelEncoder()

word_encoder.fit(all_chars)
phoneme_encoder.fit(all_phonemes)

# Anzahl der verschiedenen Zeichen und Phoneme
num_chars = len(word_encoder.classes_)
num_phonemes = len(phoneme_encoder.classes_)

In [20]:
encoded_words = [word_encoder.transform(list(word)) for word in words]

In [23]:
encoded_phonemes = encode_phonemes(phonemes, phoneme_encoder)


In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(max(len(seq) for seq in encoded_words), max(len(seq) for seq in encoded_phonemes))


In [30]:
X_padded = pad_sequences(encoded_words, maxlen=max_len, padding='post')
y_padded = pad_sequences(encoded_phonemes, maxlen=max_len, padding='post')

In [141]:
import tensorflow as tf

def masked_accuracy(y_true, y_pred):
   # Konvertiere Vorhersagen in Klassen
    y_pred_class = tf.argmax(y_pred, axis=-1)
    
    # Entferne die letzte Dimension von y_true
    y_true = tf.squeeze(y_true, -1)
    
    # Maskiere die gepolsterten Werte (Annahmen: 0 ist der gepolsterte Wert)
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    
    # Korrekte Vorhersagen
    matches = tf.cast(tf.equal(y_true, tf.cast(y_pred_class, y_true.dtype)), dtype=tf.float32)
    
    # Anzahl der korrekten Vorhersagen (ohne Padding)
    masked_matches = tf.reduce_sum(matches * mask)
    
    # Anzahl der gültigen Datenpunkte (ohne Padding)
    masked_count = tf.reduce_sum(mask)
    
    # Berechnung der Genauigkeit
    return masked_matches / masked_count

In [146]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dot, Activation
from tensorflow.keras.layers import Attention

input_seq = Input(shape=(max_len,))
embedded_seq = Embedding(input_dim=num_chars, output_dim=64)(input_seq)
lstm_seq = Bidirectional(LSTM(512, return_sequences=True))(embedded_seq)

# Berechnung der Attention-Gewichte
attention_scores = Dot(axes=[2, 2])([lstm_seq, lstm_seq])
attention_weights = Activation('softmax')(attention_scores)

# Apply the attention weights to the LSTM output
context_vector = Dot(axes=[2, 1])([attention_weights, lstm_seq])

output_seq = TimeDistributed(Dense(num_phonemes, activation='softmax'))(context_vector)

model = Model(inputs=input_seq, outputs=output_seq)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[masked_accuracy])
model.summary()


In [147]:
import numpy as np

X_paddeds = np.array(X_padded)
y_paddeds = np.expand_dims(np.array(y_padded), -1)

model.fit(X_paddeds, y_paddeds, epochs=4, batch_size=32, validation_split=0.2)


Epoch 1/4
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m396s[0m 118ms/step - loss: 0.4769 - masked_accuracy: 0.3700 - val_loss: 0.1747 - val_masked_accuracy: 0.7394
Epoch 2/4
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 117ms/step - loss: 0.1350 - masked_accuracy: 0.8024 - val_loss: 0.1404 - val_masked_accuracy: 0.8013
Epoch 3/4
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m395s[0m 118ms/step - loss: 0.1022 - masked_accuracy: 0.8481 - val_loss: 0.1295 - val_masked_accuracy: 0.8139
Epoch 4/4
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 118ms/step - loss: 0.0871 - masked_accuracy: 0.8693 - val_loss: 0.1284 - val_masked_accuracy: 0.8157


<keras.src.callbacks.history.History at 0x1f8c9e42e30>

In [148]:
# Vorhersage und Attention-Gewichte extrahieren
attention_extractor = Model(inputs=model.input, outputs=[output_seq, attention_weights])


In [158]:
word = ["ITWORKS"]

query = np.array(pad_sequences([word_encoder.transform(list(word[0]))], maxlen=max_len, padding='post'))
attention_weights = attention_layer_model.predict(query)

# Vorhersage der Phoneme
prediction = model.predict(query)
predicted_phonemes = np.argmax(prediction, axis=-1)
phoneme_res = phoneme_encoder.inverse_transform(predicted_phonemes[0])

# Analyse der Attention-Gewichte und Vorhersagen
for i, (word, pred) in enumerate(zip(word, prediction)):
    print(f"Word: {word}")
    for j, phoneme_id in enumerate(np.argmax(pred, axis=-1)):
        if phoneme_id != 0:
            print(f"  Predicted Phoneme:  {phoneme_encoder.inverse_transform([phoneme_id])}")
            print("  Attention Weights:")
            for k, weight in enumerate(attention_weights[0][j]):
                if k < len(word):
                    print(f"    Grapheme: {word[k]} -> Weight: {weight:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Word: ITWORKS
  Predicted Phoneme:  ['IH']
  Attention Weights:
    Grapheme: I -> Weight: 0.9997
    Grapheme: T -> Weight: 0.0003
    Grapheme: W -> Weight: 0.0000
    Grapheme: O -> Weight: 0.0000
    Grapheme: R -> Weight: 0.0000
    Grapheme: K -> Weight: 0.0000
    Grapheme: S -> Weight: 0.0000
  Predicted Phoneme:  ['T']
  Attention Weights:
    Grapheme: I -> Weight: 0.0000
    Grapheme: T -> Weight: 1.0000
    Grapheme: W -> Weight: 0.0000
    Grapheme: O -> Weight: 0.0000
    Grapheme: R -> Weight: 0.0000
    Grapheme: K -> Weight: 0.0000
    Grapheme: S -> Weight: 0.0000
  Predicted Phoneme:  ['W']
  Attention Weights:
    Grapheme: I -> Weight: 0.0000
    Grapheme: T -> Weight: 0.0000
    Grapheme: W -> Weight: 1.0000
    Grapheme: O -> Weight: 0.0000
    Grapheme: R -> Weight: 0.0000
    Grapheme: K -> Weight: 0.0000
    Grapheme: 

In [None]:

# Konvertiere die Vorhersagen zurück zu Phonemen
id_to_phoneme = {idx: phoneme for phoneme, idx in phoneme_to_id.items()}

# Analyse der Attention-Gewichte und Vorhersagen
for i, (word, pred) in enumerate(zip(words, predictions)):
    print(f"Word: {word}")
    for j, phoneme_id in enumerate(np.argmax(pred, axis=-1)):
        if phoneme_id != 0:
            print(f"  Predicted Phoneme: {id_to_phoneme[phoneme_id]}")
            print("  Attention Weights:")
            for k, weight in enumerate(attention_weights[i][j]):
                if k < len(word):
                    print(f"    Grapheme: {word[k]} -> Weight: {weight:.4f}")


In [106]:
predicted_phonemes[predicted_phonemes != 0]

array([37,  5, 14, 22, 22, 16], dtype=int64)

In [159]:
# Modell speichern
model.save('word_to_phoneme_model_ATTENTION_10MB.keras')
