In [1]:
import re

words = []
phonemes = []

# Datei lesen
file_path = './cmudict.txt'

with open(file_path, 'r') as file:
    for line in file:
        # Trennen des Wortes und der Phoneme beim ersten Leerzeichen
        word, phoneme_string = line.strip().split(' ', 1)
        word = re.sub(r'\(.*?\)', '', word)
        
        # Überprüfen, ob das Wort unerwünschte Zeichen enthält
        if re.search(r'[\d_\-ÀÉ]', word):
            continue
        
        phoneme_list = phoneme_string.split()
        # Entfernen der Betonungszahlen aus den Phonemen
        cleaned_phonemes = [re.sub(r'\d', '', phoneme) for phoneme in phoneme_list]
        words.append(word)
        phonemes.append(' '.join(cleaned_phonemes).split(' '))



In [2]:
all_chars = list(set(''.join(words)))
all_phonemes = list(set(' '.join([' '.join(phoneme) for phoneme in phonemes]).split()))


In [3]:
def encode_phonemes(phoneme_list, encoder):
    # Encode each list of phonemes
    encoded_phonemes = [encoder.transform(phoneme) for phoneme in phoneme_list]
    return encoded_phonemes

In [4]:
from sklearn.preprocessing import LabelEncoder

# Tokenisierung der Wörter und Phoneme
word_encoder = LabelEncoder()
phoneme_encoder = LabelEncoder()

word_encoder.fit(all_chars)
phoneme_encoder.fit(all_phonemes)

# Anzahl der verschiedenen Zeichen und Phoneme
num_chars = len(word_encoder.classes_)
num_phonemes = len(phoneme_encoder.classes_)

In [5]:
encoded_words = [word_encoder.transform(list(word)) for word in words]

In [6]:
encoded_phonemes = encode_phonemes(phonemes, phoneme_encoder)


In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = max(max(len(seq) for seq in encoded_words), max(len(seq) for seq in encoded_phonemes))

X_padded = pad_sequences(encoded_words, maxlen=max_len, padding='post')
y_padded = pad_sequences(encoded_phonemes, maxlen=max_len, padding='post')

In [8]:
import tensorflow as tf

def masked_accuracy(y_true, y_pred):
   # Konvertiere Vorhersagen in Klassen
    y_pred_class = tf.argmax(y_pred, axis=-1)
    
    # Entferne die letzte Dimension von y_true
    y_true = tf.squeeze(y_true, -1)
    
    # Maskiere die gepolsterten Werte (Annahmen: 0 ist der gepolsterte Wert)
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    
    # Korrekte Vorhersagen
    matches = tf.cast(tf.equal(y_true, tf.cast(y_pred_class, y_true.dtype)), dtype=tf.float32)
    
    # Anzahl der korrekten Vorhersagen (ohne Padding)
    masked_matches = tf.reduce_sum(matches * mask)
    
    # Anzahl der gültigen Datenpunkte (ohne Padding)
    masked_count = tf.reduce_sum(mask)
    
    # Berechnung der Genauigkeit
    return masked_matches / masked_count

In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dot, Activation
from tensorflow.keras.layers import Attention

input_seq = Input(shape=(max_len,))
embedded_seq = Embedding(input_dim=num_chars, output_dim=64)(input_seq)
lstm_seq = Bidirectional(LSTM(256*2, return_sequences=True))(embedded_seq)

# Berechnung der Attention-Gewichte
attention_scores = Dot(axes=[2, 2])([lstm_seq, lstm_seq])
attention_weights = Activation('softmax')(attention_scores)

# Apply the attention weights to the LSTM output
context_vector = Dot(axes=[2, 1])([attention_weights, lstm_seq])

output_seq = TimeDistributed(Dense(num_phonemes, activation='softmax'))(context_vector)

model = Model(inputs=input_seq, outputs=output_seq)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=[masked_accuracy])
model.summary()


In [10]:
import numpy as np

X_paddeds = np.array(X_padded)
y_paddeds = np.expand_dims(np.array(y_padded), -1)

model.fit(X_paddeds, y_paddeds, epochs=2, batch_size=32, validation_split=0.2)

Epoch 1/2
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 46ms/step - loss: 0.4987 - masked_accuracy: 0.3482 - val_loss: 0.1887 - val_masked_accuracy: 0.7120
Epoch 2/2
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 47ms/step - loss: 0.1481 - masked_accuracy: 0.7823 - val_loss: 0.1451 - val_masked_accuracy: 0.7852


<keras.src.callbacks.history.History at 0x2763a2dd6f0>

In [11]:
# Vorhersage und Attention-Gewichte extrahieren
attention_extractor = Model(inputs=model.input, outputs=[output_seq, attention_weights])


In [22]:
test_phrase="once"

test_phrase=test_phrase.upper()
words = test_phrase.split(" ")

for word in words:
    print(word)
    word=[word]
    query = np.array(pad_sequences([word_encoder.transform(list(word[0]))], maxlen=max_len, padding='post'))
    #attention_weights = attention_extractor.predict(query)
    predictions, attention_weights = attention_extractor.predict(query)
    
    # Vorhersage der Phoneme
    prediction = model.predict(query)
    predicted_phonemes = np.argmax(prediction, axis=-1)
    phoneme_res = phoneme_encoder.inverse_transform(predicted_phonemes[0])
    
    # Analyse der Attention-Gewichte und Vorhersagen
    for i, (word, pred) in enumerate(zip(word, prediction)):
        print(f"Word: {word}")
        for j, phoneme_id in enumerate(np.argmax(pred, axis=-1)):
            if phoneme_id != 0:
                print(f"  Predicted Phoneme:  {phoneme_encoder.inverse_transform([phoneme_id])}")
                print("  Attention Weights:")
                for k, weight in enumerate(attention_weights[0][j]):
                    if k < len(word):
                        print(f"    Grapheme: {word[k]} -> Weight: {weight:.4f}")

ONCE
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Word: ONCE
  Predicted Phoneme:  ['N']
  Attention Weights:
    Grapheme: O -> Weight: 0.0000
    Grapheme: N -> Weight: 0.9999
    Grapheme: C -> Weight: 0.0001
    Grapheme: E -> Weight: 0.0000
  Predicted Phoneme:  ['S']
  Attention Weights:
    Grapheme: O -> Weight: 0.0000
    Grapheme: N -> Weight: 0.0000
    Grapheme: C -> Weight: 1.0000
    Grapheme: E -> Weight: 0.0000


In [159]:
# Modell speichern
model.save('word_to_phoneme_model_ATTENTION_10MB.keras')


In [9]:
import tensorflow as tf
from keras.layers import InputLayer, Embedding, LSTM, Bidirectional, Dot, Activation, TimeDistributed, Dense
from keras.optimizers import Adam


custom_objects = {'AttentionLayer': AttentionLayer}


model = tf.keras.models.load_model("word_to_phoneme_model_ATTENTION_10MB.keras", custom_objects=custom_objects)

TypeError: Error when deserializing class 'InputLayer' using config={'batch_shape': [None, 34], 'dtype': 'float32', 'sparse': False, 'name': 'input_layer_11'}.

Exception encountered: Unrecognized keyword arguments: ['batch_shape']