In [64]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
from keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Activation, Dropout
from keras.layers import  Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import tensorflow as tf

In [74]:
# Datei lesen
file_path = './cmudict.txt'

# Listen für Wörter und Phoneme
words = []
phonemes = []

with open(file_path, 'r') as file:
    for line in file:
        # Trennen des Wortes und der Phoneme beim ersten Leerzeichen
        word, phoneme_string = line.strip().split(' ', 1)
        word = re.sub(r'\(.*?\)', '', word)
        
        # Überprüfen, ob das Wort unerwünschte Zeichen enthält
        if re.search(r'[\d_\-ÀÉ]', word):
            continue
        
        phoneme_list = phoneme_string.split()
        # Entfernen der Betonungszahlen aus den Phonemen
        cleaned_phonemes = [re.sub(r'\d', '', phoneme) for phoneme in phoneme_list]
        words.append(word)
        phonemes.append(' '.join(cleaned_phonemes))

 

In [75]:
# Tokenisierung der Wörter und Phoneme
word_encoder = LabelEncoder()
phoneme_encoder = LabelEncoder()

In [76]:
all_chars = list(set(''.join(words)))
all_phonemes = list(set(' '.join(phonemes).split()))

In [77]:
all_chars

['K',
 'Y',
 'W',
 'O',
 'T',
 'Q',
 'J',
 'H',
 'S',
 'Z',
 'U',
 "'",
 'A',
 'G',
 'P',
 'R',
 'X',
 '.',
 'B',
 'F',
 'V',
 'L',
 'N',
 'E',
 'D',
 'M',
 'C',
 'I']

In [78]:
word_encoder.fit(all_chars)
phoneme_encoder.fit(all_phonemes)

encoded_words = [word_encoder.transform(list(word)) for word in words]
encoded_phonemes = [phoneme_encoder.transform(phoneme.split()) for phoneme in phonemes]

In [95]:
# Anzahl der verschiedenen Zeichen und Phoneme
num_chars = len(word_encoder.classes_)
num_phonemes = len(phoneme_encoder.classes_)

In [96]:
# Padding der Sequenzen
max_word_length = max(len(word) for word in encoded_words)
max_phoneme_length = max(len(phoneme) for phoneme in encoded_phonemes)

In [97]:
# Padding der Sequenzen
max_length = max(max_word_length, max_phoneme_length)


In [98]:
# Woerter mit Padding versehen
padded_words = pad_sequences(encoded_words, maxlen=max_length, padding='post')
padded_phonemes = pad_sequences(encoded_phonemes, maxlen=max_length, padding='post')

In [99]:
# Modellaufbau
def create_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(word_encoder.classes_), output_dim=64))

    # Erste LSTM-Schicht mit Dropout
    model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
    
    # Zweite LSTM-Schicht mit Dropout
    model.add(Bidirectional(LSTM(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
    
    # Dense-Schicht mit L2-Regularisierung
    model.add(TimeDistributed(Dense(128, activation='relu', kernel_regularizer=l2(0.01))))
    
 
    # Ausgabe-Schicht
    model.add(TimeDistributed(Dense(len(phoneme_encoder.classes_))))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=[masked_accuracy])
    
    return model

model = create_model()

# Modellzusammenfassung
model.summary()

In [100]:
# Umwandeln der Labels für das Training aka verschachteln der werte in einzelne arrays
y = np.expand_dims(padded_phonemes, -1)

In [45]:
import tensorflow as tf
# Eager Execution aktivieren
tf.config.run_functions_eagerly(False)


In [17]:
def masked_accuracy(y_true, y_pred):
   # Konvertiere Vorhersagen in Klassen
    y_pred_class = tf.argmax(y_pred, axis=-1)
    
    # Entferne die letzte Dimension von y_true
    y_true = tf.squeeze(y_true, -1)
    
    # Maskiere die gepolsterten Werte (Annahmen: 0 ist der gepolsterte Wert)
    mask = tf.cast(tf.not_equal(y_true, 0), dtype=tf.float32)
    
    # Korrekte Vorhersagen
    matches = tf.cast(tf.equal(y_true, tf.cast(y_pred_class, y_true.dtype)), dtype=tf.float32)
    
    # Anzahl der korrekten Vorhersagen (ohne Padding)
    masked_matches = tf.reduce_sum(matches * mask)
    
    # Anzahl der gültigen Datenpunkte (ohne Padding)
    masked_count = tf.reduce_sum(mask)
    
    # Berechnung der Genauigkeit
    return masked_matches / masked_count

In [101]:
model.fit(padded_words, y, epochs=5, batch_size=32, validation_split=0.2)



Epoch 1/5
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m390s[0m 114ms/step - loss: 0.6761 - masked_accuracy: 0.2766 - val_loss: 0.2388 - val_masked_accuracy: 0.6946
Epoch 2/5
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 114ms/step - loss: 0.1950 - masked_accuracy: 0.7595 - val_loss: 0.1767 - val_masked_accuracy: 0.7924
Epoch 3/5
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 117ms/step - loss: 0.1502 - masked_accuracy: 0.8155 - val_loss: 0.1566 - val_masked_accuracy: 0.8089
Epoch 4/5
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m391s[0m 117ms/step - loss: 0.1314 - masked_accuracy: 0.8390 - val_loss: 0.1519 - val_masked_accuracy: 0.8224
Epoch 5/5
[1m3331/3331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 118ms/step - loss: 0.1215 - masked_accuracy: 0.8507 - val_loss: 0.1447 - val_masked_accuracy: 0.8242


<keras.src.callbacks.history.History at 0x2be51eeb8e0>

In [90]:
# Modell speichern
model.save('word_to_phoneme_model_nostress_new.keras')



  return saving_lib.save_model(model, filepath)


In [33]:
def predict_phonemes(word):
    word = re.sub(r'\(.*?\)', '', word)
    encoded_word = word_encoder.transform(list(word))
    padded_word = pad_sequences([encoded_word], max_length, padding="post")
    prediction = model.predict(padded_word)
    predicted_phonemes = np.argmax(prediction, axis=-1)
    return phoneme_encoder.inverse_transform(predicted_phonemes[0])

In [88]:
new_word = "ZEIGARNIK"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))
new_word = "MASSACHUSETTS"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))
new_word = "SEBASTIAN"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))
new_word = "FML"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))
new_word = "FUCK"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))
new_word = "EASYPIEZY"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step
Z AY G ER AA N IH AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
M AE S AH CH UW S IH Z S AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
S EY B AE S CH AH N AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
F AH L L IY EH AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
F AH K AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
IY Z IH P IY AH IY AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA A

In [31]:
model = tf.keras.models.load_model("word_to_phoneme_model_masking.keras")