In [1]:
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import re
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Activation, Bidirectional
from keras.layers import Masking


In [2]:
# Datei lesen
file_path = './cmudict.txt'

# Listen für Wörter und Phoneme
words = []
phonemes = []

with open(file_path, 'r') as file:
    for line in file:
        # Trennen des Wortes und der Phoneme beim ersten Leerzeichen
        word, phoneme_string = line.strip().split(' ', 1)
        word = re.sub(r'\(.*?\)', '', word)
        words.append(word)
        phonemes.append(phoneme_string[1:])

 

In [3]:
# Tokenisierung der Wörter und Phoneme
word_encoder = LabelEncoder()
phoneme_encoder = LabelEncoder()

In [4]:
all_chars = list(set(''.join(words)))
all_phonemes = list(set(' '.join(phonemes).split()))

In [9]:
word_encoder.fit(all_chars)
phoneme_encoder.fit(all_phonemes)

encoded_words = [word_encoder.transform(list(word)) for word in words]
encoded_phonemes = [phoneme_encoder.transform(phoneme.split()) for phoneme in phonemes]

In [10]:
# Anzahl der verschiedenen Zeichen und Phoneme
num_chars = len(word_encoder.classes_)
num_phonemes = len(phoneme_encoder.classes_)

In [11]:
# Padding der Sequenzen
max_word_length = max([len(word) for word in encoded_words])
max_phoneme_length = max([len(phoneme) for phoneme in encoded_phonemes])

In [12]:
# Padding der Sequenzen
max_length = max(max_word_length, max_phoneme_length)


In [13]:
# Woerter mit Padding versehen
padded_words = pad_sequences(encoded_words, maxlen=max_length, padding='post')
padded_phonemes = pad_sequences(encoded_phonemes, maxlen=max_length, padding='post')

In [32]:
# Modellaufbau mit Masking Layer
model = Sequential()
model.add(Embedding(input_dim=len(word_encoder.classes_), output_dim=64, input_length=max_word_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(TimeDistributed(Dense(len(phoneme_encoder.classes_))))
model.add(Activation('softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=["accuracy"])

In [33]:
# Umwandeln der Labels für das Training aka verschachteln der werte in einzelne arrays
y = np.expand_dims(padded_phonemes, -1)

In [34]:
model.fit(padded_words, y, epochs=3, batch_size=32, validation_split=0.2)


Epoch 1/3
[1m 199/3358[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:06[0m 21ms/step - accuracy: 0.7930 - loss: 1.2054

KeyboardInterrupt: 

In [52]:
# Modell speichern
model.save('word_to_phoneme_model2.keras')


In [74]:
def predict_phonemes(word):
    word = re.sub(r'\(.*?\)', '', word)
    encoded_word = word_encoder.transform(list(word))
    padded_word = pad_sequences([encoded_word], max_length, padding="post")
    prediction = model.predict(padded_word)
    predicted_phonemes = np.argmax(prediction, axis=-1)
    return phoneme_encoder.inverse_transform(predicted_phonemes[0])

In [106]:
new_word = "MASSACHUSETTS"
predicted_phonemes = predict_phonemes(new_word)
print(" ".join(predicted_phonemes))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
M AE2 S AH0 CH UW1 S AH0 T AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0 AA0
