In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, LSTM, RepeatVector, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Names of the CSV columns holding the spellings and their transcriptions
words_column = 'words'
phonetic_column = 'IPA'

# Load the data.
# keep_default_na=False stops pandas from treating ordinary vocabulary entries
# such as "nan", "null", "NA" or "None" as missing values; empty cells are read
# as empty strings instead of NaN.
data = pd.read_csv('orthodata.csv', keep_default_na=False)

# Convert any remaining missing values to empty strings and force a str dtype.
# fillna('') must come first: a bare astype(str) would turn NaN into the
# literal text 'nan', which would then be tokenized as a real word.
data[words_column] = data[words_column].fillna('').astype(str)
data[phonetic_column] = data[phonetic_column].fillna('').astype(str)

# Row-aligned Series of spellings and phonetic transcriptions
words = data[words_column]
phonetic_forms = data[phonetic_column]

# Collect every distinct character that occurs in either column
combined_text = ' '.join(words) + ' '.join(phonetic_forms)
all_characters = sorted(set(combined_text))

# Character-level Keras tokenizer, fitted on the character inventory
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(all_characters)

# Encode the inputs (phonetic forms) and targets (spellings) as integer sequences
X_phonetic_sequences = tokenizer.texts_to_sequences(phonetic_forms)
Y_orthographic_sequences = tokenizer.texts_to_sequences(words)

# The longest sequence on either side sets the common padded length
max_sequence_length = max(
    map(len, X_phonetic_sequences + Y_orthographic_sequences)
)

# Pad both sides at the end ('post') up to that shared length
X_phonetic_padded, Y_orthographic_padded = (
    pad_sequences(seqs, maxlen=max_sequence_length, padding='post')
    for seqs in (X_phonetic_sequences, Y_orthographic_sequences)
)

# Hold out 20% of the pairs for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_phonetic_padded,
    Y_orthographic_padded,
    test_size=0.2,
    random_state=42,
)

# Build the seq2seq model: a two-layer LSTM encoder whose final state
# initialises a two-layer LSTM decoder, followed by a per-timestep softmax
# over the character vocabulary.
latent_dim = 256  # Width of the embeddings and of every LSTM layer
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is used for padding

# Encoder: reads the whole input sequence and summarises it in its final
# hidden/cell state.
encoder_inputs = Input(shape=(max_sequence_length,))
encoder_embedding = Embedding(vocab_size, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(encoder_embedding)
encoder_lstm, encoder_state_h, encoder_state_c = LSTM(
    latent_dim, dropout=0.2, recurrent_dropout=0.2, return_state=True
)(encoder_lstm)

# Decoder: starts from the encoder's final state. Without initial_state the
# encoder branch is not connected to the output at all, so it would be dead
# weight and the model would only ever see decoder_inputs.
decoder_inputs = Input(shape=(max_sequence_length,))
decoder_embedding = Embedding(vocab_size, latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(
    decoder_embedding, initial_state=[encoder_state_h, encoder_state_c]
)
decoder_lstm = LSTM(latent_dim, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(decoder_lstm)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_lstm)
autoencoder = Model([encoder_inputs, decoder_inputs], decoder_dense)

# `learning_rate` replaces the deprecated `lr` argument. The accuracy metric is
# requested explicitly: without it evaluate() returns only the loss.
autoencoder.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model (the same padded phonetic sequence feeds both inputs)
autoencoder.fit([X_train, X_train], Y_train, epochs=20, batch_size=1024, validation_data=([X_test, X_test], Y_test))

# Evaluate the model. evaluate() returns [loss, accuracy] in compile order.
# NOTE: the accuracy is per timestep and includes padded positions, so it is
# not a whole-word accuracy.
loss, accuracy = autoencoder.evaluate([X_test, X_test], Y_test)
print(f'Loss: {loss:.4f}')
print(f'Accuracy: {accuracy * 100:.2f}%')


# Decode five randomly selected test samples with the trained model
for _ in range(5):
    sample_index = np.random.randint(0, X_test.shape[0])
    # Slicing (instead of indexing) keeps a leading batch axis of size 1
    sample_batch = X_test[sample_index:sample_index + 1]
    char_probabilities = autoencoder.predict([sample_batch, sample_batch])

    # Most likely token id at each timestep; id 0 is skipped when decoding
    predicted_ids = char_probabilities.argmax(axis=2)[0]
    generated_word = ''
    for token_id in predicted_ids:
        if token_id != 0:
            generated_word += tokenizer.index_word[token_id]

    print('Generated Word:', generated_word)




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 39.14%
Generated Word: wagg
Generated Word: contentiddl
Generated Word: pallddin
Generated Word: acuitt
Generated Word: hartbrooonn


In [None]:

# Decode five random test samples, keeping each model input next to its prediction
original_words = []
converted_words = []


def _ids_to_chars(token_ids):
    """Map token ids to their characters, skipping id 0."""
    return [tokenizer.index_word[i] for i in token_ids if i != 0]


for _ in range(5):
    random_index = np.random.randint(0, X_test.shape[0])
    source_ids = X_test[random_index]
    model_input = source_ids[np.newaxis, :]  # Add batch dimension
    prediction = autoencoder.predict([model_input, model_input])

    # Most likely token id at each timestep of the single sample in the batch
    predicted_ids = np.argmax(prediction, axis=2)[0]

    # The input is shown with its characters spaced out; the prediction is joined
    original_word = ' '.join(_ids_to_chars(source_ids))
    converted_word = ''.join(_ids_to_chars(predicted_ids))

    original_words.append(original_word)
    converted_words.append(converted_word)

    print(f'Original Word: {original_word} | Converted Word: {converted_word}')

# Summary of all sampled inputs and their decoded outputs
print('Original Words:', original_words)
print('Converted Words:', converted_words)


Original Word: l e i t @ n t | Converted Word: leiernt
Original Word: t r o l i b v s | Converted Word: trollbbus
Original Word: p l e i n | Converted Word: plain
Original Word: p e i n t @ r ɞ | Converted Word: peinter
Original Word: l o n h ɑ n d | Converted Word: longandd
Original Words: ['l e i t @ n t', 't r o l i b v s', 'p l e i n', 'p e i n t @ r ɞ', 'l o n h ɑ n d']
Converted Words: ['leiernt', 'trollbbus', 'plain', 'peinter', 'longandd']
