In [1]:
# pip install tensorflow

In [2]:
# pip install keras-preprocessing


In [3]:
# pip show tensorflow keras keras-preprocessing


In [4]:
# pip install tensorflow keras --upgrade


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

# List of phonemes - using a subset for this example; expand as needed
phonemes_list = [
    "P", "B", "T", "D", "K", "G", "CH", "JH", "F", "V", "TH", "DH", "S", "Z", "SH", "ZH",
    "HH", "M", "N", "NG", "L", "R", "Y", "W", "IY", "IH", "EH", "AE", "AA", "AH", "AO", "UH",
    "UW", "ER", "OW", "AW", "AY", "EY", "OY"
]

# Create potential mappings from phonemes to graphemes
phoneme_to_grapheme_map = {
    "P": ["p", "ph"], "B": ["b"], "T": ["t"], "D": ["d"], "K": ["k", "c"], "G": ["g"],
    "CH": ["ch"], "JH": ["j", "g"], "F": ["f", "ph", "gh"], "V": ["v"], "TH": ["th"], "DH": ["dh"],
    "S": ["s", "c"], "Z": ["z", "s"], "SH": ["sh"], "ZH": ["zh", "s", "z"], "HH": ["h"], "M": ["m"],
    "N": ["n"], "NG": ["ng"], "L": ["l"], "R": ["r"], "Y": ["y", "i"], "W": ["w"], 
    "IY": ["ee", "e", "i"], "IH": ["i", "y"], "EH": ["e"], "AE": ["a", "æ"], 
    "AA": ["a", "ah"], "AH": ["uh", "a", "o"], "AO": ["aw", "a", "o"], 
    "UH": ["uh", "u"], "UW": ["oo", "u", "ew"], "ER": ["er", "ur", "ir"], 
    "OW": ["ow", "o"], "AW": ["aw", "au"], "AY": ["ai", "ay", "i", "ey"], 
    "EY": ["ay", "ei", "a"], "OY": ["oi", "oy"],
    "AAH": ["ah"], "AEH": ["æ"], "OE": ["oe", "eu"], "E": ["e", "ea"], 
    "AI": ["ai"], "OU": ["ou", "ow"], "UA": ["ua"], "IA": ["ia"]
}


# Function to create randomized grapheme sequence given a phoneme sequence
def generate_grapheme_sequence(phoneme_seq):
    grapheme_sequence = [random.choice(phoneme_to_grapheme_map.get(phoneme, [phoneme.lower()])) for phoneme in phoneme_seq]
    return grapheme_sequence

# Tokenizer for phonemes
tokenizer_phoneme = Tokenizer(char_level=True)
tokenizer_phoneme.fit_on_texts(phonemes_list)

# Tokenizer for graphemes
tokenizer_grapheme = Tokenizer(char_level=True)

# Function to create tokenized sequences
def create_sequences(input_phoneme_seq):
    # Generate grapheme sequence based on phoneme sequence
    grapheme_output_seq = generate_grapheme_sequence(input_phoneme_seq)

    # Tokenize sequences
    phoneme_input_seq = tokenizer_phoneme.texts_to_sequences([input_phoneme_seq])[0]
    tokenizer_grapheme.fit_on_texts([grapheme_output_seq])
    grapheme_output_seq = tokenizer_grapheme.texts_to_sequences([grapheme_output_seq])[0]

    # Pad sequences to equal length
    max_sequence_length = max(len(phoneme_input_seq), len(grapheme_output_seq))
    phoneme_input_seq = pad_sequences([phoneme_input_seq], maxlen=max_sequence_length, padding='post')
    grapheme_output_seq = pad_sequences([grapheme_output_seq], maxlen=max_sequence_length, padding='post')
    
    return phoneme_input_seq, grapheme_output_seq, max_sequence_length

# Example input (phoneme sequence) for "Please do not touch"
input_phoneme_seq = ["P", "L", "IY", "Z", "D", "UW", "N", "AA", "T", "T", "AH", "CH"]

# Create sequences for training
phoneme_input_seq, grapheme_output_seq, max_sequence_length = create_sequences(input_phoneme_seq)

# Create a model with explicit input length
vocab_size_phonemes = len(tokenizer_phoneme.word_index) + 1
vocab_size_graphemes = len(tokenizer_grapheme.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size_phonemes, output_dim=10, input_length=max_sequence_length))  # Added input_length
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size_graphemes, activation='softmax')))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Reshape target data for model
grapheme_output_seq = grapheme_output_seq.reshape((1, max_sequence_length, 1))

# Train the model
model.fit(phoneme_input_seq, grapheme_output_seq, epochs=250, batch_size=1)

def generate_gibberish(phoneme_seq):
    # Tokenize and pad the phoneme sequence to ensure consistent shape
    phoneme_input_seq = tokenizer_phoneme.texts_to_sequences([phoneme_seq])[0]
    phoneme_input_seq = pad_sequences([phoneme_input_seq], maxlen=max_sequence_length, padding='post')
    
    # Predict gibberish
    predicted_gibberish_indices = model.predict(phoneme_input_seq)
    predicted_gibberish_indices = np.argmax(predicted_gibberish_indices, axis=-1)

    # Convert indices to graphemes
    predicted_gibberish = [tokenizer_grapheme.index_word.get(index, '') for index in predicted_gibberish_indices[0]]
    
    return ''.join(predicted_gibberish)

# Example usage to generate gibberish from phoneme sequence
print("Generated Gibberish:", generate_gibberish(input_phoneme_seq))




Epoch 1/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step - accuracy: 0.0833 - loss: 2.4834
Epoch 2/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step - accuracy: 0.1667 - loss: 2.4801
Epoch 3/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.1667 - loss: 2.4766
Epoch 4/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.1667 - loss: 2.4729
Epoch 5/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.1667 - loss: 2.4688
Epoch 6/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - accuracy: 0.1667 - loss: 2.4640
Epoch 7/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.1667 - loss: 2.4585
Epoch 8/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.1667 - loss: 2.4519
Epoch 9/250
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

In [13]:
print("Generated Gibberish:", generate_gibberish(input_phoneme_seq))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
Generated Gibberish: lleezdoonattuhch
