In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [3]:
# Example text sequences
texts = [
    "I love deep learning",
    "Autoencoders are amazing",
    "Text generation is fun",
    "Neural networks are powerful"
]

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Get vocabulary size
vocab_size = len(word_index) + 1

# Pad sequences to have the same length
max_sequence_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='post')


In [4]:
# Hyperparameters
embedding_dim = 50  # Size of word embeddings
latent_dim = 100    # Size of the latent space (output of encoder)

# Encoder
encoder_inputs = Input(shape=(max_sequence_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_len)
embedded_inputs = embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=False, return_state=True)
_, state_h, state_c = encoder_lstm(embedded_inputs)

# Encoder output: the latent representation (state_h and state_c)

# Decoder
decoder_inputs = Input(shape=(max_sequence_len,))
decoder_embedding = embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=False)
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

# Output layer: Predict the next word in the sequence
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Build the autoencoder model
autoencoder = Model([encoder_inputs, decoder_inputs], decoder_outputs)
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

autoencoder.summary()




In [5]:
# Create input-output pairs
X = padded_sequences
y = np.expand_dims(padded_sequences, -1)  # Target sequences (same as inputs)

# Train the autoencoder
history = autoencoder.fit([X, X], y, batch_size=16, epochs=100, validation_split=0.2)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9s/step - loss: 2.7053 - val_loss: 2.7112
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - loss: 2.6991 - val_loss: 2.7148
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - loss: 2.6928 - val_loss: 2.7185
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - loss: 2.6863 - val_loss: 2.7223
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 2.6796 - val_loss: 2.7263
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 2.6724 - val_loss: 2.7304
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - loss: 2.6649 - val_loss: 2.7348
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 2.6567 - val_loss: 2.7394
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [6]:
def generate_text(seed_text, num_words, max_sequence_len, model, tokenizer):
    for _ in range(num_words):
        # Tokenize the seed text
        sequence = tokenizer.texts_to_sequences([seed_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_sequence_len, padding='post')
        
        # Predict the next word
        predicted = model.predict([sequence, sequence], verbose=0)
        predicted_word_idx = np.argmax(predicted[0][-1])

        # Convert predicted index to word
        word = tokenizer.index_word.get(predicted_word_idx)
        if word is None:
            break
        seed_text += ' ' + word
    return seed_text

# Generate a text sequence
seed_text = "I love"
generated_text = generate_text(seed_text, num_words=5, max_sequence_len=max_sequence_len, model=autoencoder, tokenizer=tokenizer)
print("Generated Text:", generated_text)


Generated Text: I love deep learning learning learning learning
