In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the IMDB reviews; only the integer-encoded training reviews are kept
# (labels and the test split are not used in this notebook).
x_train = tf.keras.datasets.imdb.load_data()[0][0]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
# Convert integer sequences back to text
word_index = tf.keras.datasets.imdb.get_word_index()
index_word = dict((idx, word) for word, idx in word_index.items())


def decode_review(sequence):
    """Turn one integer-encoded review into a space-separated string.

    Each id is shifted down by 3 before the lookup (load_data() offsets word
    ids by 3 by default); ids with no matching word become '?'.
    """
    return ' '.join(index_word.get(idx - 3, '?') for idx in sequence)


imdb_reviews = list(map(decode_review, x_train))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [4]:
# Fit a word-level tokenizer on the decoded reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(imdb_reviews)

# One extra slot on top of the fitted vocabulary, so that id 0 (used for
# padding below) is a valid class/embedding index as well.
vocab_size = 1 + len(tokenizer.word_index)

In [5]:
# Generate input-output pairs: every review is left-padded to the length of
# the longest one, then its final token becomes the target and everything
# before it becomes the input.
token_ids = tokenizer.texts_to_sequences(imdb_reviews)
max_seq_length = max(map(len, token_ids))
sequences = pad_sequences(token_ids, maxlen=max_seq_length, padding='pre')

X, last_tokens = sequences[:, :-1], sequences[:, -1]
# One-hot encode the target token over the whole vocabulary.
y = tf.keras.utils.to_categorical(last_tokens, num_classes=vocab_size)

In [6]:
# Hold out 20% of the (input, target) pairs for validation; the fixed
# random_state keeps the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Define model, built up layer by layer:
#   token ids -> 256-d embeddings -> 128-unit LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=256, input_length=max_seq_length-1))
model.add(LSTM(units=128))
model.add(Dense(vocab_size, activation='softmax'))

In [8]:
# Compile model: Adam optimiser, categorical cross-entropy to match the
# one-hot encoded next-word targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [9]:
# Define data generator
def data_generator(X, y, batch_size):
    """Yield aligned ``(X_batch, y_batch)`` slices forever, in order.

    The data is walked front to back in steps of ``batch_size``; once the end
    is reached the walk restarts from the beginning. The last batch of each
    pass is shorter when ``len(X)`` is not a multiple of ``batch_size``.
    No shuffling is performed.

    Args:
        X: Sliceable, sized collection of inputs (e.g. a NumPy array).
        y: Sliceable, sized collection of targets, same length as ``X``.
        batch_size: Positive number of samples per batch.

    Yields:
        Tuples ``(X[i:i + batch_size], y[i:i + batch_size])``.

    Raises:
        ValueError: On the first ``next()`` call, if ``batch_size`` is not
            positive, ``X`` is empty, or ``X`` and ``y`` differ in length.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    n_samples = len(X)
    # With nothing to slice, the endless loop below would spin forever
    # without ever yielding, so fail loudly instead.
    if n_samples == 0:
        raise ValueError("data_generator needs at least one sample")
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    while True:
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            yield X[start:stop], y[start:stop]

In [10]:
# Train model using data generator
batch_size = 32

# data_generator yields ceil(n / batch_size) batches per pass over the data
# (the last one may be shorter). Use ceiling division so one epoch /
# validation run corresponds to exactly one full pass; with floor division
# the trailing partial batch spills into the next epoch and successive passes
# drift out of alignment with the data.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
validation_steps = (len(X_val) + batch_size - 1) // batch_size

train_generator = data_generator(X_train, y_train, batch_size)
val_generator = data_generator(X_val, y_val, batch_size)

# Keep the History object so the per-epoch losses can be inspected later.
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=10,
    validation_data=val_generator,
    validation_steps=validation_steps,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7d809f878670>

In [11]:

# Generate text: repeatedly feed the running text to the model and sample the
# next word from its predicted distribution.
seed_text = "The sun"
context_length = max_seq_length - 1  # input width the model was trained on
for _ in range(100):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad / left-truncate so the most recent words sit at the end.
    encoded = pad_sequences([encoded], maxlen=context_length, truncating='pre')
    # verbose=0: avoid printing a progress bar for every generated word.
    y_pred_probs = model.predict(encoded, verbose=0)[0]

    # Work on a float64 copy: the sampler checks that the probabilities sum
    # to 1, and the values are renormalised below anyway.
    y_pred_probs = np.asarray(y_pred_probs, dtype=np.float64)
    # Class 0 is the padding id and has no entry in tokenizer.index_word, so
    # sampling it would raise KeyError. Remove it from the distribution.
    y_pred_probs[0] = 0.0
    total = y_pred_probs.sum()
    if total <= 0:
        # All probability mass was on padding; there is nothing to sample.
        break
    y_pred_probs /= total

    next_word_index = np.random.choice(len(y_pred_probs), p=y_pred_probs)
    next_word = tokenizer.index_word[next_word_index]
    seed_text += " " + next_word
print(seed_text)

The sun stinks that gem good movie worse junk worse other today cable enjoy it favourite up d warned america audience good airing airing career immediately ii scripts stifler falk net ii ii order films rhonda bother shows note empathy known about people enjoy it pathetic suspense fans feel fights art film together audiences film making it made this film please recommend it off portrayed substance up kiss list please 2005 bad later it else do more beautiful list should better pan justice deliver better swing people go minutes made it so much much see it yet again again once see it
