In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

# Step 1: Load the IMDb dataset (top 10,000 frequent words)
vocab_size = 10000
max_len = 200  # maximum length of review sequences

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)

# Pad sequences to the same length
x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')

# Step 2: Build the GRU model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    GRU(64, return_sequences=False),  # 64 GRU units
    Dense(1, activation='sigmoid')  # binary classification output
])

# Step 3: Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Show the model summary
model.summary()

# Step 4: Train the model
history = model.fit(
    x_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

# Step 5: Evaluate the model on test data
test_loss, test_acc = model.evaluate(x_test, y_test)
print(f'Test Accuracy: {test_acc:.4f}')


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 3us/step




Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 71ms/step - accuracy: 0.5141 - loss: 0.6933 - val_accuracy: 0.5192 - val_loss: 0.6903
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 72ms/step - accuracy: 0.5743 - loss: 0.6727 - val_accuracy: 0.6028 - val_loss: 0.6470
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.7265 - loss: 0.5532 - val_accuracy: 0.8232 - val_loss: 0.4249
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 90ms/step - accuracy: 0.8752 - loss: 0.3169 - val_accuracy: 0.8482 - val_loss: 0.3655
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 84ms/step - accuracy: 0.9275 - loss: 0.2041 - val_accuracy: 0.8648 - val_loss: 0.3704
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.8484 - loss: 0.3949
Test Accuracy: 0.8476


In [3]:
import numpy as np
import tokenizers
# Assume you have these from your training pipeline:
# tokenizer: with methods word_index (word->int) and index_word (int->word)
# model: your trained GRU model for next word prediction
# max_seq_len: max length input the model expects

def generate_text(seed_text, next_words=20, max_seq_len=20):
    output_text = seed_text
    
    for _ in range(next_words):
        # Tokenize and pad the input text
        token_list = [tokenizer.word_index.get(word, 0) for word in output_text.split()]
        token_list = token_list[-max_seq_len:]  # keep last max_seq_len tokens
        token_list = np.array(token_list).reshape(1, -1)
        token_list = tf.keras.preprocessing.sequence.pad_sequences(token_list, maxlen=max_seq_len, padding='pre')
        
        # Predict the probability distribution of the next word
        predicted_probs = model.predict(token_list, verbose=0)[0]
        
        # Pick the word with highest probability (greedy)
        predicted_id = np.argmax(predicted_probs)
        
        # Convert predicted_id back to word
        output_word = tokenizer.index_word.get(predicted_id, '')
        
        # Append predicted word to output_text
        output_text += ' ' + output_word
        
    return output_text

# Example usage:
seed = "the movie"
generated = generate_text(seed_text=seed, next_words=15, max_seq_len=20)
print(generated)


NameError: name 'tokenizer' is not defined