In [1]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

# Load the IMDB dataset
from tensorflow.keras.datasets import imdb

In [2]:
# --- Hyperparameters and data-size limits ---
VOCAB_SIZE = 5000  # value passed as num_words to both the dataset loader and the tokenizer
MAX_SEQUENCE_LEN = 5  # number of context tokens the model sees per example
EMBEDDING_DIM = 50
EPOCHS = 90  # passes over the training sequences
NUM_REVIEWS = 1000  # only the first NUM_REVIEWS reviews are used
MAX_SEQUENCES = 100000  # hard cap on the number of training sequences

# --- Load the IMDB reviews ---
print("Loading IMDB dataset...")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=VOCAB_SIZE)

# Keep just the leading slice of reviews to bound memory use
X_train = X_train[:NUM_REVIEWS]

# --- Build the id -> word lookup used to decode reviews ---
# Every id from the published word index is shifted up by 3; the special
# tokens are then assigned ids 0-3.
word_index = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})
index_word = {token_id: word for word, token_id in word_index.items()}

def decode_review(text):
    """Turn a sequence of word ids into a space-separated string.

    Each id is looked up in the module-level ``index_word`` mapping; ids
    with no entry there are rendered as ``'?'``.
    """
    words = (index_word.get(word_id, '?') for word_id in text)
    return ' '.join(words)

# Decode every selected review back into plain text
print("Preparing corpus...")
corpus = [decode_review(review_ids) for review_ids in X_train]

# Fit a Keras tokenizer on the decoded text, with '<OOV>' as the
# out-of-vocabulary token
tokenizer = Tokenizer(oov_token='<OOV>', num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(corpus)

# Vocabulary size used for the model's embedding and output layers
total_words = VOCAB_SIZE

# Create sequences
# Every training example holds MAX_SEQUENCE_LEN context token ids followed by
# the id of the token to predict (MAX_SEQUENCE_LEN + 1 ids in total).
print("Creating sequences...")
input_sequences = []
sequence_counter = 0

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(2, len(token_list)):
        # Clamp the slice start at 0. A negative start wraps around to the end
        # of the list and produces an empty slice, which used to drop every
        # example whose context is shorter than MAX_SEQUENCE_LEN.
        n_gram_sequence = token_list[max(0, i - MAX_SEQUENCE_LEN):i + 1]
        # Left-pad short windows with 0 — the same padding predict_next_word
        # applies to short seed texts — so the model is trained on the kind
        # of input it receives at prediction time.
        missing = MAX_SEQUENCE_LEN + 1 - len(n_gram_sequence)
        input_sequences.append([0] * missing + n_gram_sequence)
        sequence_counter += 1
        if sequence_counter >= MAX_SEQUENCES:
            break
    # Propagate the inner break once the global cap has been reached
    if sequence_counter >= MAX_SEQUENCES:
        break

print(f"Total sequences: {len(input_sequences)}")

# Convert to a numpy array and split into features (context ids) and labels
# (the id of the next token)
input_sequences = np.array(input_sequences)
X = input_sequences[:, :-1]
# Labels stay as integer class ids. One-hot encoding them would allocate a
# dense (number of sequences, total_words) matrix — 100000 x 5000 values with
# the settings above — whereas the sparse loss below works on the ids directly.
y = input_sequences[:, -1]

# Build the model
print("Building the model...")
model = Sequential()
# `input_length` is not passed: recent Keras releases deprecate the argument
# and the layer takes the sequence length from the data it is given.
model.add(Embedding(total_words, EMBEDDING_DIM))
model.add(LSTM(128))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
print("Training the model...")
history = model.fit(X, y, epochs=EPOCHS, verbose=1)

# Evaluate the model
# NOTE: X and y are the same arrays the model was just trained on, so the
# figure printed below is training accuracy, not a held-out estimate.
print("Evaluating the model...")
loss, accuracy = model.evaluate(X, y, verbose=0)
print(f"Model accuracy: {accuracy * 100:.2f}%")

# Function to predict the next word
def predict_next_word(model, tokenizer, text):
    """Return the word the model scores highest as the continuation of ``text``.

    ``text`` is converted to token ids with ``tokenizer``, padded on the left
    to ``MAX_SEQUENCE_LEN`` by ``pad_sequences`` and passed to
    ``model.predict``. The highest-scoring class id is mapped back through
    ``tokenizer.index_word``; an empty string is returned when that id has no
    entry there.
    """
    encoded = tokenizer.texts_to_sequences([text])[0]
    model_input = pad_sequences([encoded], maxlen=MAX_SEQUENCE_LEN, padding='pre')
    probabilities = model.predict(model_input, verbose=0)
    best_id = np.argmax(probabilities, axis=-1)[0]
    return tokenizer.index_word.get(best_id, '')

# Try the freshly trained model on a handful of sample prompts
seed_texts = [
    "the movie was", "i really love", "this film is", "it is a", "the acting was",
]

print("\nNext word predictions:")
for seed_text in seed_texts:
    next_word = predict_next_word(model, tokenizer, seed_text)
    print("{} -> {}".format(seed_text, next_word))

Loading IMDB dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Preparing corpus...
Creating sequences...
Total sequences: 100000
Building the model...




Training the model...
Epoch 1/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 6ms/step - accuracy: 0.1077 - loss: 6.2742
Epoch 2/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 5ms/step - accuracy: 0.1402 - loss: 5.6065
Epoch 3/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - accuracy: 0.1599 - loss: 5.2893
Epoch 4/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - accuracy: 0.1726 - loss: 5.0505
Epoch 5/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.1800 - loss: 4.8537
Epoch 6/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - accuracy: 0.1886 - loss: 4.6562
Epoch 7/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - accuracy: 0.1973 - loss: 4.4502
Epoch 8/90
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 5ms/step - accuracy: 0.2091 - loss: 4

In [3]:
import pickle
# Write the trained network to disk
model.save('next_word_model.h5')
print("\nModel saved as 'next_word_model.h5'")

# Pickle the fitted tokenizer next to it so the same word/id mapping can be
# restored together with the model
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)
print("Tokenizer saved as 'tokenizer.pkl'")




Model saved as 'next_word_model.h5'
Tokenizer saved as 'tokenizer.pkl'


In [7]:
# Load the saved model and tokenizer for testing and demonstration

# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle  # For loading the tokenizer

# Settings repeated here so this cell can run without the training cell;
# they must match the values used during training
VOCAB_SIZE = 5000
MAX_SEQUENCE_LEN = 5

# Restore the trained network from disk
model = load_model('next_word_model.h5')
print("Model loaded from 'next_word_model.h5'")

# Restore the fitted tokenizer.
# NOTE: unpickling can execute arbitrary code, so only load a tokenizer.pkl
# that you created yourself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
print("Tokenizer loaded from 'tokenizer.pkl'")

# Function to predict the next word
def predict_next_word(model, tokenizer, text):
    """Return the word the model scores highest as the continuation of ``text``.

    ``text`` is converted to token ids with ``tokenizer``, padded on the left
    to ``MAX_SEQUENCE_LEN`` by ``pad_sequences`` and passed to
    ``model.predict``. The highest-scoring class id is mapped back through
    ``tokenizer.index_word``; an empty string is returned when that id has no
    entry there.
    """
    encoded = tokenizer.texts_to_sequences([text])[0]
    model_input = pad_sequences([encoded], maxlen=MAX_SEQUENCE_LEN, padding='pre')
    probabilities = model.predict(model_input, verbose=0)
    best_id = np.argmax(probabilities, axis=-1)[0]
    return tokenizer.index_word.get(best_id, '')

# Run the reloaded model on a handful of sample prompts
seed_texts = [
    "the movie was", "i really love", "this film is", "it is a", "the acting was",
]

print("\nNext word predictions:")
for seed_text in seed_texts:
    next_word = predict_next_word(model, tokenizer, seed_text)
    print("{} -> {}".format(seed_text, next_word))

# Interactive testing
# Keeps prompting for seed texts and printing the predicted next word until
# the user types 'exit' (case-insensitive, surrounding whitespace ignored).
print("\nType a seed text to predict the next word (or 'exit' to quit):")
while True:
    try:
        seed_text = input("Enter seed text: ")
    except (EOFError, NotImplementedError, KeyboardInterrupt):
        # No usable stdin (e.g. the notebook is executed non-interactively,
        # where IPython raises StdinNotImplementedError, a NotImplementedError
        # subclass) or the user interrupted the prompt: end the session
        # instead of failing the cell.
        break
    if seed_text.strip().lower() == 'exit':
        break
    next_word = predict_next_word(model, tokenizer, seed_text)
    print(f"{seed_text} -> {next_word}")



Model loaded from 'next_word_model.h5'
Tokenizer loaded from 'tokenizer.pkl'

Next word predictions:
the movie was -> less
i really love -> the
this film is -> so
it is a -> very
the acting was -> going

Type a seed text to predict the next word (or 'exit' to quit):
Enter seed text: The movie is
The movie is -> a
Enter seed text: exit
