In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset
imdb_reviews, _ = tf.keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [3]:
num_samples = 10000
imdb_reviews = imdb_reviews[:num_samples]

In [4]:
#convert reviews to strings
imdb_reviews= [' '.join(map(str, review)) for review in imdb_reviews[0]]

In [5]:
# Preprocess dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(imdb_reviews)
vocab_size = len(tokenizer.word_index) + 1

In [6]:
def data_generator(reviews, tokenizer, vocab_size, max_seq_length, batch_size):
    while True:
        for i in range(0, len(reviews), batch_size):
            batch_reviews = reviews[i:i+batch_size]
            batch_reviews = [' '.join(map(str, review)) for review in batch_reviews]  # Convert to list of strings
            sequences = tokenizer.texts_to_sequences(batch_reviews)
            sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='pre')
            X = sequences[:, :-1]
            y = tf.keras.utils.to_categorical(sequences[:, -1], num_classes=vocab_size)
            yield X, y

In [7]:
# Split data into train and validation sets
X_train, X_val = train_test_split(imdb_reviews, test_size=0.2, random_state=42)

In [8]:
# Define parameters
max_seq_length = max(len(review.split()) for review in imdb_reviews)
embedding_dim = 256
lstm_units = 128
batch_size = 32

In [9]:
# Define model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length-1),
    LSTM(units=lstm_units),
    Dense(vocab_size, activation='softmax')
])

In [10]:
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [11]:
# Train model using data generator
train_generator = data_generator(X_train, tokenizer,vocab_size, max_seq_length, batch_size)
steps_per_epoch = len(X_train) // batch_size
validation_steps = len(X_val) // batch_size

In [12]:
# Train model
model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=10, validation_data=train_generator, validation_steps=validation_steps)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7daf3cb9bfa0>

In [14]:
# Generate text
seed_text = "The sun"
for _ in range(100):
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=max_seq_length-1, truncating='pre')
    y_pred = model.predict(encoded)
    next_word = tokenizer.index_word[y_pred[0]]
    seed_text += " " + next_word
print(seed_text)



TypeError: unhashable type: 'numpy.ndarray'