In [14]:
import nltk
nltk.download('brown')
from nltk.corpus import brown

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [15]:
sentences = brown.sents(categories='science_fiction')  # pick category
text = " ".join([" ".join(sent).lower() for sent in sentences])
words = text.split()

In [16]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(words)
total_words = len(tokenizer.word_index) + 1
print("Vocabulary size:", total_words)


Vocabulary size: 3016


In [17]:
input_sequences = []
for i in range(1, len(words)):
    encoded = tokenizer.texts_to_sequences([words[:i+1]])[0]
    input_sequences.append(encoded)


In [18]:
max_seq_len = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)


In [20]:
seq_len = X.shape[1]
vocab_size = total_words

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_shape=(seq_len,)),
    LSTM(100),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.build(input_shape=(None, seq_len))
model.summary()


In [21]:
model.fit(X, y, epochs=10, batch_size=128, verbose=2)


Epoch 1/10
114/114 - 84s - 733ms/step - accuracy: 0.1550 - loss: 6.4824
Epoch 2/10
114/114 - 83s - 726ms/step - accuracy: 0.1572 - loss: 5.8619
Epoch 3/10
114/114 - 82s - 724ms/step - accuracy: 0.1572 - loss: 5.7598
Epoch 4/10
114/114 - 83s - 725ms/step - accuracy: 0.1569 - loss: 5.6940
Epoch 5/10
114/114 - 83s - 724ms/step - accuracy: 0.1545 - loss: 5.6328
Epoch 6/10
114/114 - 83s - 725ms/step - accuracy: 0.1567 - loss: 5.5657
Epoch 7/10
114/114 - 83s - 724ms/step - accuracy: 0.1639 - loss: 5.4928
Epoch 8/10
114/114 - 83s - 725ms/step - accuracy: 0.1665 - loss: 5.4230
Epoch 9/10
114/114 - 83s - 725ms/step - accuracy: 0.1687 - loss: 5.3467
Epoch 10/10
114/114 - 83s - 726ms/step - accuracy: 0.1726 - loss: 5.2654


<keras.src.callbacks.history.History at 0x7a2160d0bfb0>

In [22]:
def generate_text(seed_text, next_words=20, temperature=1.0):
    result = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences(result.lower().split())
        token_list = [t[0] for t in token_list if t]
        token_list = pad_sequences([token_list], maxlen=X.shape[1], padding='pre')
        preds = model.predict(token_list, verbose=0)[0]

        # Temperature sampling
        preds = np.log(preds + 1e-7) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)

        next_index = np.random.choice(range(total_words), p=preds)
        next_word = tokenizer.index_word.get(next_index, "")
        result += " " + next_word
    return result


In [30]:
seed = "Science"
generated_text = generate_text(seed, next_words=15, temperature=0.8)
print("\nGenerated Text:\n", generated_text)


Generated Text:
 Science behave cruiser the indeed <OOV> <OOV> jubal not final to been sparse a intelligent as
