In [2]:
# Step 1: Import the required libraries and download the Gutenberg corpus
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np





[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\saive\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


In [3]:
# Step 2: Load the raw text of "Alice's Adventures in Wonderland" from the
# NLTK Gutenberg corpus, lower-case it and replace every newline with a space
# so the whole book becomes one long line.
raw_text = gutenberg.raw('carroll-alice.txt').lower().replace('\n', ' ')

# Sanity check: show the first 500 characters of the normalised text.
print(raw_text[:500])

[alice's adventures in wonderland by lewis carroll 1865]  chapter i. down the rabbit-hole  alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought alice 'without pictures or conversation?'  so she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy an


In [4]:
# Step 3: Fit a word-level tokenizer on the corpus.
# The corpus is passed as a one-element list, i.e. a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([raw_text])

# Vocabulary size used for the embedding table and the softmax output.
# The extra 1 accounts for index 0, which the Keras Tokenizer does not assign
# to any word.
total_words = 1 + len(tokenizer.word_index)



In [5]:
# Step 4: Turn the corpus into overlapping training windows.
# Each window holds `sequence_length` context tokens followed by the one token
# that comes next in the text (the prediction target).
sequence_length = 5
token_list = tokenizer.texts_to_sequences([raw_text])[0]

input_sequences = np.array([
    token_list[start:start + sequence_length + 1]
    for start in range(len(token_list) - sequence_length)
])

# All columns but the last are the model input; the last column is the label.
X = input_sequences[:, :-1]
# One-hot encode the labels: one row per window, `total_words` columns.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)




In [6]:
# Step 5: Build and train the LSTM next-word model

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable after "Restart Kernel -> Run All" (as far as the
# backend's kernels are deterministic).
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape with an explicit Input layer: Embedding's
    # `input_length` argument is deprecated in Keras 3 and only produces a
    # warning there.
    tf.keras.Input(shape=(sequence_length,), dtype='int32'),
    # Map each token id to a 64-dimensional dense vector.
    Embedding(total_words, 64),
    # Summarise the context window into a single 128-dimensional state.
    LSTM(128),
    # Probability distribution over the whole vocabulary for the next word.
    Dense(total_words, activation='softmax'),
])

# `y` is one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object so the loss/accuracy curves can be inspected later;
# assigning it also stops its repr from being echoed as the cell output.
history = model.fit(X, y, epochs=20, batch_size=256)



Epoch 1/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.0524 - loss: 7.2856
Epoch 2/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0609 - loss: 6.0825
Epoch 3/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0571 - loss: 5.9786
Epoch 4/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.0677 - loss: 5.8674
Epoch 5/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.0828 - loss: 5.7434
Epoch 6/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 29ms/step - accuracy: 0.0942 - loss: 5.6277
Epoch 7/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.0978 - loss: 5.5282
Epoch 8/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.1040 - loss: 5.4222
Epoch 9/20
[1m109/109[0m [32m

<keras.src.callbacks.history.History at 0x18b5d7026e0>

In [7]:
# Step 6: Prediction function
def predict_next_word(seed_text, next_words=1):
    """Extend ``seed_text`` by greedily appending predicted words.

    On every step the current text is converted to token ids with the
    notebook-level ``tokenizer``, padded or truncated to ``sequence_length``
    ids, and passed to the notebook-level ``model``. The class with the
    highest score is looked up in ``tokenizer.index_word`` and appended to the
    text after a single space.

    Args:
        seed_text: Text to start from.
        next_words: Maximum number of words to append. Values <= 0 return
            ``seed_text`` unchanged.

    Returns:
        ``seed_text`` followed by the predicted words. Generation stops early
        if the model's top class has no entry in ``tokenizer.index_word``
        (for example the padding index 0), so fewer than ``next_words`` words
        may be appended.
    """
    for _ in range(next_words):
        token_seq = tokenizer.texts_to_sequences([seed_text])[0]
        token_seq = pad_sequences([token_seq], maxlen=sequence_length, padding='pre')
        predicted = int(np.argmax(model.predict(token_seq, verbose=0), axis=-1)[0])

        # Not every class id maps to a word; a plain ``index_word[predicted]``
        # would raise KeyError here, so stop generating instead.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            break
        seed_text += " " + word
    return seed_text

# Example: extend a three-word seed phrase with five predicted words
seed_phrase = "alice was beginning"
print(predict_next_word(seed_phrase, next_words=5))

alice was beginning to the other thing '
