In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import numpy as np
import pickle

In [None]:
# Colab-only step: ask the user to upload the corpus file through Colab's
# `files` helper and keep whatever it returns in `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving data.txt to data (1).txt


In [None]:
# Colab-only step: mount Google Drive under /content/drive.
# NOTE(review): no cell visible in this notebook reads from /content/drive —
# confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Show the contents of the current working directory so the uploaded
# corpus file can be located by name.
print(os.listdir("."))


['.config', 'data.txt', 'data (1).txt', 'sample_data']


In [None]:
# Read the whole corpus into memory as a single UTF-8 string.
# NOTE(review): the upload step logged "Saving data.txt to data (1).txt", so
# this path may point at an older copy of the file — confirm it is the
# intended one.
file_path = '/content/data.txt'
with open(file_path, mode='r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [None]:
# Clean and preprocess the text: lower-case it and flatten it onto one line
data = data.lower().replace('\n', ' ').replace('\r', '')

# Tokenize the text (the vocabulary is learned from the whole corpus)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the tokenizer. The context manager guarantees the file is flushed and
# closed, instead of leaving an open handle for the garbage collector.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Convert text to sequences: one integer id per token of the corpus
sequence_data = tokenizer.texts_to_sequences([data])[0]
# One more than the number of known words, leaving room for index 0, which
# predict_next_word uses as its padding value.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
sequence_length = 3  # Number of previous words to predict the next one

# Slide a window over the token ids: each run of `sequence_length` tokens is
# one input row, and the token right after it is the target.
window_starts = range(len(sequence_data) - sequence_length)
X = np.array(
    [sequence_data[start:start + sequence_length] for start in window_starts]
)
# One-hot encode the targets across the whole vocabulary.
y = to_categorical(
    [sequence_data[start + sequence_length] for start in window_starts],
    num_classes=vocab_size,
)

In [None]:
# Next-word classifier: embed each of the `sequence_length` context tokens,
# pass them through two stacked LSTMs, then score every vocabulary index.
model = Sequential([
    # An explicit Input replaces Embedding's `input_length` argument, which
    # Keras 3 deprecates (it only triggers a warning there). It also lets the
    # model be built before the first call to fit().
    tf.keras.Input(shape=(sequence_length,)),
    Embedding(vocab_size, 10),
    # return_sequences=True so the second LSTM receives one vector per timestep.
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    # One output per vocabulary index, matching the one-hot targets in `y`.
    Dense(vocab_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))



In [None]:
# Train on the windowed corpus for 5 epochs in mini-batches of 64 samples.
# The call is the last expression of the cell, so its return value (a Keras
# History object) is shown as the cell output.
model.fit(X, y, epochs=5, batch_size=64)

Epoch 1/5
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 36ms/step - loss: 6.8846
Epoch 2/5
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 36ms/step - loss: 6.1997
Epoch 3/5
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 35ms/step - loss: 5.8748
Epoch 4/5
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 35ms/step - loss: 5.6182
Epoch 5/5
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 35ms/step - loss: 5.4008


<keras.src.callbacks.history.History at 0x7a3022375bb0>

In [None]:
def predict_next_word(model, tokenizer, input_text, sequence_length):
    """Predict the word most likely to follow ``input_text``.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns per-index
            scores for a ``(1, sequence_length)`` batch of token ids.
        tokenizer: Object providing ``texts_to_sequences`` and an
            ``index_word`` mapping from integer index to word.
        input_text: Prompt text. Only its last ``sequence_length`` tokens are
            fed to the model; shorter prompts are left-padded with 0.
        sequence_length: Number of context tokens the model expects.

    Returns:
        The ``index_word`` entry of the highest-scoring index other than 0.

    Raises:
        KeyError: If the winning index has no entry in ``tokenizer.index_word``.
    """
    # Convert the input text to a sequence of integers
    input_seq = tokenizer.texts_to_sequences([input_text])[0]

    # Pad the sequence on the left so the most recent words stay at the end
    if len(input_seq) < sequence_length:
        input_seq = [0] * (sequence_length - len(input_seq)) + input_seq

    input_seq = np.array(input_seq[-sequence_length:]).reshape(1, sequence_length)

    # Predict the next word. Work on a flat float copy so the model's own
    # output array is never modified.
    scores = np.array(model.predict(input_seq, verbose=0), dtype=float).reshape(-1)

    # Index 0 is the padding value used above and is not expected to have an
    # index_word entry; without this mask a win for index 0 raised KeyError.
    if scores.size:
        scores[0] = -np.inf

    return tokenizer.index_word[int(np.argmax(scores))]

In [None]:
# Set the sequence length
# NOTE(review): this repeats the value assigned where the training windows
# were built; the two must stay equal or the array built by predict_next_word
# will not match the window length the model was trained on.
sequence_length = 3

# Input your test text
# Only the last `sequence_length` tokens of the prompt reach the model.
input_text = "i am going to"
predicted_word = predict_next_word(model, tokenizer, input_text, sequence_length)

# Show the prompt next to the model's single best guess.
print(f"Input: '{input_text}'\nPredicted next word: '{predicted_word}'")

Input: 'i am going to'
Predicted next word: 'be'


In [None]:
# Smoke-test the trained model on a handful of short prompts.
test_sentences = [
    "I love ",
    "The cat is",
    "He said that",
    "They are going to",
    "Once upon a"
]

# Print each prompt together with the single word the model ranks highest;
# predict_next_word keeps only the last `sequence_length` tokens of a prompt.
for sentence in test_sentences:
    predicted_word = predict_next_word(model, tokenizer, sentence, sequence_length)
    print(f"Input: '{sentence}'\nPredicted next word: '{predicted_word}'")

Input: 'I love '
Predicted next word: 'the'
Input: 'The cat is'
Predicted next word: 'the'
Input: 'He said that'
Predicted next word: 'i'
Input: 'They are going to'
Predicted next word: 'the'
Input: 'Once upon a'
Predicted next word: 'door'


In [None]:
# Persist the trained model to 'next_word_model.keras' (a relative path, so
# it is resolved against the current working directory).
model.save('next_word_model.keras')

In [None]:
# Second smoke test, this time with two-word prompts. These are shorter than
# sequence_length (3), so predict_next_word left-pads them with 0.
test_sentences = [
    "how are",
    "what time",
    "where are",
    "you had"

]

# Print each prompt together with the single word the model ranks highest.
for sentence in test_sentences:
    predicted_word = predict_next_word(model, tokenizer, sentence, sequence_length)
    print(f"Input: '{sentence}'\nPredicted next word: '{predicted_word}'")


Input: 'how are'
Predicted next word: 'a'
Input: 'what time'
Predicted next word: 'i'
Input: 'where are'
Predicted next word: 'the'
Input: 'you had'
Predicted next word: 'been'
