In [63]:
faqs = """ Text Generation using LSTM (Next Word Prediction)

Natural language is sequential in nature because every
 word in a sentence depends on the words that come before it.
  For example, in the sentence I am going to the market each word follows a meaningful order.
  If we change the order randomly, the sentence will lose its meaning.
   This sequential dependency makes text data perfect for training an LSTM (Long Short-Term Memory) model.
    LSTM is a special type of Recurrent Neural Network (RNN) that is designed to learn long-term dependencies in sequential data.

In text generation tasks, the main objective is to predict the next word in a sentence based on previous words.
For example, if the input is I love to play , the model may predict the next word as  football or games depending on the training data.
The LSTM model learns patterns, grammar, and context from a large amount of text data.
It remembers important words from earlier in the sentence and uses that memory to make better predictions.
 This is why LSTM performs better than traditional RNNs for language-related tasks.

To build this model, the text data is first preprocessed.
 The sentences are converted into lowercase, punctuation is removed, and the text is tokenized (split into words).
  Each word is then converted into numerical form using tokenization and padding techniques so that the LSTM model can understand it.
   The model is trained on sequences of words where the input is a group of words, and the output is the next word.
   Over time, the model learns the probability of which word is most likely to appear next.

Text generation using LSTM has many real-world applications.
 It is used in chatbots, predictive text keyboards, story generation systems, and language translation tools.
  For example, when you type a message on your smartphone, the suggested next word is often predicted using models similar to LSTM.
   By training on a larger dataset, the model can generate longer and more meaningful sentences.

In conclusion, text generation is a simple and effective topic to implement LSTM because text naturally follows a sequential pattern.
 The dependency between words makes LSTM an ideal model for learning context and predicting future words.
  This project is beginner-friendly and can be easily implemented in Google Colab using libraries like TensorFlow or PyTorch.
"""

In [64]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [65]:
tokenizer = Tokenizer()

In [66]:
tokenizer.fit_on_texts([faqs])

In [67]:
len(tokenizer.word_index)

188

In [68]:
input_sequences = []
for sentence in faqs.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [69]:
input_sequences

[[3, 15],
 [3, 15, 16],
 [3, 15, 16, 4],
 [3, 15, 16, 4, 12],
 [3, 15, 16, 4, 12, 8],
 [3, 15, 16, 4, 12, 8, 53],
 [54, 24],
 [54, 24, 2],
 [54, 24, 2, 20],
 [54, 24, 2, 20, 5],
 [54, 24, 2, 20, 5, 55],
 [54, 24, 2, 20, 5, 55, 29],
 [54, 24, 2, 20, 5, 55, 29, 56],
 [8, 5],
 [8, 5, 6],
 [8, 5, 6, 17],
 [8, 5, 6, 17, 57],
 [8, 5, 6, 17, 57, 13],
 [8, 5, 6, 17, 57, 13, 1],
 [8, 5, 6, 17, 57, 13, 1, 11],
 [8, 5, 6, 17, 57, 13, 1, 11, 21],
 [8, 5, 6, 17, 57, 13, 1, 11, 21, 58],
 [8, 5, 6, 17, 57, 13, 1, 11, 21, 58, 59],
 [8, 5, 6, 17, 57, 13, 1, 11, 21, 58, 59, 22],
 [14, 25],
 [14, 25, 5],
 [14, 25, 5, 1],
 [14, 25, 5, 1, 17],
 [14, 25, 5, 1, 17, 30],
 [14, 25, 5, 1, 17, 30, 60],
 [14, 25, 5, 1, 17, 30, 60, 61],
 [14, 25, 5, 1, 17, 30, 60, 61, 9],
 [14, 25, 5, 1, 17, 30, 60, 61, 9, 1],
 [14, 25, 5, 1, 17, 30, 60, 61, 9, 1, 62],
 [14, 25, 5, 1, 17, 30, 60, 61, 9, 1, 62, 31],
 [14, 25, 5, 1, 17, 30, 60, 61, 9, 1, 62, 31, 8],
 [14, 25, 5, 1, 17, 30, 60, 61, 9, 1, 62, 31, 8, 32],
 [14, 25, 5, 

In [70]:
max_len = max([len(x) for x in input_sequences])

In [71]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [72]:
padded_input_sequences

array([[  0,   0,   0, ...,   0,   3,  15],
       [  0,   0,   0, ...,   3,  15,  16],
       [  0,   0,   0, ...,  15,  16,   4],
       ...,
       [  0,   0,   0, ..., 185, 186, 187],
       [  0,   0,   0, ..., 186, 187,  46],
       [  0,   0,   0, ..., 187,  46, 188]], dtype=int32)

In [73]:
X = padded_input_sequences[:,:-1]

In [74]:
y = padded_input_sequences[:,-1]

In [75]:
X.shape

(363, 25)

In [76]:
y.shape

(363,)

In [77]:
from tensorflow.keras.utils import to_categorical
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)

In [78]:
y.shape

(363, 189)

In [79]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Input(shape=(max_len-1,)),
    Embedding(vocab_size, 100),
    LSTM(150),
    Dense(vocab_size, activation='softmax')
])

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [83]:
model.fit(X,y,epochs=100)

Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0791 - loss: 4.3991
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0650 - loss: 4.3262
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1151 - loss: 4.2578
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.0969 - loss: 4.2313
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.1592 - loss: 4.0324
Epoch 6/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1245 - loss: 3.9810
Epoch 7/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1459 - loss: 3.8217
Epoch 8/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1428 - loss: 3.6853
Epoch 9/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b6a6ed5d3d0>

In [99]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

text= "If we change the order"
for i in range(5):

 token_text=tokenizer.texts_to_sequences([text])[0]
 padded_token_text = pad_sequences([token_text],
                                  maxlen=max_len-1,
                                  padding='pre')


 pos = np.argmax(model.predict(padded_token_text))

 for word, index in tokenizer.word_index.items():
  if index == pos:
   text = text +" "+word
   print(text)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
If we change the order randomly
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
If we change the order randomly the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
If we change the order randomly the sentence
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
If we change the order randomly the sentence will
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
If we change the order randomly the sentence will lose


In [None]:
tokenizer.word_index