In [1]:
import tensorflow as tf
import pandas as pd

In [2]:
!kaggle datasets download -d ashishpandey2062/next-word-predictor-text-generator-dataset

Dataset URL: https://www.kaggle.com/datasets/ashishpandey2062/next-word-predictor-text-generator-dataset
License(s): MIT
next-word-predictor-text-generator-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import zipfile
zip_dir = zipfile.ZipFile('/content/next-word-predictor-text-generator-dataset.zip')
zip_dir.extractall()
zip_dir.close()

In [4]:
with open('next_word_predictor.txt', 'r') as f:
  text = f.read()

In [5]:
text

'The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. Children were playing games, and laughter filled the air.\n\nAs the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. Families gathered for picnics, and the smell of barbecues wafted through the air. It was a perfect day for a picnic by the lake.\n\nIn the distance, you could hear the sound of live music coming from a local band, and people began to gather around the stage to enjoy the performance. The atmosphere was electric, and the music had everyone swaying to the beat.\n\nAs the stars began to twinkle in the night sky, the crowd grew even larger, and the festivities continued well into the night. It was a day filled with joy, laughter, and memories that would last a lifetime.\n\n\nT

In [6]:
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

In [7]:
word_index = tokenizer.word_index
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'i': 6,
 'you': 7,
 'in': 8,
 'is': 9,
 'monica': 10,
 'it': 11,
 'with': 12,
 'ross': 13,
 'that': 14,
 'rachel': 15,
 'for': 16,
 'chandler': 17,
 'this': 18,
 'on': 19,
 'joey': 20,
 'was': 21,
 'oh': 22,
 'phoebe': 23,
 'are': 24,
 'all': 25,
 'as': 26,
 'what': 27,
 'be': 28,
 'like': 29,
 'no': 30,
 "it's": 31,
 "i'm": 32,
 'her': 33,
 'they': 34,
 'just': 35,
 'from': 36,
 'okay': 37,
 'not': 38,
 'so': 39,
 'my': 40,
 'have': 41,
 'me': 42,
 'where': 43,
 'know': 44,
 'she': 45,
 'we': 46,
 'out': 47,
 'well': 48,
 'their': 49,
 'can': 50,
 'at': 51,
 'he': 52,
 'yeah': 53,
 'your': 54,
 'about': 55,
 'but': 56,
 'its': 57,
 'up': 58,
 "don't": 59,
 'text': 60,
 'scene': 61,
 'by': 62,
 'do': 63,
 'an': 64,
 'or': 65,
 'were': 66,
 'there': 67,
 'if': 68,
 'uh': 69,
 'look': 70,
 'life': 71,
 'through': 72,
 'into': 73,
 'him': 74,
 'his': 75,
 "you're": 76,
 'hey': 77,
 'how': 78,
 'right': 79,
 'think': 80,
 'time': 81,
 'no

In [8]:
len(word_index)

4993

In [9]:
max(len(line)for line in text.split('\n'))

1986

In [10]:
data = [data.replace('\n', '.')for data in text]
data = ''.join(data)
data

'The sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. People were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. Children were playing games, and laughter filled the air...As the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. Families gathered for picnics, and the smell of barbecues wafted through the air. It was a perfect day for a picnic by the lake...In the distance, you could hear the sound of live music coming from a local band, and people began to gather around the stage to enjoy the performance. The atmosphere was electric, and the music had everyone swaying to the beat...As the stars began to twinkle in the night sky, the crowd grew even larger, and the festivities continued well into the night. It was a day filled with joy, laughter, and memories that would last a lifetime....The ancien

In [11]:
max_len = max(len(line)for line in data.split('.'))
max_len

338

In [12]:
input_sequences = []
for line in text.split('.'):
    input_token = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(input_token)):
      n_grams = input_token[:i+1]
      input_sequences.append(n_grams)

In [13]:
input_sequences[:10]

[[1, 155],
 [1, 155, 21],
 [1, 155, 21, 2368],
 [1, 155, 21, 2368, 1549],
 [1, 155, 21, 2368, 1549, 8],
 [1, 155, 21, 2368, 1549, 8, 1],
 [1, 155, 21, 2368, 1549, 8, 1, 422],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215],
 [1, 155, 21, 2368, 1549, 8, 1, 422, 692, 215, 2]]

In [14]:
padded_sequence = pad_sequences(input_sequences, maxlen=max_len)
padded_sequence[:5]

array([[   0,    0,    0, ...,    0,    1,  155],
       [   0,    0,    0, ...,    1,  155,   21],
       [   0,    0,    0, ...,  155,   21, 2368],
       [   0,    0,    0, ...,   21, 2368, 1549],
       [   0,    0,    0, ..., 2368, 1549,    8]], dtype=int32)

In [15]:
X, y = padded_sequence[:, :-1], padded_sequence[:, -1]

In [16]:
X[0], y[0]

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [17]:
y = tf.keras.utils.to_categorical(y, num_classes=len(word_index)+1)

In [18]:
y[0], tf.argmax(y[0])

(array([0., 0., 0., ..., 0., 0., 0.]),
 <tf.Tensor: shape=(), dtype=int64, numpy=155>)

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
len(X_train), len(X_test)

(20702, 5176)

In [20]:
len(X_train[3]), max_len

(337, 338)

In [21]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(None, ))

x = layers.Embedding(input_dim=len(word_index)+1 , output_dim=128)(inputs)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128)(x)
# x = layers.Dense(64, activation='relu')(x)

outputs = layers.Dense(len(word_index)+1, activation='softmax')(x)

model = tf.keras.Model(inputs, outputs)

In [22]:
model.summary()

In [23]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [24]:
model.fit(X_train, y_train,
          batch_size=32,
          validation_data=(X_test, y_test),
          epochs=100)

Epoch 1/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 31ms/step - accuracy: 0.0470 - loss: 7.3242 - val_accuracy: 0.0491 - val_loss: 7.0258
Epoch 2/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.0559 - loss: 6.6752 - val_accuracy: 0.0659 - val_loss: 6.9844
Epoch 3/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - accuracy: 0.0677 - loss: 6.3764 - val_accuracy: 0.0767 - val_loss: 6.9211
Epoch 4/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.0809 - loss: 6.1029 - val_accuracy: 0.0893 - val_loss: 6.9116
Epoch 5/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0992 - loss: 5.8079 - val_accuracy: 0.0914 - val_loss: 6.9633
Epoch 6/100
[1m647/647[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.1083 - loss: 5.5922 - val_accuracy: 0.0952 - val_loss: 6.9861
Epoch 7/10

<keras.src.callbacks.history.History at 0x7ae8bcfeba90>

In [25]:
model.save('Next_word_pred.keras')

In [26]:
import pickle
with open('Next_word_tokenizer.pkl', 'wb') as f:
  pickle.dump(tokenizer, f)

In [27]:
loaded_model = tf.keras.models.load_model('Next_word_pred.keras')
loaded_tokenizer = pickle.load(open('Next_word_tokenizer.pkl', 'rb'))

In [54]:
def pred_next_word(text, model, tokenizer, max_len=max_len):
  word_index = tokenizer.word_index
  input_token = tokenizer.texts_to_sequences([text])[0]
  padded_token = pad_sequences([input_token], max_len)
  pred = model.predict(padded_token).argmax()
  for word, index in word_index.items():
    if index == pred:
      return word, index

In [62]:
text = 'Children were playing '
pred_next_word(text, model, tokenizer, max_len)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step


('games', 2373)