## Importing data from Kaggle

---



In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [5]:
# Download the dataset archive with the Kaggle CLI (requires ~/.kaggle/kaggle.json).
!kaggle datasets download -d  ashishpandey2062/next-word-predictor-text-generator-dataset

Dataset URL: https://www.kaggle.com/datasets/ashishpandey2062/next-word-predictor-text-generator-dataset
License(s): MIT
Downloading next-word-predictor-text-generator-dataset.zip to /content
  0% 0.00/61.5k [00:00<?, ?B/s]
100% 61.5k/61.5k [00:00<00:00, 223MB/s]


In [6]:
import zipfile

# Unpack the downloaded archive. The context manager closes the file handle
# even if extraction fails part-way through.
with zipfile.ZipFile('/content/next-word-predictor-text-generator-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [7]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# result does not depend on the platform's default locale.
with open('/content/next_word_predictor.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

# Whitespace-separated words, used to fit the tokenizer vocabulary.
text_words = text_data.split()

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

## Converting words into tokens (integer values)

In [9]:
# Keras Tokenizer with default settings; it will hold the word -> integer index mapping.
tokenizer = Tokenizer()

In [10]:
# Learn the vocabulary (word -> integer index) from the corpus words.
tokenizer.fit_on_texts(text_words)

In [11]:
# Inspect the learned word -> index mapping (indices start at 1).
tokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'i': 6,
 'you': 7,
 'in': 8,
 'is': 9,
 'monica': 10,
 'it': 11,
 'with': 12,
 'ross': 13,
 'that': 14,
 'rachel': 15,
 'for': 16,
 'chandler': 17,
 'this': 18,
 'on': 19,
 'joey': 20,
 'was': 21,
 'oh': 22,
 'phoebe': 23,
 'are': 24,
 'all': 25,
 'as': 26,
 'what': 27,
 'be': 28,
 'like': 29,
 'no': 30,
 "it's": 31,
 "i'm": 32,
 'her': 33,
 'they': 34,
 'just': 35,
 'from': 36,
 'okay': 37,
 'not': 38,
 'so': 39,
 'my': 40,
 'have': 41,
 'me': 42,
 'where': 43,
 'know': 44,
 'she': 45,
 'we': 46,
 'out': 47,
 'well': 48,
 'their': 49,
 'can': 50,
 'at': 51,
 'he': 52,
 'yeah': 53,
 'your': 54,
 'about': 55,
 'but': 56,
 'its': 57,
 'up': 58,
 "don't": 59,
 'text': 60,
 'scene': 61,
 'by': 62,
 'do': 63,
 'an': 64,
 'or': 65,
 'were': 66,
 'there': 67,
 'if': 68,
 'uh': 69,
 'look': 70,
 'life': 71,
 'through': 72,
 'into': 73,
 'him': 74,
 'his': 75,
 "you're": 76,
 'hey': 77,
 'how': 78,
 'right': 79,
 'think': 80,
 'time': 81,
 'no

In [12]:
# Build the training examples: for each non-blank line, every prefix of at
# least two tokens becomes one sequence (its last token is the word to predict).
input_sequences = []

for line in text_data.split('\n'):
  if not line.strip():
    continue
  token_ids = tokenizer.texts_to_sequences([line])[0]
  input_sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [13]:
# Length of the longest prefix sequence; all sequences are padded to this width.
max_len = max(map(len, input_sequences))

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every sequence to a common width so that the last column always
# holds the word to predict.
padded_input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_len,
)

## Splitting sentences into input and output.

In [15]:
# Model input: every token except the last one of each padded sequence.
X = padded_input_sequences[:,:-1]

In [16]:
# Target: the final token of each padded sequence (the next word to predict).
y = padded_input_sequences[:, -1]

In [17]:
# Check the input matrix dimensions: (number of sequences, max_len - 1).
X.shape

(26383, 324)

In [18]:
# Check the target vector dimensions: one integer label per sequence.
y.shape

(26383,)

In [19]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the targets. Word indices start at 1, so the class count is
# the vocabulary size plus one to leave room for index 0.
# The labels are taken from padded_input_sequences rather than from `y` itself
# so that re-running this cell does not one-hot encode an already one-hot matrix.
y = to_categorical(padded_input_sequences[:, -1], num_classes=len(tokenizer.word_index)+1)

In [20]:
# Check the one-hot target dimensions: (number of sequences, vocabulary size + 1).
y.shape

(26383, 4994)

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [22]:
# Number of classes / embedding rows: one per known word plus index 0, which
# pad_sequences uses as the padding value. Derived from the tokenizer so it
# cannot drift out of sync with the one-hot targets.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
# Declare the input with an explicit Input object; passing `input_shape` to
# the first layer is discouraged by Keras and emits a warning.
model.add(tf.keras.Input(shape=(max_len-1,)))
# 100-dimensional word embeddings -> 150-unit LSTM -> distribution over the vocabulary.
model.add(Embedding(vocab_size, 100))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))

  super().__init__(**kwargs)


In [23]:
# Multi-class cross-entropy over the one-hot targets, optimised with Adam;
# accuracy is reported during training.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [24]:
# Print the model architecture.
model.summary()

In [25]:
# Train on the full set of prefix sequences for 150 epochs (no validation split).
model.fit(
    x=X,
    y=y,
    epochs=150,
)

Epoch 1/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - accuracy: 0.0495 - loss: 7.2800
Epoch 2/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.0747 - loss: 6.4302
Epoch 3/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.0940 - loss: 5.9325
Epoch 4/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.1151 - loss: 5.5535
Epoch 5/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 16ms/step - accuracy: 0.1402 - loss: 5.1501
Epoch 6/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.1574 - loss: 4.7688
Epoch 7/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.1859 - loss: 4.3933
Epoch 8/150
[1m825/825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.2209 - loss: 4.0581
Epoch 9/150
[1m

<keras.src.callbacks.history.History at 0x7e2a958d49e0>

In [26]:
# Seed text that the generation loop extends one word at a time.
text = "Please"

In [27]:
import numpy as np

# Greedy generation: repeatedly predict the most likely next word and append
# it to `text`, for at most 15 words.
for i in range (15):
  token_text = tokenizer.texts_to_sequences([text])[0]
  # Pad to max_len - 1, the input width the model was trained on
  # (X drops the last column of the max_len-wide padded sequences).
  padded_token_text = pad_sequences([token_text], maxlen = max_len - 1, padding='pre')
  pos = int(np.argmax(model.predict(padded_token_text)))

  # index_word is the tokenizer's index -> word mapping; a direct lookup
  # replaces scanning the whole word_index dict on every step.
  word = tokenizer.index_word.get(pos)
  if word is None:
    # The predicted index has no word (e.g. padding index 0). The text would
    # not change, so every later iteration would repeat the same prediction.
    break
  text = text + " " + word
  print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 228ms/step
Please let
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me know
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Please let me know if
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me know if you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Please let me know if you have
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me know if you have any
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me know if you have any other
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Please let me know if you have any other requests
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Please let me know if you have