In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import pickle
import numpy as np
import os
import random

In [2]:
input_nextWord = pickle.load(open('../input/next-word-predictor/DS_5_train_input_nextWord','rb'))
output_prefixList = pickle.load(open('../input/next-word-predictor/DS_5_train_input_prefixList','rb'))
input_train = pickle.load(open('../input/next-word-predictor/DS_5_train_input','rb'))

# Tokenizer

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(input_train)

# saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

In [4]:
tokenizer.texts_to_sequences(['f'])

[[7]]

In [5]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

12


# Augmenting Dataset

In [6]:
def augment_dataset(output_prefixList, input_nextWord):
    next_words_list = []
    prev_words_list = []
    for string,word in zip(output_prefixList, input_nextWord):
        for j in range(len(string)-1):
            prev_words_list.append(string[:j+1])
            next_words_list.append(string[j+1])
        prev_words_list.append(string)
        next_words_list.append(word)
    return next_words_list,prev_words_list
    
next_words,prev_words = augment_dataset(output_prefixList, input_nextWord)

In [7]:
print(len(max(prev_words, key = len)))

48


# Tokenize inputs and outputs

In [8]:
sequence_prev_words = tokenizer.texts_to_sequences(prev_words)
sequence_next_words = tokenizer.texts_to_sequences(next_words)

In [9]:
print(sequence_prev_words[1])

[1, 5]


# Padding Inputs 

In [10]:
def pad(some_list, target_len):
    padded_prev_words = []
    for lst in some_list:
        padded_prev_words.append([0]*(target_len - len(lst)) + lst[:target_len])
    return padded_prev_words

input_sequence_length = 50
new_sequence_prev_words = pad(sequence_prev_words, input_sequence_length)

print(new_sequence_prev_words[42])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 3]


In [11]:
print(len(new_sequence_prev_words))
print(len(sequence_next_words))

134731
134731


# Splitting into testing and training data

In [12]:
sequence_next_words = np.array(sequence_next_words)
new_sequence_prev_words = np.array(new_sequence_prev_words)

random.shuffle(list(zip(new_sequence_prev_words, sequence_next_words)))

train_prev_words = new_sequence_prev_words[:134700]
test_prev_words = new_sequence_prev_words[134700:]
train_next_word = sequence_next_words[:134700]
test_next_word = sequence_next_words[134700:]

# The Model

In [13]:
model = Sequential()
model.add(Embedding(vocab_size, 200, input_length = input_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(128, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

2022-05-03 00:48:54.439413: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-03 00:48:54.539805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-03 00:48:54.540604: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-03 00:48:54.541735: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 200)           2400      
_________________________________________________________________
lstm (LSTM)                  (None, 50, 128)           168448    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 12)                1548      
Total params: 320,492
Trainable params: 320,492
Non-trainable params: 0
_________________________________________________________________


In [15]:
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping

checkpoint = ModelCheckpoint("next_words.h5", monitor='val_accuracy', patience = 5, verbose=0, save_best_only=True, mode='max')
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='min')
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics = ['accuracy'])
model.fit(train_prev_words, train_next_word, epochs=50, batch_size=64, validation_split = 0.3, callbacks=[checkpoint, earlyStopping])

2022-05-03 00:48:57.713828: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50


2022-05-03 00:49:00.878062: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


<keras.callbacks.History at 0x7f986997fa90>

# Testing the model

In [16]:
model = keras.models.load_model('next_words.h5')
model.evaluate(test_prev_words, test_next_word)



[1.328513264656067, 0.3870967626571655]

# Regenerating Text

In [17]:
# Load the model and tokenizer
model = keras.models.load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  print(predicted_word)
  return predicted_word

