In [1]:
import nltk
from nltk.corpus import gutenberg
import pandas as pd

In [2]:
# Make sure the NLTK 'gutenberg' corpus package is available locally before
# the corpus reader is used in a later cell.
import nltk
nltk.download('gutenberg')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [3]:
# Pull the raw text of 'shakespeare-hamlet.txt' from the NLTK Gutenberg corpus
# and save it as dataset.txt in the current working directory.
data = gutenberg.raw('shakespeare-hamlet.txt')
# NOTE(review): no encoding is passed to open(), so the platform default is
# used; the cell that reads dataset.txt back relies on the same default.
with open('dataset.txt','w') as f:
    f.write(data)

In [4]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [5]:
# Read the data set back from dataset.txt and lower-case the whole text so the
# rest of the notebook works on a single case-normalised string.
# NOTE(review): open() uses the platform default encoding here, matching the
# cell that wrote the file.
with open("dataset.txt", "r") as file:
    text = file.read().lower()


In [6]:
# Fit a word-level tokenizer on the whole corpus.
#
# The original `Tokenizer = Tokenizer()` overwrote the imported class with an
# instance, so running this cell a second time failed with
# "TypeError: 'Tokenizer' object is not callable". Instantiating through a
# private alias of the class keeps the cell re-runnable, while the name
# `Tokenizer` stays bound to the fitted instance because later cells call
# `Tokenizer.texts_to_sequences(...)` and pass `Tokenizer` around.
from tensorflow.keras.preprocessing.text import Tokenizer as _TokenizerClass

Tokenizer = _TokenizerClass()
Tokenizer.fit_on_texts([text])

# word_index starts at 1 (0 is reserved, and is used as the padding value
# further down), hence the +1 for the size of the output/embedding vocabulary.
total_words = len(Tokenizer.word_index) + 1


In [7]:
# Turn every line of the corpus into its growing prefixes: a line tokenised as
# [a, b, c] contributes [a, b] and [a, b, c]. Lines with fewer than two tokens
# contribute nothing. Order is line by line, shortest prefix first.
input_sequences = [
    line_tokens[:end]
    for line_tokens in (
        Tokenizer.texts_to_sequences([line])[0] for line in text.split('\n')
    )
    for end in range(2, len(line_tokens) + 1)
]

In [8]:
# Length of the longest n-gram sequence; every sequence is padded to this.
max_seq_len = max(map(len, input_sequences))
max_seq_len

14

In [9]:
# Left-pad ('pre') every n-gram with zeros up to max_seq_len so the target word
# always sits in the last column of the resulting 2-D array.
input_seq = np.array(
    pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
)

In [10]:
# Display the padded n-gram matrix as a quick sanity check of the pre-padding.
input_seq

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [11]:
# NOTE(review): this cell repeats the computation of cell In [8] on the same
# `input_sequences`; it yields the same value and is a candidate for removal.
max_seq_len = max(map(len, input_sequences))
max_seq_len

14

In [12]:
# NOTE(review): this cell repeats the padding step of cell In [9] with the same
# inputs; it rebuilds an identical array and is a candidate for removal.
input_seq = np.array(
    pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
)

In [13]:
# Display the padded n-gram matrix again (same check as cell In [10]).
input_seq

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [14]:
## Create predictors and label: within each padded row, every column except
## the last is the context, and the last column is the word to predict.
import tensorflow as tf
x = input_seq[:, :-1]
y = input_seq[:, -1]

In [15]:
# One-hot encode the target word ids for use with categorical cross-entropy.
# The labels are taken straight from the last column of `input_seq` instead of
# from `y` itself: the original `y = to_categorical(y, ...)` re-encoded an
# already one-hot array when the cell was run twice, silently corrupting the
# label shape. Reading from `input_seq` makes the cell safe to re-run and gives
# the same result on a first run.
y = tf.keras.utils.to_categorical(input_seq[:, -1], num_classes=total_words)

In [16]:
# Hold out 20% of the n-gram samples for validation. A fixed random_state makes
# the split (and therefore the reported validation metrics) reproducible across
# kernel restarts; without it every run shuffled differently.
# NOTE(review): the split is over individual n-grams, so prefixes of the same
# text line can land in both the training and the validation set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [17]:
## Define early stopping: watch the validation loss, give up after 3 epochs
## without improvement, and roll the model back to its best weights.
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True,
)

In [18]:
# LSTM model: embedding -> two stacked LSTM layers -> softmax over the vocabulary.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,GRU
model1 = Sequential([
    # `input_length` is deprecated on Embedding in current Keras and only
    # triggers a warning; the sequence length comes from build() below.
    Embedding(total_words, 64),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    # One probability per vocabulary entry (index 0 is the padding slot).
    Dense(total_words, activation='softmax'),
])
# Each sample is a context of max_seq_len-1 token ids (the last id is the label).
model1.build(input_shape=(None, max_seq_len-1))
model1.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model1.summary()



In [19]:
# GRU model: same layout as model1, but with GRU layers and a 100-dimensional
# embedding (the original "#lstm model" comment was a copy-paste leftover).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,GRU
model2 = Sequential([
    # `input_length` is deprecated on Embedding in current Keras and only
    # triggers a warning; the sequence length comes from build() below.
    Embedding(total_words, 100),
    # return_sequences=True so the second GRU receives the full sequence.
    GRU(150, return_sequences=True),
    Dropout(0.2),
    GRU(100),
    # One probability per vocabulary entry (index 0 is the padding slot).
    Dense(total_words, activation='softmax'),
])
# Each sample is a context of max_seq_len-1 token ids (the last id is the label).
model2.build(input_shape=(None, max_seq_len-1))
model2.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

In [22]:
# Train the LSTM model for up to 50 epochs, validating on the held-out split;
# the early-stopping callback ends training once val_loss stops improving.
history1 = model1.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0512 - loss: 6.0938 - val_accuracy: 0.0455 - val_loss: 6.9391
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 27ms/step - accuracy: 0.0587 - loss: 5.9125 - val_accuracy: 0.0519 - val_loss: 7.0210
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.0624 - loss: 5.8048 - val_accuracy: 0.0515 - val_loss: 7.0790
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 27ms/step - accuracy: 0.0674 - loss: 5.6978 - val_accuracy: 0.0540 - val_loss: 7.1382
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 1.


In [23]:
# Train the GRU model with the same data, epoch budget and early-stopping
# callback as the LSTM model so the two histories are comparable.
history2 = model2.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 30ms/step - accuracy: 0.0310 - loss: 7.1717 - val_accuracy: 0.0344 - val_loss: 6.8200
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.0352 - loss: 6.4761 - val_accuracy: 0.0486 - val_loss: 6.8190
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.0532 - loss: 6.1811 - val_accuracy: 0.0505 - val_loss: 6.8262
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0622 - loss: 5.9617 - val_accuracy: 0.0628 - val_loss: 6.8110
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.0703 - loss: 5.7575 - val_accuracy: 0.0672 - val_loss: 6.8554
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - accuracy: 0.0909 - loss: 5.4238 - val_accuracy: 0.0670 - val_loss: 6.9358
Epoch 7/50
[1m6

In [26]:
## function to predict the next word

def predict_next_word(model, tokenizer, text, max_seq_len):
    """Return the word the model rates most likely to follow ``text``.

    Parameters
    ----------
    model
        Object with a ``predict(batch)`` method returning one row of
        per-word scores for the single padded context passed in.
    tokenizer
        Fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``
        (``index_word`` is used when available).
    text : str
        Seed text; only its last ``max_seq_len - 1`` tokens are used.
    max_seq_len : int
        Length of the training n-grams (context plus target word), i.e. the
        model's input length plus one.

    Returns
    -------
    str or None
        The predicted word, or ``None`` when the winning index has no word
        attached (for example the padding index 0).

    Raises
    ------
    ValueError
        If ``max_seq_len`` leaves no room for at least one context token.
    """
    context_len = max_seq_len - 1
    if context_len < 1:
        raise ValueError("max_seq_len must be at least 2")

    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep exactly the window the model was trained on. The original sliced
    # max_seq_len tokens (one too many) and relied on pad_sequences to drop
    # the extra one again.
    token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')

    predicted = model.predict(padded)
    # Single-sample batch: reduce row 0 to a plain int instead of comparing a
    # one-element array inside an `if`.
    predicted_index = int(np.argmax(predicted[0]))

    # Direct reverse lookup instead of scanning the whole vocabulary.
    index_to_word = getattr(tokenizer, "index_word", None)
    if index_to_word is None:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return index_to_word.get(predicted_index)

In [27]:
input_text = "to be or not to be"
# Recover the n-gram length from the model itself: its input width is the
# context length, and the n-gram is one token longer (the predicted word).
max_seq_len = 1 + model1.input_shape[1]
next_word1 = predict_next_word(
    model=model1,
    tokenizer=Tokenizer,
    text=input_text,
    max_seq_len=max_seq_len,
)
print(next_word1)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 346ms/step
a
