In [108]:
import nltk
import pandas as pd

nltk.download('gutenberg')
from nltk.corpus import gutenberg

data = gutenberg.raw('shakespeare-hamlet.txt')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\vinod\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [109]:
with open ('hamlet.txt','w') as file:
    file.write(data)

In [110]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
with open('hamlet.txt','r') as f :
    data = f.read().lower()

In [113]:
tokenizer = Tokenizer()

In [114]:
tokenizer.fit_on_texts([data])
vocab_size = len(tokenizer.word_index)+1



In [115]:
def encoding(data):
    data = [x for x in data.split('\n') if len(x) > 0]
    data = tokenizer.texts_to_sequences(data)
    return data


In [116]:
data_seq = encoding(data)

In [117]:
max_len = max([len(x) for x in data_seq])

In [118]:
data_seq = pad_sequences(data_seq,maxlen=max_len)

In [119]:
X,y = data_seq[:,:-1],data_seq[:,-1]

In [120]:
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=vocab_size)


In [121]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=42)

In [122]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

model = Sequential()
model.add(Embedding(vocab_size,128,input_length = max_len))
model.add(LSTM(128,return_sequences = True))
model.add(Dropout(0.25))
model.add(LSTM(64))
model.add(Dense(vocab_size,activation = 'softmax'))




In [123]:
model.summary()

In [124]:
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy']
)

In [155]:
from tensorflow.keras.callbacks import EarlyStopping
earlyStopping = EarlyStopping(monitor = 'val_loss', patience = 10, restore_best_weights = True)

history = model.fit(
    X_train,y_train,
    epochs = 200,
    validation_data = (X_test,y_test),
    callbacks = [earlyStopping]
)

Epoch 1/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9178 - loss: 0.5354 - val_accuracy: 0.0343 - val_loss: 15.3730
Epoch 2/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.9178 - loss: 0.5214 - val_accuracy: 0.0383 - val_loss: 15.3541
Epoch 3/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9202 - loss: 0.4893 - val_accuracy: 0.0353 - val_loss: 15.3657
Epoch 4/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.9083 - loss: 0.5453 - val_accuracy: 0.0413 - val_loss: 15.3874
Epoch 5/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9109 - loss: 0.5171 - val_accuracy: 0.0363 - val_loss: 15.3810
Epoch 6/200
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9145 - loss: 0.5124 - val_accuracy: 0.0353 - val_loss: 15.4223
Epoch 7/200
[1m93/93

In [None]:
rev_voc = {value:key for (key,value) in tokenizer.word_index.items()}
rev_voc[0] = ''
import numpy as np
def prediction(text):
    ip = text
    if (len(text) < max_len):
        return None
    text.lower()
    text = tokenizer.texts_to_sequences([text])
    text = pad_sequences(text,maxlen=max_len)
    pred = np.array(model.predict(text))
    m = np.argmax(pred,axis=1)[0]
    output = rev_voc[m]
    print(ip,output)
    return pred

prediction("O farwel honest Soldier, who hath relieu'd")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
O farwel honest Soldier, who hath relieu'd you


array([[3.7746931e-10, 7.0314898e-05, 4.1044569e-03, ..., 4.0931417e-10,
        4.0141110e-10, 4.2029186e-10]], dtype=float32)

In [157]:
model.save("word_predictor.keras")

In [None]:
prediction("To be or not to be")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
To be or not to be another


array([[4.4030110e-10, 5.9888463e-09, 3.7455271e-07, ..., 3.9820788e-10,
        4.6224025e-10, 5.3495164e-10]], dtype=float32)

In [160]:
prediction("Mar. Horatio saies, 'tis but our")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Mar. Horatio saies, 'tis but our fantasie


array([[6.4814570e-10, 3.8222112e-10, 1.4966837e-08, ..., 6.7168310e-10,
        6.9091538e-10, 7.6504647e-10]], dtype=float32)