In [28]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [None]:
data = gutenberg.raw('shakespeare-hamlet.txt')
with open('hamlet.txt','w') as file:
  file.write(data)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

with open('hamlet.txt','r') as file:
  text = file.read().lower()


'''Creating text indexes '''


tokenization = Tokenizer()
tokenization.fit_on_texts([text])
total_words = len(tokenization.word_index) + 1
total_words

4818

In [None]:
'''
Splitting of lines and making n_gram_sequences
'''

input_sequences = []
for line in text.split('\n'):
  token_list = tokenization.texts_to_sequences([line])[0]
  for i in range(1, len(token_list)):
    n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)


In [None]:
max_sequence_len = max(len(x) for x in input_sequences)
max_sequence_len

14

In [None]:
padded_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
padded_sequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]], dtype=int32)

In [None]:
x,y=padded_sequences[:,:-1],padded_sequences[:,-1]

In [None]:
import tensorflow as tf
y = tf.keras.utils.to_categorical(y,num_classes=total_words)

In [None]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size = 0.2)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
history=model.fit(x_train,y_train,epochs=100,verbose=1,callbacks=[early_stopping])

Epoch 1/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 7ms/step - accuracy: 0.0300 - loss: 7.1418
Epoch 2/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.0354 - loss: 6.4471
Epoch 3/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.0404 - loss: 6.3134
Epoch 4/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.0476 - loss: 6.2011
Epoch 5/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.0543 - loss: 6.0813
Epoch 6/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - accuracy: 0.0588 - loss: 5.9274
Epoch 7/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 0.0733 - loss: 5.7685
Epoch 8/100
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.0785 - loss: 5.6191
Epoch 9/100
[1m644/644[0m 

In [None]:
def predict_next_word(model,tokenization,text,max_sequence_len):
  token_list = tokenization.texts_to_sequences([text])[0]
  if len(token_list) >= max_sequence_len - 1:
    token_list = token_list[-(max_sequence_len - 1):]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
  predicted = model.predict([token_list],verbose=0)
  predicted_word_index = np.argmax(predicted)
  for word,index in tokenization.word_index.items():
    if index == predicted_word_index:
      return word
  return None


In [None]:
input_text=str('Thou art a Scholler; speake to it')
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenization,input_text,max_sequence_len)
print(next_word)

horatio


In [None]:
model.save('lstm_next_word.h5')
import pickle
with open('tokenizer.pickle','wb') as handle:
  pickle.dump(tokenization,handle,protocol=pickle.HIGHEST_PROTOCOL)



ModuleNotFoundError: No module named 'streamlit'