In [54]:
import re
import time
import zipfile

import numpy as np
import pandas as pd
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Input, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [55]:
#load and process data
def load_data(file_path, encoding='ISO-8859-1'):
  """Read the first file stored in a zip archive and return it as text.

  Args:
    file_path: Path to the .zip archive.
    encoding: Codec used to decode the member's bytes (default 'ISO-8859-1').

  Returns:
    The decoded contents of the first non-directory member, in archive order.

  Raises:
    IndexError: If the archive contains no file entries.
  """
  with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # An archive may list a folder entry first; reading that would yield ''
    # instead of the corpus, so only real file entries are considered.
    file_entries = [info for info in zip_ref.infolist() if not info.is_dir()]
    if not file_entries:
      raise IndexError(f"no file entries found in archive: {file_path!r}")
    raw_bytes = zip_ref.read(file_entries[0])
  return raw_bytes.decode(encoding)

In [56]:
def create_sequences(text, tokenizer, max_sequence_len, sliding_window=False):
  """Build n-gram training sequences from each '.'-separated sentence.

  Args:
    text: Raw corpus; it is split into sentences on '.'.
    tokenizer: Object exposing `texts_to_sequences(list_of_str)` that returns
      one list of integer ids per input string.
    max_sequence_len: Maximum length (context + target) of a returned sequence.
    sliding_window: When False (default), only the sentence prefixes of length
      2..max_sequence_len are returned, so words beyond the first
      `max_sequence_len` of a sentence never appear. When True, every position
      in the sentence becomes a target, preceded by up to
      `max_sequence_len - 1` context tokens.

  Returns:
    A list of lists of token ids, each of length 2..max_sequence_len.
  """
  input_sequences = []
  # A sequence needs at least one context token and one target token.
  if max_sequence_len < 2:
    return input_sequences
  for line in text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    if sliding_window:
      last_end = len(token_list)
    else:
      # Longer prefixes would be discarded anyway, so stop at the cap.
      last_end = min(len(token_list), max_sequence_len)
    for end in range(2, last_end + 1):
      # start stays 0 in prefix mode because end never exceeds the cap.
      start = max(0, end - max_sequence_len)
      input_sequences.append(token_list[start:end])
  return input_sequences

In [57]:
# Number of classes handed to to_categorical() inside preprocess_sequences(),
# which reads this name as a notebook-level global.
# The +1 presumably leaves room for index 0 (padding) - TODO confirm.
# NOTE(review): `tokenizer` is first assigned in the "tokenize text" cell
# further down, so on a fresh kernel this cell raises NameError when the
# notebook is run top to bottom.
vocab_size = len(tokenizer.word_index) + 1

In [58]:
#padding the sequences
def preprocess_sequences(input_sequences, max_sequence_len, num_classes=None):
  """Pad the n-gram sequences and split them into inputs and one-hot targets.

  Args:
    input_sequences: List of token-id lists.
    max_sequence_len: Length every sequence is left-padded ('pre') to.
    num_classes: Width of the one-hot target vectors. When omitted, the
      notebook-level global `vocab_size` is used, as before.

  Returns:
    (X, y): X holds every column of the padded array except the last; y is the
    last column, one-hot encoded to `num_classes` columns.
  """
  if num_classes is None:
    # Backward-compatible fallback; passing num_classes explicitly avoids
    # depending on which cells have already been executed.
    num_classes = vocab_size
  padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
  X = padded[:, :-1]
  y = to_categorical(padded[:, -1], num_classes=num_classes)
  return X, y

In [65]:
#model building
def build_model(vocab_size, max_sequence_len):
  """Build and compile the stacked-LSTM next-word classifier.

  Args:
    vocab_size: Size of the embedding table and of the softmax output.
    max_sequence_len: Length of a full padded sequence (context + target);
      the model consumes the `max_sequence_len - 1` context tokens.

  Returns:
    A compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
  """
  model = Sequential()
  # Declare the input shape with an explicit Input layer: Embedding's
  # `input_length` argument is deprecated in Keras 3.
  model.add(Input(shape=(max_sequence_len - 1,)))
  model.add(Embedding(vocab_size, 100))
  # The first LSTM emits the whole sequence so the second LSTM can consume it.
  model.add(LSTM(150, return_sequences=True))
  model.add(LSTM(100))
  model.add(Dense(vocab_size, activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [66]:
#load data
# Absolute path of the zipped corpus; load_data() returns its first file as text.
file_path = '/content/archive (1).zip'
text = load_data(file_path=file_path)

In [67]:
#tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# One slot per known word plus one extra, since the count is word_index + 1.
total_words = len(tokenizer.word_index) + 1
# preprocess_sequences() reads the notebook-level `vocab_size`; bind it here,
# right after fitting, so it can never be stale relative to this tokenizer.
vocab_size = total_words

In [68]:
#create input sequences
MAX_NGRAM_LEN = 5  # longest sequence (context words + target word) kept per sentence
input_sequences = create_sequences(text, tokenizer, max_sequence_len=MAX_NGRAM_LEN)
# Fail with a readable message instead of max()'s "empty sequence" error.
if not input_sequences:
  raise ValueError("no training sequences were produced; check the corpus and tokenizer")
# Pad to the longest sequence actually produced, which is at most MAX_NGRAM_LEN.
max_sequence_len = max(len(sequence) for sequence in input_sequences)

In [69]:
#preprocess sequences
# X: padded context tokens; y: one-hot encoded target word.
X, y = preprocess_sequences(
  input_sequences=input_sequences,
  max_sequence_len=max_sequence_len,
)


In [70]:
#build model
# The softmax width is the vocabulary size computed right after tokenizing.
model = build_model(
  vocab_size=total_words,
  max_sequence_len=max_sequence_len,
)



In [71]:
#train model
# Stop once the training loss has not improved for 5 epochs. Without
# restore_best_weights the model would keep the weights from the last epoch
# run, i.e. up to 5 epochs past the best loss seen.
# NOTE(review): only the training loss is monitored; no validation data is
# passed to fit(), so this does not guard against overfitting.
early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
history = model.fit(X, y, epochs=100, verbose=1, callbacks=[early_stop])

Epoch 1/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 34s 41ms/step - accuracy: 0.0889 - loss: 7.0735
Epoch 2/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 39s 38ms/step - accuracy: 0.1198 - loss: 5.7895
Epoch 3/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 41s 38ms/step - accuracy: 0.1512 - loss: 5.4148
Epoch 4/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 40s 37ms/step - accuracy: 0.1718 - loss: 5.0843
Epoch 5/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 44s 41ms/step - accuracy: 0.1866 - loss: 4.8683
Epoch 6/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 41s 41ms/step - accuracy: 0.2034 - loss: 4.6090
Epoch 7/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 31s 40ms/step - accuracy: 0.2119 - loss: 4.4539
Epoch 8/100
761/761 ━━━━━━━━━━━━━━━━━━━━ 40s 39ms/step - accuracy: 0.2193 - loss: 4.3037
Epoch 9/100
[1m

In [72]:
#prediction function
def predict_next_word(seed_text, model, tokenizer, max_sequence_len, n_words=5):
  """Greedily extend `seed_text` with the model's most likely next words.

  Args:
    seed_text: Starting text.
    model: Trained model whose predict() returns one probability per word id.
    tokenizer: Fitted tokenizer providing texts_to_sequences() and index_word.
    max_sequence_len: Sequence length used in training; the model is fed the
      last `max_sequence_len - 1` tokens, left-padded.
    n_words: How many words to append (default 5, the previous fixed count).

  Returns:
    `seed_text` followed by the generated words, separated by spaces.
  """
  for _ in range(n_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    probabilities = model.predict(token_list, verbose=0)[0]
    # Index 0 is the padding id and has no entry in index_word, so picking it
    # used to raise KeyError; take the best class among the real word ids.
    predicted_word_index = int(np.argmax(probabilities[1:])) + 1
    predicted_word = tokenizer.index_word.get(predicted_word_index)
    if predicted_word is None:
      # No word is known for this id; stop instead of failing.
      break
    seed_text += " " + predicted_word
  return seed_text

In [75]:
#test prediction
# Extend a short prompt with the trained model and show the result.
seed_text = "What do you"
print(
  predict_next_word(
    seed_text=seed_text,
    model=model,
    tokenizer=tokenizer,
    max_sequence_len=max_sequence_len,
  )
)

What do you think of the weariness stood
