In [4]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import model_from_json
import nltk


In [2]:
# Read the whole corpus into memory, then preview its first 500 characters.
corpus_path = "data/holmes.txt"
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
  data = corpus_file.read()
print(data[:500])

*Project Gutenberg's Etext of Tom Swift And His Submarine Boat*

#4 in the Victor Appleton's Tom Swift Series

We name these Etext files as they are numbered in the books,

i.e. This is #4 in the series so the file name is 04tomxxx.xxx,

where the x's are place holders for editon # and file type such

as 04tom10.txt and 04tom10.zip, when we do a .htm, 04tom10h.htm





Copyright laws are changing all over the world, be sure to check

the copyright laws for your country before posting these files


In [3]:
#removal of special characters
def removeSpecialChar(data):
  """Strip symbols from ``data`` and squeeze repeated spaces.

  Every character that is not an ASCII letter, a digit or whitespace is
  deleted. Afterwards each run of space characters is collapsed to a single
  space. Tabs and newlines are left as they are, and leading/trailing
  whitespace is not trimmed.

  Args:
    data: text to clean.

  Returns:
    The cleaned text.
  """
  without_symbols = re.sub(r'[^a-zA-Z0-9\s]', "", data)
  return re.sub(" +", " ", without_symbols)
  

In [4]:
#Preprocessing
def preprocess(data):
  """Turn raw corpus text into a list of cleaned, lower-cased lines.

  The text is split on newlines. Each line is passed through
  ``removeSpecialChar``, stripped of surrounding whitespace and lower-cased.
  Lines that end up empty are dropped.

  Args:
    data: the raw corpus as a single string.

  Returns:
    A list of non-empty, lower-cased strings, one per surviving input line.
  """
  cleaned_lines = []
  for line in data.split("\n"):
    line = removeSpecialChar(line).strip()
    # Test the individual line, not the list: the original compared
    # len(sentences) > 0, which is true for every element, so blank lines
    # were never removed.
    if line:
      cleaned_lines.append(line.lower())
  return cleaned_lines

In [5]:
# Clean the raw corpus into a list of lower-cased lines with special characters removed.
# Despite the name, each element is a whole line of text, not a list of tokens.
Tokenized_sentence = preprocess(data)

In [6]:
#Tokenize words
# Fit the vocabulary on the cleaned lines; '<oov>' is the placeholder token for unknown words.
tokenizer = Tokenizer(oov_token ='<oov>')
tokenizer.fit_on_texts(Tokenized_sentence)
# Vocabulary size plus one: word indices start at 1 ('<oov>' is 1), so index 0 is
# unused by any word -- presumably it is the padding value; TODO confirm.
total_words = len(tokenizer.word_index) + 1


In [8]:
# Inspect the fitted word -> integer index mapping.
# NOTE(review): this displays the entire vocabulary; consider showing only a slice.
tokenizer.word_index

{'<oov>': 1,
 'the': 2,
 'and': 3,
 'of': 4,
 'to': 5,
 'a': 6,
 'in': 7,
 'i': 8,
 'he': 9,
 'was': 10,
 'that': 11,
 'it': 12,
 'his': 13,
 'you': 14,
 'with': 15,
 'had': 16,
 'for': 17,
 'as': 18,
 'her': 19,
 'she': 20,
 'but': 21,
 'at': 22,
 'not': 23,
 'is': 24,
 'on': 25,
 'be': 26,
 'him': 27,
 'my': 28,
 'have': 29,
 'they': 30,
 'by': 31,
 'said': 32,
 'this': 33,
 'me': 34,
 'all': 35,
 'from': 36,
 'were': 37,
 'which': 38,
 'so': 39,
 'or': 40,
 'one': 41,
 'if': 42,
 'there': 43,
 'we': 44,
 'no': 45,
 'when': 46,
 'an': 47,
 'would': 48,
 'their': 49,
 'what': 50,
 'them': 51,
 'who': 52,
 'been': 53,
 'out': 54,
 'are': 55,
 'up': 56,
 'then': 57,
 'could': 58,
 'do': 59,
 'will': 60,
 'into': 61,
 'more': 62,
 'your': 63,
 'now': 64,
 'man': 65,
 'very': 66,
 'little': 67,
 'upon': 68,
 'some': 69,
 'about': 70,
 'its': 71,
 'time': 72,
 'like': 73,
 'than': 74,
 'did': 75,
 'any': 76,
 'mr': 77,
 'other': 78,
 'see': 79,
 'well': 80,
 'know': 81,
 'before': 82,
 'do

In [2]:
#creating sequence
# Encode all cleaned lines in one call, then expand every encoded line into
# each of its prefixes of length two or more (the n-gram sequences).
sequences = []
for token_ids in tokenizer.texts_to_sequences(Tokenized_sentence):
  sequences.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

#padding seq
# Left-pad every prefix to the length of the longest one.
max_seq_len = max(len(seq) for seq in sequences)
input_seq = np.array(pad_sequences(sequences, maxlen=max_seq_len, padding='pre'))

NameError: name 'Tokenized_sentence' is not defined

In [1]:
# Split each padded row: every column except the last is the model input,
# the last column is the id of the word to predict.
X , labels = input_seq[:,:-1], input_seq[:, -1]
# Encode the target ids as categorical vectors over total_words classes.
# NOTE(review): this is presumably a dense (n_samples, total_words) array and may be
# memory-heavy for a large vocabulary -- confirm it fits in RAM.
Y = tf.keras.utils.to_categorical(labels, num_classes=total_words)

NameError: name 'input_seq' is not defined

In [None]:
# Display the encoded target array as a quick sanity check.
Y

In [None]:
from sklearn.model_selection import train_test_split

# First cut: 70% for training, 30% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.3, random_state=2)
# Second cut: halve the held-out part into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=2)



In [None]:
#LSTM model
# Next-word classifier: token ids -> 100-dimensional embeddings ->
# bidirectional LSTM (150 units per direction) -> softmax over the vocabulary.
model = Sequential()
# The embedding size (100) is an argument of Embedding itself. The original
# passed it to model.add() because of a misplaced parenthesis, which fails.
model.add(Embedding(total_words, 100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
optimizer = Adam(learning_rate=0.01)
# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Keep the History object: the plotting cell reads its loss/accuracy curves.
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_val,y_val), verbose=1)

In [None]:
#save the model
# Serialise the architecture to JSON under its own name. The original rebound
# `model` to the JSON string, destroying the trained model object.
model_json = model.to_json()
with open("sentComLstm.json", 'w') as file:
  file.write(model_json)

# to_json() describes the architecture only, so the trained weights are
# written to a separate file.
model.save_weights("sentComLstm.weights.h5")

In [None]:
# Rebuild the network from its JSON description.
with open("sentComLstm.json", 'r') as model_file:
  model_json = model_file.read()
model = model_from_json(model_json)

# The JSON holds the architecture only: without restoring weights the rebuilt
# model is freshly initialised and its predictions are meaningless.
weights_path = "sentComLstm.weights.h5"
if os.path.exists(weights_path):
  model.load_weights(weights_path)
else:
  print(f"Warning: {weights_path} not found; the reloaded model has untrained weights.")

In [None]:

# Training curves, one figure per metric. legend() is needed for the label=
# arguments to be drawn at all.
plt.plot(history.history['loss'], label='Train_loss')
plt.plot(history.history['val_loss'], label='Validation_loss')
plt.title('Train v/s Validation Loss')
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend()
plt.show()

plt.plot(history.history['accuracy'], label='Train_accuracy')
plt.plot(history.history['val_accuracy'], label='Validation_accuracy')
# Title corrected: this figure shows accuracy, not loss.
plt.title('Train v/s Validation Accuracy')
plt.ylabel("Accuracy")
plt.xlabel("Epochs")
plt.legend()
plt.show()

In [None]:
def predict_next(model, query_text):
  """Return up to five most probable next words for ``query_text``.

  Args:
    model: object with a ``predict`` method that maps a padded id array to a
      probability row over the vocabulary.
    query_text: the text to continue, as a plain string.

  Returns:
    A list of words ordered from most to least probable. It may hold fewer
    than five entries, because indices without a vocabulary word are skipped.
  """
  # texts_to_sequences takes a list of texts; the original called a
  # non-existent `text_to_sequence` method.
  token_ids = tokenizer.texts_to_sequences([query_text])[0]
  # Training inputs were the padded sequences minus their last column, so the
  # query is padded to max_seq_len - 1 to match that width.
  padded = pad_sequences([token_ids], maxlen=max_seq_len - 1, padding='pre')
  probabilities = model.predict(padded, verbose=1)[0]
  top_five_index = np.argsort(probabilities)[::-1][:5]
  # index_word is the inverse of word_index, which avoids scanning the whole
  # vocabulary for every predicted index.
  index_to_word = tokenizer.index_word
  return [index_to_word[int(i)] for i in top_five_index if int(i) in index_to_word]

In [None]:
def display_result(query_text):
  """Return ``query_text`` extended by each predicted next word.

  Asks ``predict_next`` (using the global ``model``) for candidate words and
  builds one "<query_text> <word>" string per candidate, in the same order.
  """
  candidates = predict_next(model, query_text)
  return [f'{query_text} {candidate}' for candidate in candidates]

In [None]:
# Decode one held-out sample back to text before predicting. X_test rows are
# padded token ids (0 is the padding value) and y_test rows are one-hot
# vectors, whereas display_result expects a plain string.
sample_idx = 10
query_text = " ".join(
    tokenizer.index_word[int(i)] for i in X_test[sample_idx] if i != 0)
expected_word = tokenizer.index_word.get(int(np.argmax(y_test[sample_idx])), '<oov>')
print("Expected result: ", expected_word, "Observed result: ", display_result(query_text))