In [14]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import nltk
import re

In [15]:
data_file = 'holmes.txt'
with open(data_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

In [16]:
data[:100]

"*Project Gutenberg's Etext of Tom Swift And His Submarine Boat*\n\n#4 in the Victor Appleton's Tom Swi"

In [17]:
data = data[:200000]

PREPROCESSING

In [18]:
def remove_characters(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(' +', ' ', text)
    return text

In [19]:
def preprocessing(data) -> 'list':
    sentences = data.split('\n')
    for i in range(len(sentences)):
        sentences[i] = remove_characters(sentences[i])
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    tokenized = []
    for sentence in sentences:
        sentence = sentence.lower()
        tokenized.append(sentence)
    return tokenized
    
tokenized_sentences = preprocessing(data)

In [20]:
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tokenized_sentences)
total_words = len(tokenizer.word_index) + 1

In [21]:
input_sequences = []
for line in tokenized_sentences:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

In [22]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

In [23]:
X,labels = input_sequences[:,:-1],input_sequences[:,-1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [24]:
from sklearn.model_selection import train_test_split
X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

TRAINING USING LSTM

In [25]:
model = Sequential()
model.add(Embedding(total_words, 100)) 
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)

Epoch 1/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 36ms/step - accuracy: 0.0816 - loss: 6.4157 - val_accuracy: 0.1219 - val_loss: 5.6868
Epoch 2/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - accuracy: 0.1493 - loss: 5.1244 - val_accuracy: 0.1408 - val_loss: 5.6593
Epoch 3/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 39ms/step - accuracy: 0.1885 - loss: 4.4672 - val_accuracy: 0.1427 - val_loss: 5.8203
Epoch 4/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 41ms/step - accuracy: 0.2326 - loss: 3.8947 - val_accuracy: 0.1380 - val_loss: 6.1859
Epoch 5/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 42ms/step - accuracy: 0.2553 - loss: 3.7844 - val_accuracy: 0.1352 - val_loss: 6.3968
Epoch 6/50
[1m794/794[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 42ms/step - accuracy: 0.3272 - loss: 3.0706 - val_accuracy: 0.1358 - val_loss: 6.6664
Epoch 7/50
[1m7

In [27]:
from tensorflow.keras.models import model_from_json

model_json = model.to_json()
with open("lstm_model.json", "w") as json_file:
    json_file.write(model_json)

In [28]:
from tensorflow.keras.models import model_from_json

with open("lstm_model.json", "r") as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)



In [29]:
def predict_top_five_words(model, tokenizer, seed_text):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    top_five_indexes = np.argsort(predicted[0])[::-1][:5]
    top_five_words = []
    for index in top_five_indexes:
        for word, idx in tokenizer.word_index.items():
            if idx == index:
                top_five_words.append(word)
                break
    return top_five_words

In [40]:

seed_text = "I have"
print(predict_top_five_words(model, tokenizer, seed_text))


['been', 'any', 'it', 'a', 'to']


In [36]:
seed_text = "what is"
print(predict_top_five_words(model, tokenizer, seed_text))

['no', 'the', 'mr', 'that', 'it']


In [37]:
seed_text = "How are"
print(predict_top_five_words(model, tokenizer, seed_text))

['you', 'he', 'capable', 'we', 'no']
