In [28]:
# another try on text prediction
# https://medium.com/analytics-vidhya/build-a-simple-predictive-keyboard-using-python-and-keras-b78d3c88cffb

**Load libraries**

In [10]:
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import heapq
import requests

**Where you find the text**

In [8]:
url = "https://www.gutenberg.org/files/1661/1661-0.txt"

In [11]:
response = requests.get(url)
text = response.text

In [12]:
text[:100]

'ï»¿The Project Gutenberg eBook of The Adventures of Sherlock Holmes, by Arthur Conan Doyle\r\n\r\nThis e'

In [29]:
print('text length:', len(text))

text length: 607430


**Tokenizer**

In [14]:
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)

In [16]:
words[:10]

['ï',
 'The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'The',
 'Adventures',
 'of',
 'Sherlock']

In [17]:
unique_words = np.unique(words)
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))

In [18]:
WORD_LENGTH = 5
prev_words = []
next_words = []
for i in range(len(words) - WORD_LENGTH):
    prev_words.append(words[i:i + WORD_LENGTH])
    next_words.append(words[i + WORD_LENGTH])
print(prev_words[0])
print(next_words[0])

['ï', 'The', 'Project', 'Gutenberg', 'eBook']
of


**Prepare the matrix input**

In [19]:
X = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros((len(next_words), len(unique_words)), dtype=bool)
for i, each_words in enumerate(prev_words):
    for j, each_word in enumerate(each_words):
        X[i, j, unique_word_index[each_word]] = 1
    Y[i, unique_word_index[next_words[i]]] = 1

In [20]:
print(X[0][0])

[False False False ... False False  True]


**Construct model**

In [21]:
model = Sequential()
model.add(LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))))
model.add(Dense(len(unique_words)))
model.add(Activation('softmax'))

**Fit the model**

In [22]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history = model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=2, shuffle=True).history

  super(RMSprop, self).__init__(name, **kwargs)


Epoch 1/2
Epoch 2/2


**Prepare the input for prediction**

In [23]:
def prepare_input(text):
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    for t, word in enumerate(text.split()):
        print(word)
        x[0, t, unique_word_index[word]] = 1
    return x

prepare_input("It is not a lack".lower())

it
is
not
a
lack


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

Select top 3 words

In [24]:
def sample(preds, top_n=3):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [25]:
def predict_completions(text, n=3):
    if text == "":
        return("0")
    x = prepare_input(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    return [unique_words[idx] for idx in next_indices]

In [26]:
q =  "Your life will never be the same again"
print("correct sentence: ",q)

correct sentence:  Your life will never be the same again


**Guess or prediction**

In [27]:
seq = " ".join(tokenizer.tokenize(q.lower())[0:5])
print("Sequence: ",seq)
print("next possible words: ", predict_completions(seq, 5))

Sequence:  your life will never be
your
life
will
never
be
next possible words:  ['so', 'â', 'of', 'no', 'to']
