In [None]:
import numpy as np
import heapq
import sys
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras
import pickle
import seaborn as sns
from pylab import rcParams

In [None]:
# Fail fast when TensorFlow reports no GPU device; the assertion message
# makes the failure self-explanatory instead of a bare AssertionError.
gpu_devices = tf.config.list_physical_devices('GPU')
assert len(gpu_devices) > 0, "No GPU is visible to TensorFlow; this notebook expects at least one."

In [None]:
# Seed NumPy and TensorFlow with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Plot appearance: apply the seaborn settings, then set the default figure size.
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = (12, 5)

In [None]:
# Read the text dataset and lower-case it, so the character vocabulary
# derived from `text` contains no upper-case letters.
# NOTE(review): the file is decoded with the platform's default encoding —
# confirm the dataset's encoding and pass `encoding=` explicitly if needed.

path = './dataset/train-text.txt'
with open(path) as corpus_file:  # context manager closes the handle after reading
    text = corpus_file.read().lower()

print(f"Text corpus length: {len(text)}")

In [None]:
# Collect every distinct character of the corpus (sorted), then build the
# two lookup tables: character -> index and index -> character.

chars = sorted(set(text))

char2idx = {ch: position for position, ch in enumerate(chars)}

idx2char = dict(enumerate(chars))

print(f"Unique character count: {len(chars)}")

In [None]:
# Cut the corpus into windows of SEQUENCE_LENGTH characters.
# Consecutive windows start STEPS characters apart.
# For every window, also record the single character that follows it.

SEQUENCE_LENGTH = 40
STEPS = 3

window_starts = range(0, len(text) - SEQUENCE_LENGTH, STEPS)

sentences = [text[start : start + SEQUENCE_LENGTH] for start in window_starts]
next_chars = [text[start + SEQUENCE_LENGTH] for start in window_starts]

print(f"Number of training examples: {len(sentences)}")

In [None]:
# One-hot encode the windows (X) and the character following each one (y).
n_samples = len(sentences)
vocab_size = len(chars)

X = np.zeros((n_samples, SEQUENCE_LENGTH, vocab_size), dtype=np.float32)
y = np.zeros((n_samples, vocab_size), dtype=np.float32)

for row, (sentence, target_char) in enumerate(zip(sentences, next_chars)):
    for position, ch in enumerate(sentence):
        X[row, position, char2idx[ch]] = 1.
    y[row, char2idx[target_char]] = 1.

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

In [None]:
# Method to build LSTM model
# LSTM - 128 units (no input_shape is passed to the layer)
# Dense - one unit per unique character; activation=softmax

def build_lstm_model():
    """Assemble the uncompiled two-layer network.

    Returns
    -------
    tf.keras.Sequential
        An LSTM layer with 128 units followed by a softmax Dense layer with
        ``len(chars)`` outputs. The model is returned without being compiled.
    """
    lstm_model = tf.keras.Sequential()
    lstm_model.add(tf.keras.layers.LSTM(units=128))
    lstm_model.add(tf.keras.layers.Dense(len(chars), activation=tf.nn.softmax))

    return lstm_model

In [None]:
# Build and compile the model
# loss = categorical_crossentropy
# optimizer = Adam, learning rate LR
# metrics = accuracy

model = build_lstm_model()

LR = 1e-2

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)

In [None]:
# Train the model and keep the `.history` attribute of the fit result,
# which the plotting cells index by metric name.

BATCH_SIZE = 128
EPOCHS = 30

fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.05,
    shuffle=True,
)
training_run = model.fit(X, y, **fit_options)
history = training_run.history

In [None]:
# Training and validation loss, one curve each, drawn in legend order.
for metric_key in ('loss', 'val_loss'):
    plt.plot(history[metric_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left');

In [None]:
# Training and validation accuracy, one curve each, drawn in legend order.
for metric_key in ('accuracy', 'val_accuracy'):
    plt.plot(history[metric_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left');

In [None]:
# Method to preprocess the input
def preprocess_inp(text):
    """One-hot encode `text` into a single-sample input array.

    Parameters
    ----------
    text : str
        Characters to encode. If it is longer than SEQUENCE_LENGTH, only the
        last SEQUENCE_LENGTH characters are encoded. If it is shorter, the
        characters fill the leading positions and the remaining rows stay
        all-zero.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(1, SEQUENCE_LENGTH, len(chars))`` with a
        single 1 in each encoded position.

    Raises
    ------
    KeyError
        If a character of the encoded window is not a key of ``char2idx``.
    """
    x = np.zeros((1, SEQUENCE_LENGTH, len(chars)), dtype=np.float32)

    # Encoding more than SEQUENCE_LENGTH characters would index past the
    # array and raise IndexError, so keep only the trailing window.
    window = text[max(len(text) - SEQUENCE_LENGTH, 0):]

    for position, char in enumerate(window):
        x[0, position, char2idx[char]] = 1.

    return x


# Method to sample top_n outputs
def sample(preds, top_n=3):
    """Return the indices of the `top_n` largest entries of `preds`, best first.

    The values are ranked directly. Passing them through log/exp and dividing
    by their sum does not change the ordering of probabilities, and taking the
    log of an entry that is exactly 0 emits a divide-by-zero RuntimeWarning,
    so neither step is performed.

    Parameters
    ----------
    preds : array-like
        Scores to rank; converted to a float32 NumPy array.
    top_n : int, default 3
        Number of indices to return.

    Returns
    -------
    list of int
        Up to `top_n` indices into `preds`, ordered from highest score to
        lowest.
    """
    preds = np.asarray(preds).astype(np.float32)

    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [None]:
# Method to predict output text repeatedly based on the input text
def predict_repeat(text, max_chars=100):
    """Greedily extend `text` one character at a time until a space is produced.

    Each step encodes the current window with ``preprocess_inp``, asks the
    model for the next-character scores, takes the single best index from
    ``sample`` and appends the matching character.

    Parameters
    ----------
    text : str
        Seed window fed to the model.
    max_chars : int or None, default 100
        Upper bound on the number of generated characters, so the loop ends
        even if the model never predicts a space. ``None`` removes the bound.

    Returns
    -------
    str
        The generated characters. The terminating space is included when one
        was produced before reaching `max_chars`.
    """
    completion = ''

    while max_chars is None or len(completion) < max_chars:
        x = preprocess_inp(text)
        # verbose must be the integer 0 (silent), not a list.
        preds = model.predict(x, verbose=0)[0]
        next_idx = sample(preds, top_n=1)[0]
        next_char = idx2char[next_idx]

        # Slide the window: drop the oldest character, append the new one.
        text = text[1:] + next_char
        completion += next_char

        # A space is the only stop signal; it stays in the returned text.
        if next_char == ' ':
            break

    return completion
        
def predict_completions(text, n=3):
    """Return `n` alternative completions for `text`.

    The model is queried once for the next-character scores of `text`. For
    each of the `n` best indices reported by ``sample``, the matching
    character is appended to the window (with the oldest character dropped)
    and ``predict_repeat`` generates the rest.

    Returns a list of strings, each made of the candidate character followed
    by the text returned by ``predict_repeat``.
    """
    encoded = preprocess_inp(text)
    first_step_scores = model.predict(encoded, verbose=0)[0]

    suggestions = []
    for candidate_idx in sample(first_step_scores, n):
        lead_char = idx2char[candidate_idx]
        shifted_window = text[1:] + lead_char
        suggestions.append(lead_char + predict_repeat(shifted_window))
    return suggestions

In [None]:
# Sample sentences for the completion demo. The demo loop lower-cases the
# opening characters of each quote and passes them to predict_completions.
quotes = [
    "It is not a lack of love, but a lack of friendship that makes unhappy marriages.",
    "That which does not kill us makes us stronger.",
    "I'm not upset that you lied to me, I'm upset that from now on I can't believe you.",
    "And those who were seen dancing were thought to be insane by those who could not hear the music.",
    "It is hard enough to remember my opinions, without also remembering my reasons for them!"
]

In [None]:
# Print the top three completions for the opening of each quote.
# The prompt is cut with SEQUENCE_LENGTH rather than a literal 40, so it
# stays the same length as the training windows if the constant changes.
for q in quotes:
    seq = q[:SEQUENCE_LENGTH].lower()
    print(seq)
    print(predict_completions(seq, 3))
    print()