In [1]:
import os
import random
import string
import sys

import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.optimizers import RMSprop

## Load Text

In [2]:
# Load the book text. The source file must be saved as UTF-8.
filename = "../input/text-generation/236-0.txt"
# Use a context manager so the file handle is closed as soon as the text is read.
with open(filename, 'r', encoding='utf-8') as book_file:
    raw_text = book_file.read()
# Drop the first 2096 characters so the text starts at the main body
# (offset chosen by the author for this particular file).
raw_text = raw_text[2096:]
print(raw_text[0:500])

## Data Pre-processing

In [3]:
def clean_text(txt):
    """Return `txt` without ASCII punctuation or digits, in lower case.

    Punctuation (as listed in `string.punctuation`) is removed first, the
    remainder is lower-cased, and finally every character for which
    `str.isdigit()` is true is dropped. Whitespace is left untouched.
    """
    kept = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept).lower()
    return "".join(filter(lambda ch: not ch.isdigit(), lowered))

# Normalise the corpus in place (note: this overwrites the uncleaned text),
# then preview the first 500 cleaned characters.
raw_text = clean_text(raw_text)
print(raw_text[:500])

In [4]:
# Vocabulary: every distinct character that occurs in the cleaned text,
# in sorted order. Its length is used as n_vocab.
chars = sorted(set(raw_text))
chars

In [5]:
# Summarise the data: total number of characters in the corpus and
# number of distinct characters (vocabulary size).
n_chars, n_vocab = len(raw_text), len(chars)
print("Corpus length: ", n_chars)
print("Total Vocab: ", n_vocab)

In [6]:
# Lookup table from character to integer id: each unique character gets
# its position in the sorted `chars` list.
#
# Illustrative layout (actual ids depend on the corpus):
#   character    index
#       a          2

char_to_int = {ch: idx for idx, ch in enumerate(chars)}
char_to_int

In [7]:
# Reverse lookup table from integer id back to character.
#
# Illustrative layout (actual ids depend on the corpus):
#   index    character
#     2          a

int_to_char = dict(enumerate(chars))
int_to_char

In [8]:
seq_length = 60     # Number of characters in each input sequence
step = 10           # Stride between window starts (step=1 would shift by a single letter)

# Start offset of every window that still has a following character.
window_starts = range(0, n_chars - seq_length, step)

# X values: the seq_length-character windows.
sentences = [raw_text[start: start + seq_length] for start in window_starts]
# Y values: the single character that immediately follows each window.
next_chars = [raw_text[start + seq_length] for start in window_starts]

n_patterns = len(sentences)
print('Number of sequences:', n_patterns)

In [9]:
# Show the first training pair: an input window and the character to predict.
# Conceptually:
#   input:  I love Cricke
#   output: t
first_sentence, first_target = sentences[0], next_chars[0]
print(first_sentence)
print("Output: ", first_target)

## Vectorization

In [10]:
# One-hot encode the training data.
# Input tensor layout is [samples, time steps, features]:
#   samples    = number of sentences (n_patterns)
#   time steps = sequence length (seq_length)
#   features   = number of characters in our vocab (n_vocab)
# x[i, t, k] is True when character t of sentence i has id k.
# y[i, k] is True when the character following sentence i has id k.

# Use the builtin `bool` dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), seq_length, n_vocab), dtype=bool)
y = np.zeros((len(sentences), n_vocab), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_to_int[char]] = 1
    y[i, char_to_int[next_chars[i]]] = 1

print(x.shape)
print(y.shape)

print(y[0:3])

## Build LSTM model

In [11]:
# Single LSTM layer over one-hot character windows, followed by a softmax
# over the vocabulary that predicts the next character.
mymodel = Sequential()
mymodel.add(LSTM(128, input_shape=(seq_length, n_vocab)))
mymodel.add(Dense(n_vocab, activation='softmax'))

# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = RMSprop(learning_rate=0.01)
# Targets are one-hot rows, hence categorical cross-entropy.
mymodel.compile(loss='categorical_crossentropy', optimizer=optimizer)
mymodel.summary()

## Define the Checkpoint

In [12]:

# Save a checkpoint at the end of every epoch in which the training loss
# improves on the best value seen so far (monitor='loss', mode='min').
# The epoch number and loss are formatted into the file name.
filepath="saved_weights/saved_weights-{epoch:02d}-{loss:.4f}.hdf5"

# Make sure the target directory exists; not every Keras version creates it,
# and a missing directory would make the first checkpoint save fail.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')

callbacks_list = [checkpoint]

In [13]:
# Train on the one-hot data; the checkpoint callback runs after each epoch.
mymodel.fit(
    x,
    y,
    epochs=5,
    batch_size=128,
    callbacks=callbacks_list,
)
# Persist the trained model.
# NOTE(review): the file name says "50epochs" but training above runs 5 epochs;
# confirm which is intended before relying on the name.
mymodel.save('my_saved_weights_jungle_book_50epochs.h5')

## Generate characters function 

In [15]:
# Generation starts from a seed of seq_length characters. At every step the
# network outputs one probability per vocabulary character, and the next
# character is drawn at random from that distribution (it is not an argmax).

def sample(preds, temperature=1.0):
    """Draw one index at random from a predicted probability distribution.

    Args:
        preds: 1-D array-like of non-negative probabilities, e.g. one row of
            softmax output.
        temperature: Positive scale applied to the log-probabilities before
            re-normalising. The default 1.0 samples from `preds` as given
            (after re-normalising in float64); values below 1.0 sharpen the
            distribution and values above 1.0 flatten it.

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    preds = np.asarray(preds, dtype='float64')
    # Zero probabilities legitimately map to -inf here; silence the warning.
    with np.errstate(divide='ignore'):
        log_preds = np.log(preds)
    scaled = log_preds / temperature
    # Shift by the maximum so exp() cannot underflow every entry at low temperature.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    # Re-normalise in float64 so the probabilities sum to 1 for multinomial().
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial trial yields a one-hot row; argmax recovers the drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [16]:
# Choose a random start position such that a full seq_length window fits
# inside the text.
start_index = random.randint(0, n_chars - seq_length - 1)
print(start_index)

# The seed window drives the first prediction; `generated` accumulates the
# seed plus every character predicted afterwards.
sentence = raw_text[start_index: start_index + seq_length]
generated = sentence
print(generated)

In [17]:
print('Seed for our text prediction: "' + sentence + '"')


# Generate 100 characters (spaces included), one prediction at a time.
for char_idx in range(100):
    # One-hot encode the current window as a batch of size one.
    x_pred = np.zeros((1, seq_length, n_vocab))
    for position, seed_char in enumerate(sentence):
        x_pred[0, position, char_to_int[seed_char]] = 1.

    # Predict the next-character distribution and draw from it.
    preds = mymodel.predict(x_pred, verbose=0)[0]
    next_index = sample(preds)
    next_char = int_to_char[next_index]

    # Slide the window forward by one character and record the output.
    sentence = sentence[1:] + next_char
    generated += next_char

    # Stream the character immediately, without a trailing newline.
    print(next_char, end='', flush=True)
print()
