In [1]:
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation, Bidirectional
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
import sys
import io
import os

import numpy as np

import re

from os import listdir
from os.path import isfile, join

Using TensorFlow backend.


In [2]:
def clean_text(text):
    '''Lower-case text, expand common English contractions and strip punctuation.

    The substitution rules are applied in the order listed, so a rule only
    ever sees what the earlier rules left behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Every "'s" is expanded to " is", which means
        possessives are expanded too ("john's" -> "john is").
    '''
    text = text.lower()

    # Ordered (pattern, replacement) pairs.
    rules = (
        (r"i'm", "i am"),
        # The generic "'s" rule already turns he's/she's/it's/that's/what's/
        # where's/how's into "<word> is"; separate rules for those words could
        # never match after it (and the old "what's" rule mapped to the wrong
        # word), so they are not listed.
        (r"'s", " is"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"'d", " would"),
        (r"won't", "will not"),
        (r"can't", "cannot"),
        (r"n't", " not"),
        (r"n'", "ng"),
        (r"'bout", "about"),
        (r"'til", "until"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    # The hyphen is escaped so it is matched literally; unescaped, "'-(" is a
    # character range and hyphens were silently left in the text.
    text = re.sub(r"[”“’‘'\-()\"#/@;:<>{}`+=~|!?]", "", text)

    return text

In [3]:
### Read/process inputs
input_path = './raw_data/'
all_data_sents = []

# listdir() returns entries in arbitrary order; sorting makes the order in
# which the files are concatenated into the corpus reproducible across runs.
files = sorted(join(input_path, f) for f in listdir(input_path) if isfile(join(input_path, f)))
for filename in files:
    if filename.endswith('.txt'):
        with open(filename, encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()
        # Collapse blank lines, drop double quotes, ellipses and em dashes, and
        # put spaces around periods and commas that are followed by a space.
        # (The double quote was previously removed twice; once is enough.)
        text = text.replace('\n\n', '\n')\
            .replace('"', '')\
            .replace("...", "")\
            .replace("…", "")\
            .replace("—", "")\
            .replace(". ", " . ")\
            .replace(", ", " , ")
        # Flatten the remaining newlines so each file becomes a single line.
        text = ' '.join(text.split("\n"))
#             sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
        all_data_sents.append(text)

print(len(all_data_sents))

# Join the per-file texts into the single corpus string used downstream.
all_data_string = ' '.join(all_data_sents)
print(len(all_data_string))

# all_data_string = clean_text(all_data_string)
# print(len(all_data_string))

5
9597727


In [4]:
# Length, in characters, of each input window cut from the corpus.
maxlen = 50
# Stride, in characters, between the starts of consecutive windows.
step = 3

In [5]:
# Mini-batch size passed to model.fit().
BATCH_SIZE = 128

In [6]:
# The corpus string that the windowing and generation cells operate on.
text = all_data_string
print('corpus length:', len(text))

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars:', len(chars))

# Two-way lookup tables between characters and their integer ids.
indices_char = dict(enumerate(chars))
char_indices = {ch: idx for idx, ch in indices_char.items()}

corpus length: 9597727
total chars: 57


In [7]:
# Slide a window of maxlen characters over the corpus, advancing by step
# characters each time; the character immediately after each window is the
# target the model must predict for it.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 3199226


In [8]:
print('Vectorization...')
# One-hot encode the data:
#   x[i, t, c] is True when position t of window i holds character id c,
#   y[i, c]    is True when the character following window i has id c.
# The builtin bool is used as dtype because the np.bool alias is deprecated
# and no longer exists in current NumPy releases.
# x occupies len(sentences) * maxlen * len(chars) bytes, which is roughly 9 GB
# for the sequence and vocabulary counts printed by the earlier cells.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


print('Finished')

Vectorization...


In [11]:
# Build the network: one 256-unit LSTM reading the one-hot character window,
# then a softmax layer with one output per vocabulary character.
print('Build model...')
model = Sequential([
    LSTM(256, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation='softmax'),
])

# optimizer = RMSprop(lr=0.01)
model.compile(optimizer='adam', loss='categorical_crossentropy')

Build model...


In [12]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, rescaled by a temperature.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: Rescaling factor applied to the log-probabilities before
            sampling. Values below 1 sharpen the distribution and values above
            1 flatten it. ``None`` or ``0`` selects the most likely index
            deterministically instead of sampling.

    Returns:
        The chosen index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')

    # Greedy decoding: dividing by a zero temperature would yield NaNs.
    if temperature is None or temperature == 0:
        return np.argmax(preds)

    # Zero probabilities legitimately map to -inf; silence the log(0) warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating so that a small temperature
    # cannot underflow every term to 0 (which made the normalisation 0/0).
    logits -= np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
# def sample(preds, temperature=None):
#     if temperature is None:
#         return np.argmax(preds)
#     else:
#         # helper function to sample an index from a probability array
#         preds = np.asarray(preds).astype('float64')
#         preds = np.log(preds) / temperature
#         exp_preds = np.exp(preds)
#         preds = exp_preds / np.sum(exp_preds)
#         probas = np.random.multinomial(1, preds, 1)
#         return np.argmax(probas)

In [13]:
def on_epoch_end(epoch, _):
    """Write generated sample text to ``examples_file`` at the end of an epoch.

    A random window of ``maxlen`` characters from ``text`` is used as the seed.
    For each diversity (sampling temperature) 400 characters are generated one
    at a time: the current window is one-hot encoded, the model predicts the
    next-character distribution, ``sample`` picks an index, and the window is
    shifted by one character.

    Args:
        epoch: Epoch index supplied by Keras; written into the section header.
        _: Logs dictionary supplied by Keras; unused.
    """
    # `random` is not imported by the notebook's import cell; importing it here
    # keeps this callback from raising NameError at the end of the first epoch.
    import random

    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # The same seed window is reused for every diversity.
    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        examples_file.write('----- Diversity:' + str(diversity) + '\n')

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(generated)

        for i in range(400):
            # One-hot encode the current window as a batch of size 1.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            examples_file.write(next_char)
            examples_file.flush()
        examples_file.write('\n')

# Wrap on_epoch_end in a LambdaCallback, registered as its on_epoch_end hook.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [15]:
# Checkpoint filename template, filled in by ModelCheckpoint for each save.
# Only `epoch` and `loss` are interpolated: the model is compiled without an
# accuracy metric and fit() receives no validation data, so the training logs
# never contain acc / val_loss / val_acc, and referencing those keys raised
# KeyError when the first checkpoint was written.
checkpoint_file_path = "./checkpoints/LSTM_CHAR_GRRM-epoch{epoch:03d}-" \
                       "loss{loss:.4f}"

In [16]:
# Save a checkpoint after every epoch (save_best_only=False), using
# checkpoint_file_path as the filename template.
# NOTE(review): monitor='val_acc' needs validation data and an accuracy metric,
# neither of which is set up by the visible compile()/fit() calls; it looks
# unused while save_best_only=False — confirm.
checkpoint = ModelCheckpoint(checkpoint_file_path, monitor='val_acc', save_best_only=False)
# Text-generation hook wrapping on_epoch_end.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Callbacks handed to model.fit().
callbacks_list = [checkpoint, print_callback]

In [17]:
# File that on_epoch_end appends generated sample text to during training.
examples = './examples/examples_char'
# Create the output directory first so open() does not fail when it is missing.
os.makedirs(os.path.dirname(examples), exist_ok=True)
# The corpus is read as UTF-8 and may contain non-ASCII characters; writing
# with the same explicit encoding avoids UnicodeEncodeError on platforms whose
# default encoding cannot represent them.
examples_file = open(examples, "w", encoding="utf-8")

In [18]:
# Train on the one-hot encoded windows (x) and next-character targets (y) for
# 60 epochs; the callbacks in callbacks_list run during training.
# NOTE(review): no validation data is passed here — confirm that is intended.
model.fit(x, y,
          batch_size=BATCH_SIZE,
          epochs=60,
          callbacks=callbacks_list)

Instructions for updating:
Use tf.cast instead.
Epoch 1/60
   5760/3199226 [..............................] - ETA: 1:35:54 - loss: 3.1513

KeyboardInterrupt: 