In [2]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation, Bidirectional
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
import keras_metrics
import sys
import io
import os

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from os import listdir
from os.path import isfile, join

Using TensorFlow backend.


In [None]:
# Path to pre-trained word vectors stored as text
# (presumably ConceptNet Numberbatch, going by the file name — TODO confirm).
cn_file = './data/numberbatch-en-17.06.txt'

# Load the vectors with gensim's word2vec-format reader; binary=False selects the text format.
# NOTE(review): cn_vectors is not referenced by any other cell visible in this notebook.
cn_vectors = KeyedVectors.load_word2vec_format(cn_file, binary=False)

In [86]:
### Read/process inputs
# Every regular file directly under input_path is read, lower-cased and split
# into sentence fragments; the fragments of all files are pooled in one list.
input_path = './raw_data/'
all_data_sents = []

# NOTE: os.listdir returns entries in arbitrary order, so the order in which
# the files are concatenated is not guaranteed to be stable across machines.
files = [join(input_path, f) for f in listdir(input_path) if isfile(join(input_path, f))]
for filename in files:
    # errors='ignore' silently drops bytes that are not valid UTF-8.
    with open(filename, encoding='utf-8', errors='ignore') as f:
        text = f.read().lower()
    # Turn double newlines into single ones and pad '. ' to ' . '.
    text = text.replace('\n\n', '\n').replace('. ', ' . ')
    # Split on '.', '?' or '!' together with any closing quote/bracket
    # characters and the spaces around them.
    sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    # extend() appends in place; `a = a + b` re-copied the whole accumulated
    # list once per file.
    all_data_sents.extend(sentences)

print(len(all_data_sents))

# Single space-joined string used by the tokenisation cell further down.
all_data_string = ' '.join(all_data_sents)

57593


In [87]:
# Parameters: change these to experiment with different configurations
SEQUENCE_LEN = 10  # number of words in each input window
MIN_WORD_FREQUENCY = 10  # words occurring fewer times than this are ignored
STEP = 1  # stride, in words, between the starts of consecutive windows
BATCH_SIZE = 32  # samples per batch requested from the data generator

In [88]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    """Shuffle (sequence, next word) pairs in unison and split off a test set.

    Args:
        sentences_original: list of input sequences.
        next_original: list of targets, aligned index-by-index with
            ``sentences_original``.
        percentage_test: percentage (0-100) of the pairs placed in the test
            split; the remainder forms the training split.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` where each element is a list.
    """
    print('Shuffling sentences')

    # A single permutation drives both lists so the pairs stay aligned.
    order = np.random.permutation(len(sentences_original))
    shuffled_sentences = [sentences_original[j] for j in order]
    shuffled_next = [next_original[j] for j in order]

    train_fraction = 1. - (percentage_test / 100.)
    cut_index = int(len(sentences_original) * train_fraction)

    train = (shuffled_sentences[:cut_index], shuffled_next[:cut_index])
    test = (shuffled_sentences[cut_index:], shuffled_next[cut_index:])

    print("Size of training set = %d" % len(train[0]))
    print("Size of test set = %d" % len(test[1]))
    return train, test

In [89]:
# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    """Endlessly yield ``(x, y)`` batches of word indices.

    Args:
        sentence_list: list of word sequences (each at most SEQUENCE_LEN long).
        next_word_list: target word for each sequence, aligned by index.
        batch_size: number of samples per yielded batch.

    Yields:
        x: int32 array of shape (batch_size, SEQUENCE_LEN) with word indices.
        y: int32 array of shape (batch_size,) with the target word index.

    The read position keeps advancing across batches and wraps around to the
    start of the lists via modulo, so the generator never terminates.
    """
    cursor = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for row in range(batch_size):
            pos = cursor % len(sentence_list)
            for col, token in enumerate(sentence_list[pos]):
                x[row, col] = word_indices[token]
            y[row] = word_indices[next_word_list[pos]]
            cursor += 1
        yield x, y

In [90]:
# Version 1
def get_model(dropout=0.2):
    """Build the (uncompiled) next-word prediction network.

    Layer stack: Embedding (1024-dim) -> Bidirectional LSTM (128 units) ->
    optional Dropout -> Dense over the vocabulary -> softmax.

    Args:
        dropout: dropout rate; the Dropout layer is added only when this is
            greater than 0.

    Returns:
        A Keras ``Sequential`` model sized to ``len(words)``.
    """
    print('Building model...')
    vocab_size = len(words)

    layers = [
        Embedding(input_dim=vocab_size, output_dim=1024),
        Bidirectional(LSTM(128)),
    ]
    if dropout > 0:
        layers.append(Dropout(dropout))
    layers.append(Dense(vocab_size))
    layers.append(Activation('softmax'))

    model = Sequential()
    for layer in layers:
        model.add(layer)
    print('Model built')
    return model

In [76]:
# Version 2 (disabled): two stacked 512-unit LSTM layers instead of the bidirectional LSTM of Version 1
# def get_model(dropout=0.2):
#     print('Building model...')
#     model = Sequential()
#     model.add(Embedding(input_dim=len(words), output_dim=1024))
#     model.add(LSTM(512, return_sequences=True))
#     if dropout > 0:
#         model.add(Dropout(dropout))
#     model.add(LSTM(512, return_sequences=False))
#     if dropout > 0:
#         model.add(Dropout(dropout))
#     model.add(Dense(len(words)))
#     model.add(Activation('softmax'))
#     print('Model built')
#     return model

In [91]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    """Draw one index from a probability array after temperature scaling.

    Args:
        preds: array-like of probabilities.
        temperature: divisor applied to the log-probabilities before they are
            re-normalised; values below 1 sharpen the distribution and values
            above 1 flatten it.

    Returns:
        The index selected by a single multinomial draw.
    """
    # float64 keeps the re-normalised probabilities precise for the draw.
    scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_log_probs)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

In [92]:
def on_epoch_end(epoch, logs):
    """Write text generated by the current model to ``examples_file``.

    Intended as the ``on_epoch_end`` hook of a Keras ``LambdaCallback``.
    One seed sequence is picked at random from the training and test windows;
    for each diversity (sampling temperature) value, 50 words are generated
    from that seed and written out.

    Args:
        epoch: epoch number passed by Keras; written into the header line.
        logs: metrics dict passed by Keras; unused.

    Relies on the module-level ``examples_file``, ``sentences``,
    ``sentences_test``, ``model``, ``word_indices``, ``indices_word`` and
    ``SEQUENCE_LEN``.
    """
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence from the training + test windows. The
    # index is mapped onto the two lists directly: concatenating them (which
    # used to happen twice per call) copied both lists on every epoch.
    n_train = len(sentences)
    seed_index = np.random.randint(n_train + len(sentences_test))
    if seed_index < n_train:
        seed = sentences[seed_index]
    else:
        seed = sentences_test[seed_index - n_train]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        # Work on a copy so the stored seed sequence is never aliased.
        sentence = list(seed)
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for _ in range(50):
            # Encode the current window as a (1, SEQUENCE_LEN) batch of word ids.
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            # Slide the window: drop the oldest word, append the new one.
            sentence = sentence[1:] + [next_word]

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    # Flush so the examples can be inspected while training is still running.
    examples_file.flush()

In [93]:
# Make sure the checkpoint directory exists. exist_ok=True makes this a single
# idempotent call and avoids the check-then-create race of testing isdir() first.
os.makedirs('./checkpoints/', exist_ok=True)

# Real code starts here

In [94]:
text = all_data_string

# Tokenise on single spaces. Empty and whitespace-only tokens are dropped,
# except a lone '\n', which is kept as a token of its own.
text_in_words = [token for token in text.split(' ') if token.strip() != '' or token == '\n']
print('Corpus length in words:', len(text_in_words))

# Calculate word frequency
word_freq = {}
for token in text_in_words:
    if token in word_freq:
        word_freq[token] += 1
    else:
        word_freq[token] = 1

# Words seen fewer than MIN_WORD_FREQUENCY times are excluded from the vocabulary.
ignored_words = {token for token, count in word_freq.items() if count < MIN_WORD_FREQUENCY}

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
# Sorted so that every word gets a stable position (and hence a stable index).
words = sorted(words - ignored_words)
print('Unique words after ignoring:', len(words))  # v1 - 11925, v2 - 8816

Corpus length in words: 613246
Unique words before ignoring: 27749
Ignoring words with frequency < 10
Unique words after ignoring: 5060


In [95]:
# Lookup tables over the sorted vocabulary: word -> integer id, and id -> word.
word_indices = {word: idx for idx, word in enumerate(words)}
indices_word = dict(enumerate(words))

In [96]:
# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences = []
next_words = []
ignored = 0

for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored + 1

print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 375793
Remaining sequences: 237443


In [97]:
# x, y, x_test, y_test
# Shuffle the windows and hold out the function's default 2% as a test set.
# NOTE: `sentences` and `next_words` are rebound to the training portion only, so
# re-running this cell without first re-running the windowing cell splits the
# already-reduced training set again.
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 232694
Size of test set = 4749


In [98]:
model = get_model()
# Sparse loss: the targets are integer word ids rather than one-hot vectors.
model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Checkpoint path template. The %d fields (vocabulary size, sequence length,
# minimum word frequency) are filled in here; the {…} fields are left as-is
# for the checkpoint callback to fill in when it saves.
checkpoint_file_path = (
    "./checkpoints/LSTM_GRRM-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-"
    "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}"
) % (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

Building model...
Model built


In [99]:
# Save a checkpoint using the path template built above. With save_best_only=False
# the monitored quantity is not used to decide whether a file is written.
checkpoint = ModelCheckpoint(checkpoint_file_path, monitor='val_acc', save_best_only=False)
# Run on_epoch_end (text generation into examples_file) after every epoch.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
# NOTE(review): patience=20 equals the epochs=20 passed to fit_generator in the training
# cell, so early stopping looks unable to trigger within a single run — confirm intent.
early_stopping = EarlyStopping(monitor='val_acc', patience=20)

callbacks_list = [checkpoint, print_callback, early_stopping]

In [None]:
# Optionally resume from previously saved weights.
# NOTE(review): this file name carries the prefix LSTM_LYRICS and words11925, whereas
# checkpoint_file_path in this notebook uses the prefix LSTM_GRRM and the recorded
# vocabulary size is 5060. Presumably the weights come from an earlier run or
# configuration — verify that the layer shapes match before loading.
model.load_weights("./checkpoints/LSTM_LYRICS-epoch006-words11925-sequence10-minfreq10-loss5.3823-acc0.1647-val_loss5.4348-val_acc0.1620")

In [None]:
examples = './examples/examples_1'
# open(..., "w") does not create missing directories, so create it up front.
os.makedirs(os.path.dirname(examples), exist_ok=True)

# examples_file is the module-level handle written to by on_epoch_end. The
# `with` block guarantees it is closed when training finishes or fails, and
# the explicit UTF-8 encoding matches how the corpus was read.
with open(examples, "w", encoding='utf-8') as examples_file:
    model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                        steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                        epochs=20,
                        callbacks=callbacks_list,
                        validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                        validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20