In [1]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation, Bidirectional
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
import sys
import io
import os

import numpy as np

import re

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from os import listdir
from os.path import isfile, join

Using TensorFlow backend.


In [2]:
cn_file = './data/numberbatch-en-17.06.txt'

cn_vectors = KeyedVectors.load_word2vec_format(cn_file, binary=False)

In [7]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''

    text = text.lower()
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[”“’‘'-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    
    return text

In [47]:
### Read/process inputs
input_path = './raw_data/'
all_data_sents = []

files = [join(input_path, f) for f in listdir(input_path) if isfile(join(input_path, f))]
for filename in files:
    if filename.endswith('.txt'):
        with open(filename, encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()
            text = text.replace('\n\n', '\n')\
                .replace('"', '')\
                .replace('-', ' - ')\
                .replace("\"", '')\
                .replace("...", "")\
                .replace("…", "")\
                .replace("—", "")
#                 .replace("'", "'")
            text = ' '.join(text.split("\n"))
            #.replace('. ', ' . ')
#             sentences = re.split(r' *[\.\?!][\'"\)\]]* *', text)
            all_data_sents.append(text)
    
print(len(all_data_sents))

all_data_string = ' '.join(all_data_sents)
print(len(all_data_string))

all_data_string = clean_text(all_data_string)
print(len(all_data_string))

5
9391948
9056654


In [48]:
# Parameters: change to experiment different configurations
SEQUENCE_LEN = 20
MIN_WORD_FREQUENCY = 2
STEP = 1
BATCH_SIZE = 32

In [49]:
text = all_data_string

text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(text_in_words))

# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)
        
words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

Corpus length in words: 1763818
Unique words before ignoring: 24372
Ignoring words with frequency < 2
Unique words after ignoring: 17443


In [26]:
VOCAB_SIZE = len(words) + 1
EMBEDDING_DIM = 300

In [27]:
from keras.preprocessing.text import Tokenizer

def vocab_creater(text):

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    dictionary = tokenizer.word_index
  
    word2idx = {}
    idx2word = {}
    for k, v in dictionary.items():
        if v < VOCAB_SIZE:
            word2idx[k] = v
            idx2word[v] = k
        if v >= VOCAB_SIZE:
            continue
          
    return word2idx, idx2word, tokenizer

word2idx, idx2word, tokenizer = vocab_creater(words)

In [28]:
def embedding_matrix_creater(word2idx):
    embedding_matrix = np.zeros((len(word2idx) + 1, EMBEDDING_DIM))
    for word, i in word2idx.items():
        try:
            embedding_vector = cn_vectors.get_vector(word)
        except:
            embedding_vector = None
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [29]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)

In [30]:
def embedding_layer_creater(VOCAB_SIZE, EMBEDDING_DIM, MAX_LEN, embedding_matrix):
  
    embedding_layer = Embedding(input_dim = VOCAB_SIZE, 
                                output_dim = EMBEDDING_DIM,
                                input_length = MAX_LEN,
                                weights = [embedding_matrix],
                                trainable = False)
    return embedding_layer

embedding_matrix = embedding_matrix_creater(word2idx)
embedding_layer = embedding_layer_creater(VOCAB_SIZE, EMBEDDING_DIM, SEQUENCE_LEN, embedding_matrix)

In [31]:
# Version 1
def get_model(dropout=0.2):
    print('Building model...')
    model = Sequential()
    model.add(embedding_layer)
#     model.add(Embedding(input_dim=len(words), output_dim=1024))
    model.add(Bidirectional(LSTM(256), input_shape=(SEQUENCE_LEN, 300)))
    if dropout > 0:
        model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    print('Model built')
    return model

In [None]:
# Version 2
# def get_model(dropout=0.2):
#     print('Building model...')
#     model = Sequential()
#     model.add(Embedding(input_dim=len(words), output_dim=1024))
#     model.add(LSTM(512, return_sequences=True))
#     if dropout > 0:
#         model.add(Dropout(dropout))
#     model.add(LSTM(512, return_sequences=False))
#     if dropout > 0:
#         model.add(Dropout(dropout))
#     model.add(Dense(len(words)))
#     model.add(Activation('softmax'))
#     print('Model built')
#     return model

In [32]:
# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [33]:
if not os.path.isdir('./checkpoints/'):
    os.makedirs('./checkpoints/')

# Real code starts here

In [36]:
# word_indices = dict((c, i) for i, c in enumerate(words))
# indices_word = dict((i, c) for i, c in enumerate(words))
word2idx
idx2word

{1: '1',
 2: '15th',
 3: '[on',
 4: 'a',
 5: 'aa',
 6: 'aback',
 7: 'abandon',
 8: 'abandoned',
 9: 'abandoning',
 10: 'abashed',
 11: 'abbatoir',
 12: 'abed',
 13: 'abel',
 14: 'abels',
 15: 'abhor',
 16: 'abide',
 17: 'abilities',
 18: 'ability',
 19: 'ablaze',
 20: 'able',
 21: 'ably',
 22: 'aboard',
 23: 'abolish',
 24: 'abolished',
 25: 'abominably',
 26: 'abomination',
 27: 'abominations',
 28: 'about',
 29: 'above',
 30: 'abovedecks',
 31: 'abreast',
 32: 'abroad',
 33: 'abrupt',
 34: 'abruptly',
 35: 'absence',
 36: 'absent',
 37: 'absently',
 38: 'absolute',
 39: 'absolutely',
 40: 'absolution',
 41: 'absolve',
 42: 'absorbed',
 43: 'absurd',
 44: 'absurdly',
 45: 'abyss',
 46: 'accent',
 47: 'accented',
 48: 'accents',
 49: 'accentuated',
 50: 'accept',
 51: 'acceptance',
 52: 'accepted',
 53: 'accepting',
 54: 'accepts',
 55: 'access',
 56: 'accident',
 57: 'acclaim',
 58: 'acclaimed',
 59: 'accommodate',
 60: 'accommodating',
 61: 'accommodations',
 62: 'accompanied',
 63: 

In [37]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.9, 1.0]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(50):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word2idx[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = idx2word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [40]:
# Embeddings - Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word2idx[w]
            y[i] = word2idx[next_word_list[index % len(sentence_list)]]
            index = index + 1
        yield x, y

In [39]:
# NO EMBEDDINGS - Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN, len(words)), dtype=np.bool)
        y = np.zeros((batch_size, len(words)), dtype=np.bool)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t, word2idx[w]] = 1
            y[i, word2idx[next_word_list[index % len(sentence_list)]]] = 1
            index = index + 1
        yield x, y

In [41]:
# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences = []
next_words = []
ignored = 0

for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored + 1

print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

Ignored sequences: 151057
Remaining sequences: 1601086


In [42]:
# x, y, x_test, y_test
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(sentences, next_words)

Shuffling sentences
Size of training set = 1569064
Size of test set = 32022


In [43]:
model = get_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

checkpoint_file_path = "./checkpoints/LSTM_GRRM-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
            "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % \
            (len(words), SEQUENCE_LEN, MIN_WORD_FREQUENCY)

Building model...
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Model built


In [44]:
checkpoint = ModelCheckpoint(checkpoint_file_path, monitor='val_acc', save_best_only=False)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=20)

callbacks_list = [checkpoint, print_callback, early_stopping]

In [45]:
examples = './examples/examples_1'
examples_file = open(examples, "w")
model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=40,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

Instructions for updating:
Use tf.cast instead.
Epoch 1/40
  243/49034 [..............................] - ETA: 3:19:07 - loss: 7.3692 - acc: 0.0568

KeyboardInterrupt: 

In [None]:
model.load_weights("./checkpoints/LSTM_LYRICS-epoch006-words11925-sequence10-minfreq10-loss5.3823-acc0.1647-val_loss5.4348-val_acc0.1620")