# Compose: Training a model to generate text

In [2]:
import os
import pickle
import numpy as np
#from music21 import note, chord

from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import plot_model
from keras.utils import np_utils
from keras.layers import LSTM, Input, Dropout, Dense, Activation, Embedding, Concatenate, Reshape
from keras.layers import Flatten, RepeatVector, Permute, TimeDistributed
from keras.layers import Multiply, Lambda, Softmax
import keras.backend as K 
from keras.models import Model
from keras.optimizers import RMSprop

## Set parameters

In [3]:
# Run parameters: together they identify the folder that holds every artefact
# produced by this run.
section = 'composetxt'
run_id = '0001'
txt_name = 'txtattn'

run_folder = 'run/{}/'.format(section)
run_folder += '_'.join([run_id, txt_name])

store_folder = os.path.join(run_folder, 'store')
data_folder = os.path.join('data', txt_name)

# Create the run folder and each of its sub-folders. os.makedirs also creates
# the missing parents ('run/<section>') that os.mkdir would fail on, and
# exist_ok=True makes the cell safe to re-run and repairs a run folder that
# exists but lacks some of its sub-folders.
for sub_folder in ('store', 'output', 'weights', 'viz'):
    os.makedirs(os.path.join(run_folder, sub_folder), exist_ok=True)

mode = 'build' # 'load' # 

# Data parameters
intervals = range(1)
seq_len = 32  # NOTE(review): re-assigned to 20 by the gensim loading cell further down

# Model parameters (passed to create_network)
embed_size = 100
rnn_units = 256
use_attention = True

## Extract the text

In [4]:
import re

token_type = 'word'

# Load the text and perform some cleanup.

seq_length = 20

filename = "./data/aesop/data.txt"

# 'utf-8-sig' drops a leading byte-order mark if the file has one.
with open(filename, encoding='utf-8-sig') as f:
    text = f.read()

# Remove the text before and after the main stories. str.find returns -1 when
# a marker is absent, which would silently produce a wrong slice, so fail
# loudly instead.
start = text.find("THE FOX AND THE GRAPES\n\n\n")
end = text.find("ILLUSTRATIONS\n\n\n[")
if start == -1 or end == -1:
    raise ValueError(
        "Could not locate the start/end markers of the stories in {}".format(filename)
    )
text = text[start:end]

# A run of seq_length '|' tokens marks the beginning of every story.
start_story = '| ' * seq_length

text = start_story + text
text = text.lower()
text = text.replace('\n\n\n\n\n', start_story)  # five newlines are treated as a story boundary
text = text.replace('\n', ' ')
text = re.sub('  +', '. ', text).strip()  # runs of spaces (former blank lines) become sentence breaks
text = text.replace('..', '.')

# Surround every punctuation character with spaces so that it becomes its own
# token. Raw strings avoid invalid escape sequences; '-' and '[' are escaped so
# the character class no longer depends on the accidental ',-.' range.
text = re.sub(r'([!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~])', r' \1 ', text)
text = re.sub(r'\s{2,}', ' ', text)

print(text[:80])

 | | | | | | | | | | | | | | | | | | | | the fox and the grapes . a hungry fox s


In [13]:
from keras.preprocessing.text import Tokenizer

# Pick the tokenizer granularity. filters='' keeps every character, so the
# punctuation and '|' tokens present in the cleaned text survive tokenization.
if token_type != 'word':
    tokenizer = Tokenizer(char_level = True, filters = '', lower = False)
else:
    tokenizer = Tokenizer(char_level = False, filters = '')

tokenizer.fit_on_texts([text])

# The printed vocabulary starts at index 1, hence the extra slot for index 0.
total_words = 1 + len(tokenizer.word_index)

# Only one document is tokenized, so keep its sequence directly.
token_list = tokenizer.texts_to_sequences([text])[0]

print(total_words)

# Peek at the start of the vocabulary mapping and of the tokenized text.
print(str(tokenizer.word_index)[:50])
print(token_list[:50])

4170
{'|': 1, ',': 2, 'the': 3, 'and': 4, '.': 5, 'a': 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 56, 4, 3, 940, 5, 6, 382, 56, 94, 77, 216, 1557, 9, 940, 941, 62, 6, 581, 20, 12, 2226, 162, 6, 359, 2227, 2, 4, 158, 11]


In [4]:
# gensim text load, preprocessing, tokenization
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora

def read_in_chunks(infile, chunk_size=1024*64):
    """Lazily yield successive pieces of `infile`.

    Each piece is the result of `infile.read(chunk_size)`; iteration ends at
    the first falsy result (for example the empty string at end of file).
    """
    while True:
        piece = infile.read(chunk_size)
        if not piece:
            return
        yield piece
        
def process_data(chunk, text):
    """Append the string form of `chunk` to `text`, mutating `text` in place."""
    # Author's note kept from the original: "'utf8' codec can't decode byte 0xc3"
    # (presumably an error met while reading the input; confirm before relying on it).
    piece = str(chunk)
    text.append(piece)
    
seq_len = 20

filename = "./data/lgtst/Proktols of Neptune.txt"

# Read the file chunk by chunk; the context manager guarantees the handle is
# closed even if reading fails.
text = []
with open(filename, encoding="utf-8") as f:
    for piece in read_in_chunks(f):
        process_data(piece, text)

# The chunks are contiguous slices of the file, so they are joined without a
# separator: joining with ' ' would insert a space at every chunk boundary and
# could split a word in two.
corpus = preprocess_string(''.join(text))
dct = corpora.Dictionary([corpus])  # initialize a Dictionary
# One list of token ids per document; there is a single document here, so
# callers use token_list[0].
token_list = [dct.doc2idx(cdoc) for cdoc in [corpus]]

# Persist the dictionary for later use by the generator. save_as_text is
# documented to take a path and manages the output file itself.
dct.save_as_text(filename + 'dct.txt')

## Prepare network I/O

In [5]:
def prepare_sequences(text, n_tokens, seq_len = 32):
    """Slice a token sequence into fixed-length training examples.

    Every window of `seq_len` consecutive items of `text` becomes one input
    row, and the item immediately after the window becomes its target,
    one-hot encoded over `n_tokens` classes.

    Returns a tuple `(network_input, network_output)` in which each element is
    a single-item list: the `(n_patterns, seq_len)` input array and the
    one-hot target array respectively.
    """
    window_starts = range(len(text) - seq_len)

    # Sliding windows and the item that follows each of them.
    windows = [text[start:start + seq_len] for start in window_starts]
    targets = [text[start + seq_len] for start in window_starts]

    n_patterns = len(windows)

    # One row per window, one column per position in the window.
    input_matrix = np.reshape(windows, (n_patterns, seq_len))
    target_matrix = np_utils.to_categorical(targets, num_classes=n_tokens)

    # Both are wrapped in lists, the form in which they are handed to model.fit.
    return ([input_matrix], [target_matrix])

In [6]:
# Build the training pairs from the single tokenized document. The class count
# is len(dct) + 1, the same value that is given to create_network.
network_input, network_output = prepare_sequences(
    text=token_list[0],
    n_tokens=len(dct) + 1,
    seq_len=seq_len,
)

In [7]:
# Show the input windows and the one-hot targets, each under its own label.
print('text input', network_input[0], sep='\n')

print('text output', network_output[0], sep='\n')

text input
[[ 940  830  605 ...  983  427 1159]
 [ 830  605  588 ...  427 1159  165]
 [ 605  588 1159 ... 1159  165  680]
 ...
 [1239  757 1287 ...  697 1155  886]
 [ 757 1287  684 ... 1155  886 1096]
 [1287  684 1445 ...  886 1096   60]]
text output
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Create the structure of the neural network

In [8]:
def create_network(n_tokens, embed_size = 100, rnn_units = 256, use_attention = False):
    """Build and compile the next-token prediction model.

    The network embeds the input token ids, runs them through stacked LSTM
    layers and ends in a softmax over `n_tokens` classes. With
    `use_attention`, the second LSTM keeps its full output sequence and the
    outputs are combined as a weighted sum whose weights come from a softmax
    over per-timestep scores; without it, the second LSTM's final output is
    used directly.

    Returns `(model, att_model)`. `att_model` maps the same input to the
    attention weights, or is None when `use_attention` is False.
    """
    tokens_in = Input(shape = (None,))

    embedded = Embedding(n_tokens, embed_size)(tokens_in)

    hidden = LSTM(rnn_units, return_sequences=True)(embedded)

    if not use_attention:
        context = LSTM(rnn_units)(hidden)
    else:
        hidden = LSTM(rnn_units, return_sequences=True)(hidden)

        # One tanh score per timestep, flattened and normalised with softmax.
        scores = Dense(1, activation='tanh')(hidden)
        scores = Reshape([-1])(scores)
        alpha = Activation('softmax')(scores)

        # Repeat each weight rnn_units times and swap the last two axes so the
        # weights can be multiplied element-wise with the LSTM outputs.
        repeated = RepeatVector(rnn_units)(alpha)
        alpha_repeated = Permute([2, 1])(repeated)

        # Weighted sum of the LSTM outputs over axis 1.
        weighted = Multiply()([hidden, alpha_repeated])
        context = Lambda(lambda xin: K.sum(xin, axis=1), output_shape=(rnn_units,))(weighted)

    token_probs = Dense(n_tokens, activation = 'softmax', name = 'text')(context)

    model = Model(tokens_in, token_probs)

    # Companion model exposing the attention weights for inspection.
    att_model = Model(tokens_in, alpha) if use_attention else None

    optimizer = RMSprop(lr = 0.001)
    model.compile(loss=['categorical_crossentropy'], optimizer=optimizer)

    return model, att_model

In [9]:
# Instantiate the network. The class count is len(dct) + 1, the same value
# used when the one-hot targets were built.
model, att_model = create_network(
    n_tokens=len(dct) + 1,
    embed_size=embed_size,
    rnn_units=rnn_units,
    use_attention=use_attention,
)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    145100      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, None, 256)    365568      embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, None, 256)    525312      lstm_1[0][0]                     
____________________________________________________________________________________________

## Train the neural network

In [10]:
weights_folder = os.path.join(run_folder, 'weights')

# Honour the `mode` flag from the parameters cell instead of toggling a
# commented-out line: with mode == 'load' the previously saved weights are
# restored before training; with 'build' the freshly initialised model is kept.
if mode == 'load':
    model.load_weights(os.path.join(weights_folder, "weights.h5"))

In [11]:
weights_folder = os.path.join(run_folder, 'weights')

# NOTE(review): validation_split is set below, yet every callback monitors the
# training 'loss' rather than 'val_loss' - confirm this is intended.

# Writes a separately named weights file (epoch and loss in the name) whenever
# the monitored loss improves.
checkpoint1 = ModelCheckpoint(
    os.path.join(weights_folder, "weights-improvement-{epoch:02d}-{loss:.4f}-bigger.h5"),
    monitor='loss',
    verbose=0,
    save_best_only=True,
    mode='min',
)

# Keeps a single "weights.h5" that is overwritten whenever the monitored loss improves.
checkpoint2 = ModelCheckpoint(
    os.path.join(weights_folder, "weights.h5"),
    monitor='loss',
    verbose=0,
    save_best_only=True,
    mode='min',
)

# Stops after 10 epochs without improvement and restores the best weights.
early_stopping = EarlyStopping(
    monitor='loss',
    restore_best_weights=True,
    patience=10,
)

callbacks_list = [checkpoint1, checkpoint2, early_stopping]

# Save the current weights first so that "weights.h5" exists before training starts.
model.save_weights(os.path.join(weights_folder, "weights.h5"))
model.fit(
    network_input,
    network_output,
    epochs=200,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks_list,
    shuffle=True,
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4006 samples, validate on 1002 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200

Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200

Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.callbacks.History at 0x1a9d838ef60>