In [1]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout ,LSTM
from keras.optimizers import RMSprop
import sys
import os
import random
import numpy as np

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 980M (CNMeM is disabled, cuDNN 5110)


In [2]:

def parse_folder(folder,duplication =2):
    """Read every level file in ``folder`` and flatten them into one string.

    Each file is read as a grid of single-character tiles, one row per line
    (trailing whitespace is stripped and blank lines are ignored).  The list
    of levels is doubled ``duplication`` times, so the corpus contains
    ``2 ** duplication`` copies of every level, and is then shuffled with
    ``random.shuffle``.

    Every level is serialised column by column: a level starts with a
    newline, every column starts with ``'('`` and is followed by that
    column's tiles from the first row to the last row.

    Args:
        folder: directory containing the level text files.
        duplication: how many times the list of levels is doubled before
            shuffling (0 keeps a single copy of each level).

    Returns:
        One ``str`` holding all serialised levels.
    """
    levels = []
    # Sorted so the pre-shuffle order does not depend on the file system,
    # which keeps the result reproducible when the RNG is seeded.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Skip sub-directories (e.g. a notebook checkpoint folder); they
        # cannot be opened as level files.
        if not os.path.isfile(path):
            continue
        # Text mode so tiles are str characters on Python 3 as well
        # (binary mode would yield ints there and break the join below).
        with open(path, 'r') as infile:
            stripped_rows = (line.rstrip() for line in infile)
            level = [list(row) for row in stripped_rows if row]
        if level:
            levels.append(level)

    # Honour the caller's ``duplication`` instead of a hard-coded value.
    for _ in range(duplication):
        levels = levels + levels
    random.shuffle(levels)

    # Collect the pieces and join once instead of repeated string +=.
    pieces = []
    for level in levels:
        width = len(level[0])
        pieces.append('\n')
        for column in range(width):
            pieces.append('(')
            pieces.extend(row[column] for row in level)
    return ''.join(pieces)




# Build the training corpus: one long string made from every level file
# found under trainingData/ (default duplication).
text = parse_folder('trainingData/')

In [3]:
print('corpus length:', len(text))

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars:', len(chars))

# Lookup tables: character -> vocabulary index, and the inverse.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

corpus length: 175440
total chars: 15


In [4]:
maxlen = 40  # number of characters in each training sequence
step = 3     # offset between the starts of consecutive sequences

# Slide a window of `maxlen` characters over the corpus; the character that
# immediately follows each window is the prediction target for that window.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

nb sequences: 58467


At this point we've read in the text, found out the size of our vocabulary, and split it into semi-redundant sequences.  Now we encode it as a 1-hot encoding.

This means that if before it looked like:

    
    -X-X
    
    
and we have the vocab `{'-':0, 'X':1, 'S':2}`

it will now look like:

    [[1,0,0],[0,1,0],[1,0,0],[0,1,0]]
    
i.e.  the index of the character in the vocab is set to 1 and everything else is set to 0


In [5]:
# One-hot encode the sequences (X) and their next characters (y).
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        # Mark the vocabulary index of the character at position t.
        X[i, t, char_indices[char]] = 1
    # Mark the vocabulary index of the character that follows the sequence.
    y[i, char_indices[next_chars[i]]] = 1

print(X.shape)
print(y.shape)

(58467, 40, 15)
(58467, 15)


We create two matrices, one of size:
    # of sequences X index in sequence X size of vocab
    
and one of size:
    # of sequences X size of vocab
    
The first is the sequence data in one-hot encoding, and the second is what we are predicting, i.e. the next character in the sequence after the preceding sequence.


At this point, we're going to create our Neural Network

In [6]:
size = 128     # number of units passed to every LSTM layer
layers = 2     # number of stacked LSTM layers to build
dropout = 0.5  # rate passed to the Dropout layer that follows each LSTM

    size = # of LSTM cells per layer in the neural network
    layers = # of layers of LSTM cells
    dropout = fraction of cells to drop out at each training instance 
    
Determining the proper tuning for these parameters tends to be a bit of a black art. Generally, you can assume that bigger is better, and deeper is better, but the balance between the two is up in the air.  It's easier to go deeper than wider: while a 256 x 2 network has the same number of cells as a 128 x 4 network, it has ~4/3 the number of parameters (~256^2 vs ~128^2 * 3).  There are diminishing returns in both directions, however.  

Dropout randomly turns off a % of cells for each training instance, which acts as a form of regularization that prevents the network from overfitting.  The reason for this is that instead of specific cells becoming overly attuned, it creates exponentially many sub-networks that must all try to learn the same things in different ways.  Increasing dropout increases training time, so it's best to start small; if a divergence between training and validation error appears, increase the dropout and start again.  

In [7]:
# Build the network: `layers` stacked LSTM layers, each followed by Dropout,
# then a Dense + softmax head over the vocabulary.
if layers < 1:
    raise ValueError('layers must be at least 1, got %r' % (layers,))

model = Sequential()
for layer_index in range(layers):
    is_last = layer_index == layers - 1
    # Every LSTM except the last one returns the full sequence so the next
    # LSTM receives one vector per time step; the last one returns a single
    # vector summarising the sequence.
    lstm_kwargs = {'return_sequences': not is_last}
    if layer_index == 0:
        # Only the first layer declares the input shape (taken from X);
        # later layers get their input shape from the layer before them.
        lstm_kwargs['input_shape'] = (maxlen, len(chars))
    model.add(LSTM(size, **lstm_kwargs))
    model.add(Dropout(dropout))
#OUTPUT
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

The model construction is broken into 3 sections:
#### INPUT
Since this is the first layer in our network, we have to specify the input dimensions (coming from the dimensions of our X data above). 
#### MIDDLE
Here is where we construct an arbitrary number of LSTM layers.  Each of these returns a sequence of vectors (as the layers increase we can think of them learning a hierarchy of sequences)
#### OUTPUT
Our final LSTM layer doesn't output a sequence and instead outputs a single vector.  This can be thought of as a distillation of the previous sequence into one piece of information.  This is then fed into a Densely connected layer the size of our vocabulary.  The output of this Dense layer has a softmax activation which is defined as:

$\sigma (\mathbf {z} )_{j}={\frac {e^{z_{j}}}{\sum _{k=1}^{K}e^{z_{k}}}}$    for j = 1, …, K.

i.e. we exponentiate each output of the Dense layer and then divide by the sum of those exponentiations.

Why?  

By exponentiating we guarantee that each value > 0.  By dividing by the sum, we guarantee that everything sums to 1.  These are the things we need for a discrete probability distribution.  In essence we've now distilled our sequence into a probability distribution over the next character in the sequence given the preceding sequence i.e.

$Pr(c_i | c_{i-1},c_{i-2}, ..., c_{i-N})$


Finally we need to compile our model.

To do so we need 3 things:

##### learning rate
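The step size used when updating the weights along the back-propagated error gradient: larger values learn faster but can overshoot, smaller values are more stable but slower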
##### optimization technique 
Stochastic Gradient Descent (SGD) is the core of all the techniques, but there are a number of improved techniques.  RMSprop is the de facto choice for Recurrent networks.
##### loss criterion
Neural networks are just big functions.  SGD + Back Propagation is how we update the parameters of the function, but to do so we need to know how to update our function.  To know how to update we need to know how wrong we are (or how right we are), for categorical distributions this works out to the categorical cross entropy which is defined as:

$H(p,q)=-\sum _{x}p(x)\,\log q(x)$

Where $p(x)$ is the probability associated with the truth (e.g. 1 for the character, 0 for everything else) and $q(x)$ is the predicted probability.  We could instead just say that our loss is 1 if we get it wrong and 0 if we get it right, but this doesn't reward how confident we are in our predictions, hence why we use $q(x)$ instead of just the predicted character.  

NOTE: This is sometimes incorrectly labeled as Softmax loss (it is always coupled with a Softmax activation, but Softmax is the activation and cross entropy is the loss). 

NOTE: I use the phrase loss criterion, but you will see it called just loss, just criterion, or objective in different places.  These all mean the same thing.



In [8]:

learning_rate = 0.005

# RMSprop optimizer, categorical cross-entropy loss, accuracy reported
# alongside the loss during training.
optimizer = RMSprop(lr=learning_rate)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


Now we train!

In [9]:

# helper function to sample an index from a probability array
def sample(preds, temperature=1.0):
    """Pick an index from the probability array ``preds``.

    A temperature equal to 0 returns the arg-max of ``preds`` directly.
    Otherwise the log of the probabilities is divided by ``temperature``,
    exponentiated and renormalised, and one index is drawn from the
    resulting distribution with ``np.random.multinomial``.
    """
    if temperature == 0.0:
        return np.argmax(preds)
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logs)
    distribution = weights / np.sum(weights)
    # A single multinomial trial yields a one-hot row; its arg-max is the
    # sampled index.
    one_hot_draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(one_hot_draw)

# Train one epoch per iteration; from the fourth iteration on, also generate
# a level after each epoch and print it as a tile grid.
for iteration in range(1, 50):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=256, epochs=1, validation_split=0.1)
    start_index = 0
    # No sampling during the first three iterations.
    if iteration <= 3:
        continue

    for diversity in [0, 1.0]:
        print()
        print('----- diversity:', diversity)

        # The seed is the first `maxlen` characters of the corpus.
        sentence = text[start_index: start_index + maxlen]
        generated = sentence
        print('----- Generating with seed: "' + sentence + '"')

        for i in range(300 - maxlen + 1):
            # One-hot encode the current window as a batch of one.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            generated += next_char
            # Slide the window forward by one character.
            sentence = sentence[1:] + next_char

        # Each '(' starts a column; the text before the first '(' is dropped.
        columns = generated.split('(')[1:]

        # The grid has one row per tile of the first column and one cell per
        # generated column; tiles beyond the first column's height are dropped.
        height = len(columns[0])
        level = [[''] * len(columns) for _ in range(height)]
        for col_index, column in enumerate(columns):
            for row_index, tile in enumerate(column[:height]):
                level[row_index][col_index] = tile
        print('\n'.join(''.join(row) for row in level))



--------------------------------------------------
Iteration 1




Train on 52620 samples, validate on 5847 samples
Epoch 1/1

--------------------------------------------------
Iteration 2
Train on 52620 samples, validate on 5847 samples
Epoch 1/1

--------------------------------------------------
Iteration 3
Train on 52620 samples, validate on 5847 samples
Epoch 1/1

--------------------------------------------------
Iteration 4
Train on 52620 samples, validate on 5847 samples
Epoch 1/1

----- diversity: 0
----- Generating with seed: "
(-------------X(-------------X(--------"
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
--------------------
XXXXXXXXXXXXXXXXXXXX

----- diversity: 1.0
----- Generating with seed: "
(-------------X(-------------X(--------"
--------------------
--------------------
--------------------
--------------------
----------

KeyboardInterrupt: 