In [None]:
# Select the GPU that Theano's sandbox CUDA backend should run on.
# NOTE(review): the device name 'gpu1' is hard-coded; adjust it for the machine at hand.
from theano.sandbox import cuda
cuda.use('gpu1')

In [None]:
%matplotlib inline
import utils; reload(utils)
from utils import *
from __future__ import division, print_function

In [None]:
# Fetch the Nietzsche corpus and load it fully into memory as one string.
# (get_file is presumably Keras' download helper re-exported by utils -- confirm.)
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read through a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(path) as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
# One extra slot on top of the observed characters; the following cell
# inserts '\0' at the front of `chars` to occupy it.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

In [None]:
# Put the NUL character at index 0 of the vocabulary (this is the extra entry
# already counted by the +1 in vocab_size).
# NOTE(review): presumably a padding/placeholder symbol -- confirm.
# Guarded so that re-running this cell does not insert a second '\0' and
# shift every character index.
if not chars or chars[0] != '\0':
    chars.insert(0, '\0')

In [None]:
# Preview the vocabulary as one string, skipping the '\0' at index 0 and the
# last six entries.
# NOTE(review): the trailing six are presumably characters that do not display
# well -- confirm by inspecting chars[-6:].
''.join(chars[1:-6])

In [None]:
# Lookup tables between characters and their integer ids (positions in `chars`).
char2ind = {ch: pos for pos, ch in enumerate(chars)}
ind2char = dict(enumerate(chars))

In [None]:
# Encode the whole corpus as a list of integer character ids.
index = [char2ind[ch] for ch in text]

In [None]:
index[:10]

In [None]:
# Round-trip check: decode only the first 70 ids back to text. Decoding the
# full `index` would dump the entire corpus into the cell output.
''.join(ind2char[i] for i in index[:70])

## 3 char RNN

### Create inputs

In [None]:
cs = 3
# Start offsets of consecutive, non-overlapping windows of `cs` characters.
# The encoded corpus is called `index` in this notebook (`idx` was never defined).
starts = xrange(0, len(index)-1-cs, cs)
# c1..c3 hold the 1st/2nd/3rd character id of every window; c4 holds the
# character that follows the window, i.e. the prediction target.
c1_dat = [index[i] for i in starts]
c2_dat = [index[i+1] for i in starts]
c3_dat = [index[i+2] for i in starts]
c4_dat = [index[i+3] for i in starts]

In [None]:
# Turn each per-position list into a numpy array, dropping the final two
# windows of every list.
# NOTE(review): the reason for discarding the last two entries is not evident
# from this notebook -- confirm whether it is still needed.
x1, x2, x3, y = [np.stack(dat[:-2]) for dat in (c1_dat, c2_dat, c3_dat, c4_dat)]

In [None]:
x1.shape, y.shape

In [None]:
def embedding_input(name, n_in, n_out):
    """Create a one-index input together with its flattened embedding.

    name  -- name given to the Input layer
    n_in  -- size of the embedding vocabulary
    n_out -- width of each embedding vector

    Returns a pair (input tensor, flattened embedding tensor).
    """
    index_input = Input(shape=(1,), dtype='int64', name=name)
    embedded = Embedding(n_in, n_out, input_length=1)(index_input)
    flat = Flatten()(embedded)
    return index_input, flat

In [None]:
c1_in, c1 = embedding_input('c1', vocab_size, 50)
c2_in, c2 = embedding_input('c2', vocab_size, 50)
c3_in, c3 = embedding_input('c3', vocab_size, 50)

### Create model

In [None]:
n_hidden = 256

In [None]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='tanh')
dense_out = Dense(vocab_size, activation='softmax')

In [None]:
# Every character goes through the same input layer (shared weights).
c1_dense = dense_in(c1)
c2_dense = dense_in(c2)
c3_dense = dense_in(c3)
# Unrolled recurrence: the running hidden state passes through the shared
# hidden layer and is then combined with the next character's activations.
c1_hidden = dense_hidden(c1_dense)
# Keras 1's merge() takes a *list* of tensors; a second positional argument
# would be interpreted as the merge `mode`.
c1_c2_merge = merge([c1_hidden, c2_dense])
c1_c2_hidden = dense_hidden(c1_c2_merge)
c1_c2_c3_merge = merge([c1_c2_hidden, c3_dense])
# Softmax over the vocabulary for the 4th character.
c_out = dense_out(c1_c2_c3_merge)

In [None]:
model = Model([c1_in, c2_in, c3_in], c_out)
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
model.optimizer.lr = 0.0001

In [None]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=5)

In [None]:
model.optimizer.lr = 0.01

In [None]:
model.fit([x1, x2, x3], y, batch_size=64, nb_epoch=5)

### Test model

In [None]:
def get_next(inp):
    """Return the character the 3-char model predicts to follow `inp`.

    inp -- string with one character per model input (three here).

    Raises KeyError if `inp` contains a character missing from the vocabulary.
    """
    # The char -> id table is named `char2ind` in this notebook.
    idxs = [char2ind[c] for c in inp]
    # One array of shape (1,) per character, matching the model's separate inputs.
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    p = model.predict(arrs)
    # Position of the highest softmax probability is the predicted character id.
    i = np.argmax(p)
    return chars[i]

In [None]:
get_next('lov')

In [None]:
get_next('fis')

In [None]:
get_next('sho')

## First RNN

In [None]:
cs = 8

In [None]:
# c_in_dat[j] holds the j-th character id of every non-overlapping window of
# `cs` characters. The encoded corpus is `index` (`idx` was never defined).
c_in_dat = [[index[i+j] for i in xrange(0, len(index)-1-cs, cs)] for j in range(cs)]

In [None]:
# Target for each window: the character id right after its `cs` characters.
# The encoded corpus is `index` (`idx` was never defined).
c_out_dat = [index[i+cs] for i in xrange(0, len(index)-1-cs, cs)]

In [None]:
x_in = [np.stack(c[:-2]) for c in c_in_dat]

In [None]:
y = np.stack(c_out_dat[:-2])

In [None]:
n_fac = 50

In [None]:
def embedding_input(name, n_in, n_out):
    """Create a named one-index input and its flattened, named embedding.

    Redefines the earlier helper of the same name; this version also names the
    layers (`name + '_in'` for the input, `name + 'emb'` for the embedding).

    Returns a pair (input tensor, flattened embedding tensor).
    """
    index_input = Input(shape=(1,), dtype='int64', name=name+'_in')
    embedding_layer = Embedding(n_in, n_out, input_length=1, name=name+'emb')
    flat = Flatten()(embedding_layer(index_input))
    return index_input, flat

In [None]:
c_emb_inp = [embedding_input('c'+str(i), vocab_size, n_fac) for i in range(cs)]

In [None]:
n_hidden = 256

In [None]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='tanh')
dense_out = Dense(vocab_size, activation='softmax')

In [None]:
hidden = dense_in(c_emb_inp[0][1])

In [None]:
# Unrolled recurrence over the remaining characters: push the running state
# through the shared hidden layer, then combine it with the next character.
for i in range(1, cs):
    c_dense = dense_in(c_emb_inp[i][1])
    hidden = dense_hidden(hidden)
    # merge() expects a list of tensors (second positional argument is `mode`).
    hidden = merge([c_dense, hidden])

In [None]:
c_dense_out = dense_out(hidden)

In [None]:
model = Model([c[0] for c in c_emb_inp], c_dense_out)

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
model.fit(x_in, y, batch_size=64, nb_epoch=10)

In [None]:
def get_next(inp):
    """Return the character the model predicts to follow the string `inp`.

    inp -- string with exactly one character per model input (`cs` of them).

    Raises KeyError if `inp` contains a character missing from the vocabulary.
    """
    # The char -> id table is named `char2ind` in this notebook.
    idxs = [char2ind[c] for c in inp]
    # One array of shape (1,) per character, matching the model's separate inputs.
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    # Predict the character following the given ones.
    p = model.predict(arrs)
    # Position of the highest softmax probability is the predicted character id.
    i = np.argmax(p)
    return chars[i]

In [None]:
get_next('get the ')

In [None]:
get_next('tell me ')

In [None]:
get_next('sing a s')

## Keras RNN

In [None]:
# Hidden width and sequence length for the Keras RNN models below.
# n_fac and vocab_size deliberately keep the values set earlier: the embeddings
# in c_emb_inp were built with that n_fac and are later fed through the same
# Dense layer as a (n_fac,)-shaped zeros input, and vocab_size is derived from
# the corpus rather than hard-coded.
n_hidden, cs = 256, 8

In [None]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, activation='relu', inner_init='identity'),
    Dense(vocab_size, activation='softmax')
])

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
model.summary()

In [None]:
# The Sequential model has a single input of shape (n_samples, cs), so the
# per-position arrays in x_in are stacked column-wise into one array.
model.fit(np.stack(x_in, axis=1), y, batch_size=64, nb_epoch=10)

In [None]:
def get_next_keras(inp):
    """Return the character the Keras RNN predicts to follow the string `inp`.

    inp -- string of `cs` characters (the model's input_length).

    Raises KeyError if `inp` contains a character missing from the vocabulary.
    """
    # The char -> id table is named `char2ind` in this notebook.
    idxs = [char2ind[c] for c in inp]
    # Add a leading batch axis: shape (1, len(inp)).
    arrs = np.array(idxs)[np.newaxis,:]
    # predict() returns one row per batch item; [0] picks the only row.
    p = model.predict(arrs)[0]
    return chars[np.argmax(p)]

## Sequential model

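Each input array in `c_in_dat` is paired with a target array in `c_out_dat` holding the same windows shifted forward by one character, so the model predicts an output at every input position.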

In [None]:
# Same windows as the inputs but starting one character later, so
# c_out_dat[j] holds the character that follows input position j.
# The encoded corpus is `index` (`idx` was never defined).
c_out_dat = [[index[i+j] for i in xrange(1, len(index)-cs, cs)] for j in range(cs)]

In [None]:
y = [np.stack(c[:-2]) for c in c_out_dat]

In [None]:
dense_in = Dense(n_hidden, activation='relu')
dense_hidden = Dense(n_hidden, activation='tanh')
dense_out = Dense(vocab_size, activation='softmax')

In [None]:
# use zeros as the initial input instead of the first input array
zeros_in = Input(shape=(n_fac,), name='zeros')
hidden = dense_in(zeros_in)

In [None]:
# Unrolled recurrence that emits a prediction after every input character.
outs = []
for i in range(cs):
    c_dense = dense_in(c_emb_inp[i][1])
    hidden = dense_hidden(hidden)
    # merge() expects a list of tensors (second positional argument is `mode`).
    hidden = merge([c_dense, hidden])
    # Collect into `outs`, the list handed to Model(...) afterwards.
    outs.append(dense_out(hidden))

In [None]:
model = Model([zeros_in]+[c[0] for c in c_emb_inp], outs)

In [None]:
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam())

In [None]:
# All-zero initial input: one row of n_fac zeros per training sample.
zeros = np.zeros((len(x_in[0]), n_fac))
# Display the shape as a sanity check.
zeros.shape

In [None]:
model.fit([zeros]+x_in, y, batch_size=64, nb_epoch=10)

In [None]:
def get_nexts(inp):
    """Print the characters of `inp` and return the prediction made after each.

    inp -- string with exactly one character per character input of the model.

    Returns a list with one predicted character per model output.
    Raises KeyError if `inp` contains a character missing from the vocabulary.
    """
    # The char -> id table is named `char2ind` in this notebook.
    idxs = [char2ind[c] for c in inp]
    arrs = [np.array(i)[np.newaxis] for i in idxs]
    # The model's first input is the all-zero initial vector (batch of one).
    p = model.predict([np.zeros(n_fac)[np.newaxis,:]] + arrs)
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

In [None]:
# The model takes exactly cs (= 8) character inputs, so the prompt is 8 characters long.
get_nexts('this is ')

In [None]:
# The model takes exactly cs (= 8) character inputs, so the prompt is 8 characters long.
get_nexts('I want t')

## Sequential model with Keras

In [None]:
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs),
    SimpleRNN(n_hidden, return_sequences=True, activation='relu', inner_init='identity'),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

In [None]:
model.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
# Re-arrange the per-position lists into the single arrays a Sequential RNN expects:
#   x_rnn: the cs input arrays stacked side by side  -> (n_samples, cs)
#   y_rnn: the cs target arrays stacked side by side, plus a trailing axis of
#          size 1 added by atleast_3d                -> (n_samples, cs, 1)
x_rnn=np.stack(np.squeeze(x_in), axis=1)
# The per-position targets are stored in `y` (`ys` was never defined).
y_rnn=np.atleast_3d(np.stack(y, axis=1))

In [None]:
x_rnn.shape, y_rnn.shape

In [None]:
model.fit(x_rnn, y_rnn, batch_size=64, nb_epoch=8)

In [None]:
def get_nexts_keras(inp):
    """Print the characters of `inp` and return the prediction made after each.

    inp -- string of `cs` characters (the model's input_length).

    Returns a list with one predicted character per time step.
    Raises KeyError if `inp` contains a character missing from the vocabulary.
    """
    # The char -> id table is named `char2ind` in this notebook.
    idxs = [char2ind[c] for c in inp]
    # Add a leading batch axis: shape (1, len(inp)).
    arrs = np.array(idxs)[np.newaxis,:]
    # predict() returns one entry per batch item; [0] picks the only one,
    # leaving one softmax vector per time step.
    p = model.predict(arrs)[0]
    print(list(inp))
    return [chars[np.argmax(o)] for o in p]

## Stateful model with Keras

In [None]:
batch_size=64

In [None]:
# batch_input_shape needed for stateful model. It is applied to the first layer.
# (List elements must be comma-separated; the sequence length uses `cs` so it
# stays in step with input_length.)
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(batch_size, cs)),
    BatchNormalization(),
    LSTM(n_hidden, return_sequences=True, stateful=True),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [None]:
model.summary()
# embedding should be [64, 8, 50]
# LSTM should be [64, 8, 256]
# TimeDistributed should be [64, 8, 86]

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
# Largest multiple of batch_size that does not exceed the number of samples;
# used below to truncate the training arrays.
mx = len(x_rnn)//batch_size*batch_size
# NOTE(review): the stateful model is built with a fixed
# batch_input_shape=(batch_size, 8), so a smaller final batch presumably would
# not fit -- confirm against the Keras documentation on stateful RNNs.

In [None]:
model.fit(x_rnn[:mx], y_rnn[:mx], nb_epoch=5, batch_size=batch_size, shuffle=False)
# shuffle=False because we set stateful=True which means we should keep the order of sequence intact.

In [None]:
# Update the learning-rate shared variable in place. Rebinding the attribute to
# a float after fit() has run would not affect the compiled training function.
model.optimizer.lr.set_value(1e-3)

In [None]:
model.fit(x_rnn[:mx], y_rnn[:mx], nb_epoch=5, batch_size=64, shuffle=False)

#### 2 LSTM layers

In [None]:
# Training model: two stacked stateful LSTMs on batches of 64 sequences of 8
# characters. The container class is `Sequential` (as used earlier in this
# notebook) and list elements must be comma-separated.
model = Sequential([
    Embedding(vocab_size, n_fac, input_length=cs, batch_input_shape=(64, 8)),
    BatchNormalization(),
    LSTM(256, return_sequences=True, stateful=True),
    LSTM(256, return_sequences=True, stateful=True),
    TimeDistributed(Dense(256, activation='relu')),
    Dropout(0.5),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [None]:
# Prediction model: same layer stack as the training model but with
# batch_input_shape=(1, 1), so it can be fed one character at a time.
# The container class is `Sequential` and list elements must be comma-separated.
pmodel = Sequential([
    Embedding(vocab_size, n_fac, input_length=1, batch_input_shape=(1, 1)),
    BatchNormalization(),
    LSTM(256, return_sequences=True, stateful=True),
    LSTM(256, return_sequences=True, stateful=True),
    TimeDistributed(Dense(256, activation='relu')),
    Dropout(0.5),
    TimeDistributed(Dense(vocab_size, activation='softmax')),
])

In [None]:
model.summary()

In [None]:
pmodel.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [None]:
def run_epochs_with_reset(n):
    """Train `model` for `n` epochs, resetting its recurrent state before each.

    Each epoch is a separate fit() call on the first `mx` samples, unshuffled,
    and the loss history of that call is printed afterwards.
    """
    for _ in range(n):
        # Start every pass over the data from a clean recurrent state.
        model.reset_states()
        history = model.fit(
            x_rnn[:mx], y_rnn[:mx],
            nb_epoch=1, batch_size=64, shuffle=False)
        print(history.history['loss'])

In [None]:
def print_example(ln=160):
    """Generate and print `ln` characters of text with the trained weights.

    The weights of `model` are copied layer by layer into `pmodel`, which is
    then fed one character at a time. The first ln//4 characters of the corpus
    serve as the seed; after the seed, each sampled character is fed back in
    as the next input.

    ln -- number of characters to generate after the seed (default 160).

    Raises ValueError if `ln` is too small to yield a non-empty seed.
    """
    for trained, single in zip(model.layers, pmodel.layers):
        single.set_weights(trained.get_weights())
    pmodel.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

    seed_string = text[:ln//4]
    if not seed_string:
        # Without a seed there would be no prediction to sample the first character from.
        raise ValueError('ln must be at least 4 so that the seed is not empty')
    pmodel.reset_states()

    def _sample(preds):
        # Renormalise in float64 with true division: choice() requires the
        # probabilities to sum to 1, which float32 softmax output may miss by
        # rounding error (and floor division would turn them all into zeros).
        probs = np.asarray(preds, dtype='float64')
        return choice(chars, p=probs / probs.sum())

    # Warm-up: run the seed through the stateful model one character at a
    # time; only the prediction after the last seed character is used.
    for s in seed_string:
        x = np.array([char2ind[s]])[np.newaxis, :]
        preds = pmodel.predict(x, verbose=0)[0][0]
    s = _sample(preds)

    # Generation: feed each sampled character back in to get the next one.
    res = seed_string + s + '...\n'
    for i in range(ln):
        x = np.array([char2ind[s]])[np.newaxis, :]
        preds = pmodel.predict(x, verbose=0)[0][0]
        s = _sample(preds)
        res = res + s

    print(res)

In [None]:
run_epochs_with_reset(1)

In [None]:
print_example()