In [95]:
import numpy as np
import numpy.random as rnd
import theano
import theano.tensor as T
import lasagne
import time

CROSS VALIDATION FOR HYPERPARAMETERS

In [149]:
# Experiment configuration shared by the data generator and the training functions.
LEARNING_RATE = 0.001  # step size passed to lasagne.updates.sgd in train()
GRAD_CLIP = 10  # used both as the recurrent layer's grad_clipping and as the max total gradient norm
BATCH_SIZE = 1  # number of indices sampled per training step; also gen_add_data's default n_batch
NUM_EPOCHS = 10  # default number of epochs for the training functions
N_HIDDEN = 100  # number of hidden units in the recurrent layer
TRAIN_SIZE = 10  # int(1e5) was left commented out here; presumably the full-size setting (TODO confirm)
TEST_SIZE = int(1e4)  # number of test sequences generated per sequence length
SEQ_LENS = [150, 200, 300, 400]  # sequence lengths for which train/test sets are generated

In [60]:
def gen_add_data(length, n_batch=BATCH_SIZE):
    """Generate one batch of the "adding problem" for a recurrent network.

    Every sequence has two channels. Channel 0 holds values drawn uniformly
    from [0, 1). Channel 1 is a 0/1 mask with one marker placed in the first
    tenth of the sequence and one placed in the second half. The target is
    the sum of the channel-0 values at the marked positions.

    Parameters
    ----------
    length : int
        Number of time steps per sequence (at least 1). For ``length == 1``
        both markers fall on the same position.
    n_batch : int
        Number of sequences to generate.

    Returns
    -------
    tuple
        ``(X, y)`` cast to ``theano.config.floatX``; ``X`` has shape
        ``(n_batch, length, 2)`` and ``y`` has shape ``(n_batch,)``.
    """
    values = np.random.uniform(size=(n_batch, length, 1))
    mask = np.zeros((n_batch, length, 1))
    X = np.concatenate([values, mask], axis=-1)

    rows = np.arange(n_batch)
    # Use integer bounds: true division would hand floats to randint, and for
    # length < 10 the truncated upper bound would be 0, which randint rejects.
    first_marker_stop = max(1, length // 10)
    X[rows, np.random.randint(first_marker_stop, size=n_batch), 1] = 1
    X[rows, np.random.randint(length // 2, length, size=n_batch), 1] = 1
    # Multiply the values by the mask and sum over time to get the target
    y = np.sum(X[:, :, 0] * X[:, :, 1], axis=1)

    # Centering of inputs and outputs is left disabled:
    #X -= X.reshape(-1, 2).mean(axis=0)
    #y -= y.mean()
    return (X.astype(theano.config.floatX), y.astype(theano.config.floatX),)

In [None]:
# Sanity check: generate two sequences of length 10 and display their targets
x, y = gen_add_data(10, 2)
y

In [5]:
# One (X, y) training pair per entry of SEQ_LENS, in the same order.
# The loop variable is not called T so it cannot be confused with theano.tensor.
train_data = [gen_add_data(seq_len, TRAIN_SIZE) for seq_len in SEQ_LENS]

In [6]:
# One (X, y) test pair per entry of SEQ_LENS, in the same order.
# The loop variable is not called T so it cannot be confused with theano.tensor.
test_data = [gen_add_data(seq_len, TEST_SIZE) for seq_len in SEQ_LENS]

In [61]:
# Helper that bundles the results of a training run
def pack(train_err, test_err, network, inp, target, train_fn, test_fn):
    """Collect the artefacts of a training run into a single dict.

    The arguments are stored unchanged under the keys ``'train_err'``,
    ``'test_err'``, ``'network'``, ``'inp'``, ``'target'``, ``'train_fn'``
    and ``'test_fn'``, in that order.
    """
    return dict(
        train_err=train_err,
        test_err=test_err,
        network=network,
        inp=inp,
        target=target,
        train_fn=train_fn,
        test_fn=test_fn,
    )

In [62]:
def init_posdef_w(n_hidden=None):
    """Sample a symmetric positive-definite matrix whose largest eigenvalue is 1.

    Draws a standard-normal matrix ``R``, forms ``A = R^T R / n`` and returns
    ``(A + I) / lambda_max(A + I)``.

    Parameters
    ----------
    n_hidden : int, optional
        Side length ``n`` of the matrix. Defaults to the module-level
        ``N_HIDDEN``, looked up at call time.

    Returns
    -------
    ndarray of shape ``(n_hidden, n_hidden)``
    """
    if n_hidden is None:
        n_hidden = N_HIDDEN
    R = np.random.normal(size=(n_hidden, n_hidden))
    A = 1.0 / n_hidden * np.dot(R.T, R)
    shifted = A + np.eye(n_hidden)
    # shifted is symmetric, so use the symmetric solver: it returns real
    # eigenvalues in ascending order, whereas the general eig() may return a
    # complex array and makes max() ill-defined.
    largest = np.linalg.eigvalsh(shifted)[-1]
    return shifted / largest

In [153]:
def build_paper_network(inp, seq_len, num_epochs=NUM_EPOCHS):
    """Build the ReLU RNN with the scaled-normal / positive-definite initialisation.

    Parameters
    ----------
    inp : symbolic variable used as the input layer's ``input_var``.
    seq_len : int
        Sequence length declared in the input layer shape.
    num_epochs : int
        Not used by this function.

    Returns
    -------
    The final ``DenseLayer`` (one output unit) of the network.
    """
    relu = lasagne.nonlinearities.rectify

    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    input_layer = lasagne.layers.InputLayer(shape=(None, seq_len, 2), input_var=inp)

    # Input-to-hidden weights: zero-mean normal with std scale / N_HIDDEN
    scale = np.sqrt(2) * np.exp(1.2 / (max(N_HIDDEN, 6)))
    in_to_hid_init = lasagne.init.Normal(std=scale / N_HIDDEN, mean=0.0)

    recurrent_layer = lasagne.layers.RecurrentLayer(
        input_layer, N_HIDDEN,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=in_to_hid_init,
        W_hid_to_hid=init_posdef_w(),
        learn_init=True,
        only_return_final=True,
        nonlinearity=relu)

    # Output layer: a dense connection with a single rectified unit
    return lasagne.layers.DenseLayer(recurrent_layer, num_units=1,
                                     W=lasagne.init.GlorotNormal(),
                                     nonlinearity=relu)

In [None]:
def build_network(inp, seq_len, num_epochs=NUM_EPOCHS):
    """Build the baseline ReLU RNN with Glorot-normal initialisation throughout.

    Parameters
    ----------
    inp : symbolic variable used as the input layer's ``input_var``.
    seq_len : int
        Sequence length declared in the input layer shape.
    num_epochs : int
        Not used by this function.

    Returns
    -------
    The final ``DenseLayer`` (one output unit) of the network.
    """
    relu = lasagne.nonlinearities.rectify

    # Recurrent layers expect input of shape
    # (batch size, max sequence length, number of features)
    input_layer = lasagne.layers.InputLayer(shape=(None, seq_len, 2), input_var=inp)

    recurrent_layer = lasagne.layers.RecurrentLayer(
        input_layer, N_HIDDEN,
        grad_clipping=GRAD_CLIP,
        W_in_to_hid=lasagne.init.GlorotNormal(),
        W_hid_to_hid=lasagne.init.GlorotNormal(),
        learn_init=True,
        only_return_final=True,
        nonlinearity=relu)

    # Output layer: a dense connection with a single rectified unit
    return lasagne.layers.DenseLayer(recurrent_layer, num_units=1,
                                     W=lasagne.init.GlorotNormal(),
                                     nonlinearity=relu)

In [150]:
def train(seq_len, Xtrain, ytrain, Xtest, ytest, num_epochs=NUM_EPOCHS):
    """Train the network from ``build_network`` with norm-clipped SGD on MSE.

    Each epoch performs one update per training sequence; every update uses
    ``BATCH_SIZE`` indices drawn with replacement from the training set. The
    test loss is computed on the whole test set after each epoch.

    Parameters
    ----------
    seq_len : int
        Sequence length passed to ``build_network``.
    Xtrain, ytrain : arrays
        Training inputs and targets, indexed along the first axis.
    Xtest, ytest : arrays
        Test inputs and targets.
    num_epochs : int
        Number of epochs to run.

    Returns
    -------
    dict
        The result of ``pack``: per-epoch train/test errors, the network,
        the symbolic input/target variables and the compiled functions.
    """
    print("Building network ...")
    inp = T.tensor3('input', dtype='float64')
    target_values = T.vector('target_output', dtype='float64')
    network = build_network(inp, seq_len)
    print("The network has {} params".format(lasagne.layers.count_params(network)))

    train_err = np.zeros(num_epochs)
    test_err = np.zeros(num_epochs)

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(network)
    # Take the last column of the output so predictions line up with the
    # target vector along the batch axis
    predicted_values = network_output[:, -1]
    # Our cost will be mean-squared error
    cost = T.mean((predicted_values - target_values)**2)
    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(network)
    # Compute SGD updates for training
    print("Computing updates ...")
    all_grads = T.grad(cost, all_params)
    scaled_grads = lasagne.updates.total_norm_constraint(all_grads, GRAD_CLIP)
    updates = lasagne.updates.sgd(scaled_grads, all_params, LEARNING_RATE)
    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train_fn = theano.function([inp, target_values], cost, updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([inp, target_values], cost, allow_input_downcast=True)

    # Take the sample count from the data actually passed in. Relying on the
    # global TRAIN_SIZE raises IndexError (or silently uses only part of the
    # data) whenever the global and the array have drifted apart.
    n_train = len(Xtrain)
    for epoch in range(num_epochs):
        start_time = time.time()
        for _ in range(n_train):
            idx = np.random.randint(n_train, size=BATCH_SIZE)
            train_err[epoch] += train_fn(Xtrain[idx, :, :], ytrain[idx])
        # max(..., 1) keeps the average defined for an empty training set
        train_err[epoch] /= max(n_train, 1)
        test_err[epoch] = compute_cost(Xtest, ytest)
        print("Epoch {} test loss = {:.4f} \t train = {:.4f} \t time = {:.2f}s".format(
                epoch, test_err[epoch], train_err[epoch], time.time() - start_time))
    return pack(train_err, test_err, network, inp, target_values, train_fn, compute_cost)

In [152]:
# Trial run of train() on the SEQ_LENS[0] data for 20 epochs; the result is not assigned.
# NOTE(review): the recorded output below stops with an IndexError after epoch 9,
# presumably produced by an earlier definition of train() -- re-run to confirm.
train(SEQ_LENS[0], train_data[0][0], train_data[0][1],
               test_data[0][0], test_data[0][1], 20)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 1.1685 	 train = 0.7130 	 time = 24.18s
Epoch 1 test loss = 1.1685 	 train = 0.8389 	 time = 21.60s
Epoch 2 test loss = 1.1685 	 train = 0.6875 	 time = 24.02s
Epoch 3 test loss = 1.1685 	 train = 0.7662 	 time = 23.74s
Epoch 4 test loss = 1.1685 	 train = 0.8242 	 time = 22.49s
Epoch 5 test loss = 1.1685 	 train = 0.7327 	 time = 22.30s
Epoch 6 test loss = 1.1685 	 train = 0.8922 	 time = 22.37s
Epoch 7 test loss = 1.1685 	 train = 0.6688 	 time = 22.87s
Epoch 8 test loss = 1.1685 	 train = 1.1502 	 time = 25.44s
Epoch 9 test loss = 1.1685 	 train = 0.7427 	 time = 23.64s


IndexError: index 10 is out of bounds for axis 0 with size 10

In [148]:
def f(x):
    """Evaluate the quartic -x^4/4 + 4x^3/3 - 5x^2/2 + 2x at ``x``."""
    quartic = -(x ** 4) / 4
    cubic = 4 * x ** 3 / 3
    quadratic = 5 * x ** 2 / 2
    linear = 2 * x
    return quartic + cubic - quadratic + linear

f(2) - f(1) + 1 / 12

0.16666666666666613

In [27]:
# Train on the SEQ_LENS[0] (length 150) data and keep the result dict as model1.
# NOTE(review): the recorded output below ends with a NameError, presumably from
# an earlier definition of train() -- re-run to confirm.
model1 = train(SEQ_LENS[0], train_data[0][0], train_data[0][1],
               test_data[0][0], test_data[0][1], NUM_EPOCHS)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.1612 	 train = 0.1691 	 time = 859.80s
Epoch 1 test loss = 0.1615 	 train = 0.1673 	 time = 906.57s
Epoch 2 test loss = 0.1612 	 train = 0.1672 	 time = 873.38s
Epoch 3 test loss = 0.1620 	 train = 0.1657 	 time = 815.92s
Epoch 4 test loss = 0.1615 	 train = 0.1660 	 time = 816.37s
Epoch 5 test loss = 0.1611 	 train = 0.1667 	 time = 810.19s
Epoch 6 test loss = 0.1609 	 train = 0.1670 	 time = 822.93s
Epoch 7 test loss = 0.1605 	 train = 0.1652 	 time = 825.71s
Epoch 8 test loss = 0.1587 	 train = 0.1624 	 time = 897.41s
Epoch 9 test loss = 0.0586 	 train = 0.1940 	 time = 813.96s


NameError: name 'target' is not defined

In [50]:
# Train on the SEQ_LENS[0] (length 150) data; this rebinds model1 to the new result dict.
model1 = train(SEQ_LENS[0], train_data[0][0], train_data[0][1],
               test_data[0][0], test_data[0][1], NUM_EPOCHS)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.1622 	 train = 0.1721 	 time = 806.53s
Epoch 1 test loss = 0.1612 	 train = 0.1673 	 time = 786.58s
Epoch 2 test loss = 0.1612 	 train = 0.1671 	 time = 838.45s
Epoch 3 test loss = 0.1612 	 train = 0.1654 	 time = 812.91s
Epoch 4 test loss = 0.1624 	 train = 0.1668 	 time = 810.96s
Epoch 5 test loss = 0.1614 	 train = 0.1673 	 time = 784.09s
Epoch 6 test loss = 0.1611 	 train = 0.1665 	 time = 821.50s
Epoch 7 test loss = 0.1610 	 train = 0.1664 	 time = 840.86s
Epoch 8 test loss = 0.1603 	 train = 0.1662 	 time = 887.44s
Epoch 9 test loss = 0.1601 	 train = 0.1658 	 time = 923.61s
Epoch 10 test loss = 0.0581 	 train = 0.1693 	 time = 832.91s


In [None]:
# Train on the SEQ_LENS[1] (length 200) data and keep the result dict as model2.
model2 = train(SEQ_LENS[1], train_data[1][0], train_data[1][1],
               test_data[1][0], test_data[1][1], NUM_EPOCHS)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.1670 	 train = 0.1695 	 time = 1114.95s
Epoch 1 test loss = 0.1668 	 train = 0.1682 	 time = 1072.94s
Epoch 2 test loss = 0.1670 	 train = 0.1669 	 time = 1061.33s
Epoch 3 test loss = 0.1668 	 train = 0.1675 	 time = 1060.87s


In [56]:
model2

{'inp': input,
 'network': <lasagne.layers.dense.DenseLayer at 0x7f6082e4df60>,
 'target': target_output,
 'test_err': array([ 0.16702086,  0.1668479 ,  0.16698732,  0.16682679,  0.16716476,
         0.16684967,  0.16687859,  0.16677489,  0.16696768,  0.16674251,
         0.16654639]),
 'test_fn': <theano.compile.function_module.Function at 0x7f609966f898>,
 'train_err': array([ 0.16946059,  0.16821286,  0.16688193,  0.16753311,  0.16703759,
         0.16731607,  0.16840916,  0.16859257,  0.16727919,  0.16696799,
         0.16691616]),
 'train_fn': <theano.compile.function_module.Function at 0x7f6082422be0>}

In [None]:
# Train on the SEQ_LENS[2] (length 300) data and keep the result dict as model3.
model3 = train(SEQ_LENS[2], train_data[2][0], train_data[2][1],
               test_data[2][0], test_data[2][1], NUM_EPOCHS)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.1648 	 train = 0.1686 	 time = 1662.67s
Epoch 1 test loss = 0.1652 	 train = 0.1674 	 time = 1583.99s
Epoch 2 test loss = 0.1654 	 train = 0.1663 	 time = 1624.62s
Epoch 3 test loss = 0.1651 	 train = 0.1664 	 time = 1630.90s
Epoch 4 test loss = 0.1661 	 train = 0.1674 	 time = 1578.29s
Epoch 5 test loss = 0.1647 	 train = 0.1678 	 time = 1577.50s
Epoch 6 test loss = 0.1648 	 train = 0.1675 	 time = 1577.60s


In [72]:
model3

{'inp': input,
 'network': <lasagne.layers.dense.DenseLayer at 0x7f60832219b0>,
 'target': target_output,
 'test_err': array([ 0.16484366,  0.16524224,  0.16543644,  0.16513016,  0.1660609 ,
         0.16466655,  0.16480166,  0.16478171,  0.16472904,  0.16568135]),
 'test_fn': <theano.compile.function_module.Function at 0x7f60811eb0b8>,
 'train_err': array([ 0.16862747,  0.16742174,  0.1663266 ,  0.16641542,  0.16740696,
         0.16779978,  0.16745764,  0.16703502,  0.16680789,  0.16587701]),
 'train_fn': <theano.compile.function_module.Function at 0x7f6080ec60f0>}

In [73]:
# Train on the SEQ_LENS[3] (length 400) data and keep the result dict as model4.
model4 = train(SEQ_LENS[3], train_data[3][0], train_data[3][1],
               test_data[3][0], test_data[3][1], NUM_EPOCHS)

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.1726 	 train = 0.1671 	 time = 2137.14s
Epoch 1 test loss = 0.1665 	 train = 0.1673 	 time = 2124.30s
Epoch 2 test loss = 0.1679 	 train = 0.1668 	 time = 2058.24s
Epoch 3 test loss = 0.1665 	 train = 0.1665 	 time = 1986.15s
Epoch 4 test loss = 0.1665 	 train = 0.1663 	 time = 2030.84s
Epoch 5 test loss = 0.1664 	 train = 0.1656 	 time = 1991.12s
Epoch 6 test loss = 0.1664 	 train = 0.1657 	 time = 2029.58s
Epoch 7 test loss = 0.1664 	 train = 0.1646 	 time = 2009.58s
Epoch 8 test loss = 0.1666 	 train = 0.1653 	 time = 2039.72s
Epoch 9 test loss = 0.1668 	 train = 0.1647 	 time = 2035.09s


In [116]:
model1

{'inp': input,
 'network': <lasagne.layers.dense.DenseLayer at 0x7f60825df748>,
 'target': target_output,
 'test_err': array([ 0.16223222,  0.16124856,  0.16122107,  0.16119332,  0.16235495,
         0.16136186,  0.16106722,  0.161027  ,  0.16025838,  0.16010919,
         0.05806245]),
 'test_fn': <theano.compile.function_module.Function at 0x7f6081c71550>,
 'train_err': array([ 0.17208107,  0.16726611,  0.16708384,  0.16540762,  0.1668164 ,
         0.16730034,  0.16647114,  0.16644216,  0.16621783,  0.16580767,
         0.16927648]),
 'train_fn': <theano.compile.function_module.Function at 0x7f6081c5e518>}

In [80]:
# Indexing: test_data[k][0 for X, 1 for y][sample], where k is the position of the sequence length in SEQ_LENS

In [87]:
# Count the SEQ_LENS[0] (length 150) test sequences on which model1's squared
# error is below 0.0016, evaluating one sequence at a time.
count = 0
for i in range(TEST_SIZE):
    sample = slice(i, i + 1)  # a length-1 slice keeps the batch axis
    loss = model1["test_fn"](test_data[0][0][sample], test_data[0][1][sample])
    count += int(loss < 0.0016)
print(count, count / TEST_SIZE)

1218 0.1218


In [88]:
# Count the SEQ_LENS[1] (length 200) test sequences on which model1's squared
# error is below 0.0016, evaluating one sequence at a time.
count = 0
for i in range(TEST_SIZE):
    sample = slice(i, i + 1)  # a length-1 slice keeps the batch axis
    loss = model1["test_fn"](test_data[1][0][sample], test_data[1][1][sample])
    count += int(loss < 0.0016)
print(count, count / TEST_SIZE)

1169 0.1169


In [89]:
# Count the SEQ_LENS[2] (length 300) test sequences on which model1's squared
# error is below 0.0016, evaluating one sequence at a time.
count = 0
for i in range(TEST_SIZE):
    sample = slice(i, i + 1)  # a length-1 slice keeps the batch axis
    loss = model1["test_fn"](test_data[2][0][sample], test_data[2][1][sample])
    count += int(loss < 0.0016)
print(count, count / TEST_SIZE)

1075 0.1075


In [90]:
# Count the SEQ_LENS[3] (length 400) test sequences on which model1's squared
# error is below 0.0016, evaluating one sequence at a time.
count = 0
for i in range(TEST_SIZE):
    sample = slice(i, i + 1)  # a length-1 slice keeps the batch axis
    loss = model1["test_fn"](test_data[3][0][sample], test_data[3][1][sample])
    count += int(loss < 0.0016)
print(count, count / TEST_SIZE)

970 0.097


In [141]:
def train_adam(seq_len, Xtrain, ytrain, Xtest, ytest, num_epochs=NUM_EPOCHS):
    """Train the network from ``build_network`` with norm-clipped Adam on MSE.

    Each epoch performs one update per training sequence; every update uses
    ``BATCH_SIZE`` indices drawn with replacement from the training set. The
    test loss is computed on the whole test set after each epoch, and the
    gradient norm of the epoch's last update is printed alongside the losses.

    Parameters
    ----------
    seq_len : int
        Sequence length passed to ``build_network``.
    Xtrain, ytrain : arrays
        Training inputs and targets, indexed along the first axis.
    Xtest, ytest : arrays
        Test inputs and targets.
    num_epochs : int
        Number of epochs to run.

    Returns
    -------
    dict
        The result of ``pack``: per-epoch train/test errors, the network,
        the symbolic input/target variables and the compiled functions.
    """
    print("Building network ...")
    inp = T.tensor3('input', dtype='float64')
    target_values = T.vector('target_output', dtype='float64')
    network = build_network(inp, seq_len)
    print("The network has {} params".format(lasagne.layers.count_params(network)))

    # Size the logs and the loop from the num_epochs argument. Using the
    # global NUM_EPOCHS here made the parameter a no-op.
    train_err = np.zeros(num_epochs)
    test_err = np.zeros(num_epochs)

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(network)
    predicted_values = network_output
    # Our cost will be mean-squared error; the output is transposed so that
    # its batch axis lines up with the target vector
    cost = T.mean((predicted_values.T - target_values)**2)
    # Retrieve all parameters from the network
    all_params = lasagne.layers.get_all_params(network)
    # Compute Adam updates for training
    print("Computing updates ...")
    all_grads = T.grad(cost, all_params)
    scaled_grads, norm = lasagne.updates.total_norm_constraint(all_grads, GRAD_CLIP, return_norm=True)
    updates = lasagne.updates.adam(scaled_grads, all_params)
    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train_fn = theano.function([inp, target_values], [cost, norm], updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([inp, target_values], cost, allow_input_downcast=True)

    # Take the sample count from the data actually passed in. Relying on the
    # global TRAIN_SIZE raises IndexError (or silently uses only part of the
    # data) whenever the global and the array have drifted apart.
    n_train = len(Xtrain)
    # Defined up front so the progress line still prints if no update runs
    norm_tr = float('nan')
    for epoch in range(num_epochs):
        start_time = time.time()
        for _ in range(n_train):
            idx = np.random.randint(n_train, size=BATCH_SIZE)
            err, norm_tr = train_fn(Xtrain[idx, :, :], ytrain[idx])
            train_err[epoch] += err
        # max(..., 1) keeps the average defined for an empty training set
        train_err[epoch] /= max(n_train, 1)
        test_err[epoch] = compute_cost(Xtest, ytest)
        print("Epoch {} test loss = {:.4f} \t train = {:.4f} \t norm = {} \t time = {:.2f}s".format(
                epoch, test_err[epoch], train_err[epoch], norm_tr, time.time() - start_time))
    return pack(train_err, test_err, network, inp, target_values, train_fn, compute_cost)

In [142]:
# Reassign the module-level sizes before the Adam run.
# gen_add_data's default n_batch is not affected: it was bound when the function was defined.
TRAIN_SIZE = 10
BATCH_SIZE = 100

In [144]:
# Adam run on the SEQ_LENS[0] (length 150) data with the default number of epochs
mod = train_adam(SEQ_LENS[0], train_data[0][0], train_data[0][1],
                 test_data[0][0], test_data[0][1])

Building network ...
The network has 10501 params
Computing updates ...
Compiling functions ...
Epoch 0 test loss = 0.5101 	 train = 0.6639 	 norm = 6.125143622326812 	 time = 32.57s
Epoch 1 test loss = 0.3488 	 train = 0.3001 	 norm = 3.800840105018453 	 time = 32.12s
Epoch 2 test loss = 0.1643 	 train = 0.0999 	 norm = 2.5677671656809027 	 time = 32.72s
Epoch 3 test loss = 0.2225 	 train = 0.0812 	 norm = 1.0763219145607437 	 time = 30.37s
Epoch 4 test loss = 0.1804 	 train = 0.0778 	 norm = 1.080205013349289 	 time = 31.14s
Epoch 5 test loss = 0.2011 	 train = 0.0731 	 norm = 0.25422163000423215 	 time = 31.69s
Epoch 6 test loss = 0.1817 	 train = 0.0756 	 norm = 0.3019141106830181 	 time = 31.45s
Epoch 7 test loss = 0.2058 	 train = 0.0746 	 norm = 0.21720167825755626 	 time = 30.35s
Epoch 8 test loss = 0.1937 	 train = 0.0733 	 norm = 0.7964197326509098 	 time = 31.89s
Epoch 9 test loss = 0.1958 	 train = 0.0727 	 norm = 0.650898958294212 	 time = 32.20s
