# Dynamic RNN ~Time doesn't fly as it does for others~
## Introduction
I have followed this guy's blog post, and modify a bit!  
https://iamtrask.github.io/2015/11/15/anyone-can-code-lstm/  

What is binary adding?  
https://www.khanacademy.org/math/algebra-home/alg-intro-to-algebra/algebra-alternate-number-bases/v/binary-addition  

numpy.unpackbits??  
https://docs.scipy.org/doc/numpy/reference/generated/numpy.unpackbits.html  

numpy.zeros_like?  
https://docs.scipy.org/doc/numpy/reference/generated/numpy.zeros_like.html

## Original one

In [1]:
import copy, numpy as np
np.random.seed(32)

# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)

import time
start_time = time.time()

# binary dictionary, like {key : 1, value : binaryFormatOf(1)}
int2binary = {}
binary_dim = 8

largest_number = pow(2,binary_dim)

# np.unpackbits is converting the integer(np.unit8) into binary format
binary = np.unpackbits(np.array([range(largest_number)],dtype=np.uint8).T,axis=1)
for i in range(largest_number):
    int2binary[i] = binary[i]

# input variables
alpha = 0.1
input_dim = 2
hidden_dim = 16
output_dim = 1


# initialize neural network weights
synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1
synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1
synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1

synapse_0_update = np.zeros_like(synapse_0)
synapse_1_update = np.zeros_like(synapse_1)
synapse_h_update = np.zeros_like(synapse_h)

# training logic
for j in range(100000):
    
    # generate a simple addition problem (a + b = c)
    a_int = np.random.randint(largest_number/2) # int version
    a = int2binary[a_int] # lookup binary dictionary

    b_int = np.random.randint(largest_number/2) # int version
    b = int2binary[b_int] # lookup binary dictionary

    # true answer
    c_int = a_int + b_int
    c = int2binary[c_int] # lookup binary dictionary
    
    # where we'll store our best guess (binary encoded)
    d = np.zeros_like(c)

    overallError = 0
    
    layer_2_deltas = list()
    layer_1_values = list()
    layer_1_values.append(np.zeros(hidden_dim))
    X_prev = np.array(np.zeros_like(a[0],b[0]))
    
    # moving along the positions in the binary encoding
    for position in range(binary_dim):
        
        # generate input and output
        X = np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]])
        y = np.array([[c[binary_dim - position - 1]]]).T

        # hidden layer (input ~+ prev_hidden)
        layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h))

        # output layer (new binary representation)
        layer_2 = sigmoid(np.dot(layer_1,synapse_1))

        # did we miss?... if so, by how much?
        layer_2_error = y - layer_2
        layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2))
        overallError += np.abs(layer_2_error[0])
    
        # decode estimate so we can print it out
        d[binary_dim - position - 1] = np.round(layer_2[0][0])
        
        # store hidden layer so we can use it in the next timestep
        layer_1_values.append(copy.deepcopy(layer_1))
    
    future_layer_1_delta = np.zeros(hidden_dim)
    
    for position in range(binary_dim):
        
        X = np.array([[a[position],b[position]]])
        
        layer_1 = layer_1_values[-position-1]
        prev_layer_1 = layer_1_values[-position-2]
        
        # error at output layer
        layer_2_delta = layer_2_deltas[-position-1]
        # error at hidden layer
        layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)

        # let's update all our weights so we can try again
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        synapse_0_update += X.T.dot(layer_1_delta)
        
        future_layer_1_delta = layer_1_delta
    

    synapse_0 += synapse_0_update * alpha
    synapse_1 += synapse_1_update * alpha
    synapse_h += synapse_h_update * alpha    

    synapse_0_update *= 0
    synapse_1_update *= 0
    synapse_h_update *= 0
    
    # print out progress
    if(j % 1000 == 0):
        print "Error:" + str(overallError)
        print "A  :",a
        print "B  :",b
        print "Pred:" + str(d)
        print "True:" + str(c)
        out = 0
        for index,x in enumerate(reversed(d)):
            out += x*pow(2,index)
        print str(a_int) + " + " + str(b_int) + " = " + str(out)
        print "------------"

print("--- %s seconds ---" % (time.time() - start_time))

Error:[5.32201378]
A  : [0 1 0 0 0 1 0 1]
B  : [0 1 1 1 1 0 1 0]
Pred:[0 0 0 0 0 0 0 0]
True:[1 0 1 1 1 1 1 1]
69 + 122 = 0
------------
Error:[4.08022623]
A  : [0 0 1 0 0 1 0 1]
B  : [0 1 0 0 1 1 1 0]
Pred:[1 0 0 1 1 1 1 1]
True:[0 1 1 1 0 0 1 1]
37 + 78 = 159
------------
Error:[3.86556323]
A  : [0 1 0 0 1 0 0 0]
B  : [0 1 0 1 0 0 0 1]
Pred:[1 0 1 0 0 0 1 1]
True:[1 0 0 1 1 0 0 1]
72 + 81 = 163
------------
Error:[3.52646301]
A  : [0 0 0 0 1 1 0 0]
B  : [0 0 1 1 1 0 1 1]
Pred:[0 0 1 0 0 1 1 1]
True:[0 1 0 0 0 1 1 1]
12 + 59 = 39
------------
Error:[2.0593875]
A  : [0 1 1 1 0 0 1 0]
B  : [0 1 1 0 1 1 0 0]
Pred:[1 1 0 1 1 1 1 0]
True:[1 1 0 1 1 1 1 0]
114 + 108 = 222
------------
Error:[1.64104928]
A  : [0 1 1 1 1 0 0 1]
B  : [0 1 0 0 1 0 1 1]
Pred:[1 1 0 0 0 1 0 0]
True:[1 1 0 0 0 1 0 0]
121 + 75 = 196
------------
Error:[0.98260838]
A  : [0 0 0 1 0 1 1 0]
B  : [0 1 1 1 1 1 0 1]
Pred:[1 0 0 1 0 0 1 1]
True:[1 0 0 1 0 0 1 1]
22 + 125 = 147
------------
Error:[0.6973367]
A  : [0 1 1 0 0

Error:[0.10384695]
A  : [0 1 1 0 1 0 0 1]
B  : [0 1 1 1 1 0 1 1]
Pred:[1 1 1 0 0 1 0 0]
True:[1 1 1 0 0 1 0 0]
105 + 123 = 228
------------
Error:[0.09132138]
A  : [0 1 0 1 0 0 1 1]
B  : [0 0 0 1 1 1 0 1]
Pred:[0 1 1 1 0 0 0 0]
True:[0 1 1 1 0 0 0 0]
83 + 29 = 112
------------
Error:[0.10150781]
A  : [0 1 0 0 1 1 0 1]
B  : [0 0 1 1 0 1 1 0]
Pred:[1 0 0 0 0 0 1 1]
True:[1 0 0 0 0 0 1 1]
77 + 54 = 131
------------
Error:[0.08884209]
A  : [0 1 0 1 1 0 1 0]
B  : [0 1 0 1 0 1 1 1]
Pred:[1 0 1 1 0 0 0 1]
True:[1 0 1 1 0 0 0 1]
90 + 87 = 177
------------
Error:[0.08843171]
A  : [0 0 0 0 1 0 1 1]
B  : [0 1 1 0 0 1 1 0]
Pred:[0 1 1 1 0 0 0 1]
True:[0 1 1 1 0 0 0 1]
11 + 102 = 113
------------
Error:[0.0791186]
A  : [0 1 1 0 1 1 1 1]
B  : [0 1 0 0 1 0 1 0]
Pred:[1 0 1 1 1 0 0 1]
True:[1 0 1 1 1 0 0 1]
111 + 74 = 185
------------
Error:[0.05544353]
A  : [0 1 0 0 0 1 0 1]
B  : [0 0 1 1 1 0 0 0]
Pred:[0 1 1 1 1 1 0 1]
True:[0 1 1 1 1 1 0 1]
69 + 56 = 125
------------
Error:[0.09515739]
A  : [0 0 0 

## Mine!

In [1]:
import copy, numpy as np
np.random.seed(32)

# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)

import time
start_time = time.time()

# binary dictionary, like {key : 1, value : binaryFormatOf(1)}
int2binary = {}
binary_dim = 8

largest_number = pow(2,binary_dim)

# np.unpackbits is converting the integer(np.unit8) into binary format
binary = np.unpackbits(np.array([range(largest_number)],dtype=np.uint8).T,axis=1)
for i in range(largest_number):
    int2binary[i] = binary[i]

# define learning rate and network architecture manually
alpha = 0.1
input_dim = 2
hidden_dim = 16
output_dim = 1


# initialize neural network weights
synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1
synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1
synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1

synapse_0_update = np.zeros_like(synapse_0)
synapse_1_update = np.zeros_like(synapse_1)
synapse_h_update = np.zeros_like(synapse_h)

# training logic
for j in range(100000):
    
    # generate a simple addition problem (a + b = c)
    a_int = np.random.randint(largest_number/2) # int version
    a = int2binary[a_int] # lookup binary dictionary

    b_int = np.random.randint(largest_number/2) # int version
    b = int2binary[b_int] # lookup binary dictionary

    # true answer
    c_int = a_int + b_int
    c = int2binary[c_int] # lookup binary dictionary
    
    # where we'll store our prediction (binary encoded)
    d = np.zeros_like(c)

    overallError = 0
    # Network has only 3 layers
    layer_2_deltas = list()
    layer_1_values = list()
    layer_1_values.append(np.zeros(hidden_dim))
    X_prev = np.array(np.zeros_like(a[0],b[0]))
    
    # moving along the positions in the binary encoding
    for position in range(binary_dim):
        
        # generate input and output
        X = np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]])
        y = np.array([[c[binary_dim - position - 1]]]).T

        # hidden layer (input ~+ prev_hidden)
        norm = np.linalg.norm(X - X_prev)
        print(norm)
        # print(np.log(np.linalg.norm(norm)))
        layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h)*np.exp(norm))

        # output layer (new binary representation)
        layer_2 = sigmoid(np.dot(layer_1,synapse_1))

        # error caluculation
        layer_2_error = y - layer_2
        layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2))
        overallError += np.abs(layer_2_error[0])
    
        # decode estimate so we can print it out
        d[binary_dim - position - 1] = np.round(layer_2[0][0])
        
        # store hidden layer so we can use it in the next timestep
        layer_1_values.append(copy.deepcopy(layer_1))
        X_prev = X
    
    future_layer_1_delta = np.zeros(hidden_dim)
    
    for position in range(binary_dim):
        
        X = np.array([[a[position],b[position]]])
        layer_1 = layer_1_values[-position-1]
        prev_layer_1 = layer_1_values[-position-2]
        
        # error at output layer
        layer_2_delta = layer_2_deltas[-position-1]
        # error at hidden layer
        layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1)

        # let's update all our weights so we can try again
        synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta)
        synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta)
        synapse_0_update += X.T.dot(layer_1_delta)
        
        future_layer_1_delta = layer_1_delta
    

    synapse_0 += synapse_0_update * alpha
    synapse_1 += synapse_1_update * alpha
    synapse_h += synapse_h_update * alpha    

    synapse_0_update *= 0
    synapse_1_update *= 0
    synapse_h_update *= 0
    
    # print out progress
    if(j % 1000 == 0):
        print "Error:" + str(overallError)
        print "A  :",a
        print "B  :",b
        print "Pred:" + str(d)
        print "True:" + str(c)
        out = 0
        for index,x in enumerate(reversed(d)):
            out += x*pow(2,index)
        print str(a_int) + " + " + str(b_int) + " = " + str(out)
        print "------------"

print("--- %s seconds ---" % (time.time() - start_time))

  


Error:[5.56330736]
A  : [0 1 0 0 0 1 0 1]
B  : [0 1 1 1 1 0 1 0]
Pred:[0 0 0 0 0 0 0 0]
True:[1 0 1 1 1 1 1 1]
69 + 122 = 0
------------
Error:[4.22467566]
A  : [0 0 1 0 0 1 0 1]
B  : [0 1 0 0 1 1 1 0]
Pred:[0 1 1 0 0 1 0 1]
True:[0 1 1 1 0 0 1 1]
37 + 78 = 101
------------
Error:[3.80473695]
A  : [0 1 0 0 1 0 0 0]
B  : [0 1 0 1 0 0 0 1]
Pred:[1 1 1 0 0 0 0 1]
True:[1 0 0 1 1 0 0 1]
72 + 81 = 225
------------
Error:[3.64756758]
A  : [0 0 0 0 1 1 0 0]
B  : [0 0 1 1 1 0 1 1]
Pred:[0 0 0 0 0 0 1 1]
True:[0 1 0 0 0 1 1 1]
12 + 59 = 3
------------
Error:[3.9018989]
A  : [0 1 1 1 0 0 1 0]
B  : [0 1 1 0 1 1 0 0]
Pred:[0 1 1 0 0 0 1 0]
True:[1 1 0 1 1 1 1 0]
114 + 108 = 98
------------
Error:[4.03699732]
A  : [0 1 1 1 1 0 0 1]
B  : [0 1 0 0 1 0 1 1]
Pred:[1 1 1 1 0 0 0 1]
True:[1 1 0 0 0 1 0 0]
121 + 75 = 241
------------
Error:[3.91530284]
A  : [0 0 0 1 0 1 1 0]
B  : [0 1 1 1 1 1 0 1]
Pred:[0 0 0 1 0 1 0 1]
True:[1 0 0 1 0 0 1 1]
22 + 125 = 21
------------
Error:[4.42656177]
A  : [0 1 1 0 0 1

Error:[4.21094076]
A  : [0 1 1 0 1 0 0 1]
B  : [0 1 1 1 1 0 1 1]
Pred:[0 1 1 1 1 0 0 1]
True:[1 1 1 0 0 1 0 0]
105 + 123 = 121
------------
Error:[3.57294456]
A  : [0 1 0 1 0 0 1 1]
B  : [0 0 0 1 1 1 0 1]
Pred:[0 1 0 1 1 0 0 1]
True:[0 1 1 1 0 0 0 0]
83 + 29 = 89
------------
Error:[3.96801719]
A  : [0 1 0 0 1 1 0 1]
B  : [0 0 1 1 0 1 1 0]
Pred:[0 0 1 0 0 1 0 1]
True:[1 0 0 0 0 0 1 1]
77 + 54 = 37
------------
Error:[4.00463569]
A  : [0 1 0 1 1 0 1 0]
B  : [0 1 0 1 0 1 1 1]
Pred:[0 1 0 0 0 0 1 1]
True:[1 0 1 1 0 0 0 1]
90 + 87 = 67
------------
Error:[3.70265804]
A  : [0 0 0 0 1 0 1 1]
B  : [0 1 1 0 0 1 1 0]
Pred:[0 1 1 0 0 0 1 1]
True:[0 1 1 1 0 0 0 1]
11 + 102 = 99
------------
Error:[4.14489117]
A  : [0 1 1 0 1 1 1 1]
B  : [0 1 0 0 1 0 1 0]
Pred:[0 1 0 0 1 0 1 1]
True:[1 0 1 1 1 0 0 1]
111 + 74 = 75
------------
Error:[3.3946783]
A  : [0 1 0 0 0 1 0 1]
B  : [0 0 1 1 1 0 0 0]
Pred:[0 0 1 1 0 1 0 1]
True:[0 1 1 1 1 1 0 1]
69 + 56 = 53
------------
Error:[3.98497352]
A  : [0 0 0 0 0 1 

### Summary
I would like to make RNN dynamic by changing the interval between each layer.  
So i proposed that norm between $X_t$ and $X_{t+1}$ will can measure the length between $X_t$ and $X_{t+1}$.

$$ || X_{t+1} - X_t ||$$


So i have chenged the code like below.  
But it turns out that training time takes longer than before,  
and, accuracy was terible...  
Thing was as you can see, the norm is frequently "0".  
so it vanishes the part from previous hidden layer.  

In [3]:
# layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h)*np.exp(norm))