# OpenAI - Request for Research 2.0
## Warmup 1
https://blog.openai.com/requests-for-research-2/

Train an LSTM to solve the XOR problem: that is, given a sequence of bits, determine its parity. The LSTM should consume the sequence, one bit at a time, and then output the correct answer at the sequence’s end. Test the two approaches below:
* Generate a dataset of random 100,000 binary strings of length 50. Train the LSTM; what performance do you get?
* Generate a dataset of random 100,000 binary strings, where the length of each string is independently and randomly chosen between 1 and 50. Train the LSTM. Does it succeed? What explains the difference?

# Dependencies

In [8]:
import numpy as np

from keras import optimizers
from keras.callbacks import Callback
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
import keras.backend as K

# Training Data

In [9]:
max_len = 50
batch_size = 128
training_size_v1 = 100000
training_size_v2 = 10000
test_size = 10000

In [10]:
def gen_training_example(training_size=100000, str_len=50):
    """
    1 << n is the first number with n+1 digits
    in other words, (1 << n) - 1 is the last number with n digits
    """
    low = 0
    high = (1 << str_len)
    for __ in range(training_size):
        num = np.random.randint(low, high)
        bits = np.binary_repr(num, width=str_len)
        
        X = np.zeros((str_len, 2))
        Y = np.zeros((str_len, 2))
        
        parity = 0
        
        for i, bit in enumerate(bits):
            parity ^= int(bit)
            X[i, int(bit)] = 1
            Y[i, parity] = 1
        
        yield X, Y

In [11]:
def load_data_v1(training_size=100000, string_len=50, padding=True):
    np.random.seed(0)
    inputs = np.zeros((training_size, string_len, 2))
    outputs = np.zeros((training_size, string_len, 2))
    for i, (X, Y) in enumerate(gen_training_example(training_size, string_len)):
        inputs[i,:,:] = X
        outputs[i,:,:] = Y
    
    return inputs, outputs

In [25]:
def load_data_v2(training_size=10000, string_len=50, padding=False):
    np.random.seed(0)
    
    for __ in range(training_size):
        seq_len = np.random.randint(1, string_len + 1)        
        inputs = np.zeros((1, seq_len, 2))
        outputs = np.zeros((1, seq_len, 2))
        
        for j, (X, Y) in enumerate(gen_training_example(1, seq_len)):
            inputs[j,:,:] = X
            outputs[j,:,:] = Y
        yield inputs, outputs

# RNN Model

In [13]:
def XOR_Model(input_shape):
    binary_string = Input(shape=input_shape, dtype='float32')
    X = LSTM(1, kernel_initializer='glorot_normal', return_sequences=True)(binary_string)
    X = Dense(2, activation=K.softmax)(X)
    X = Activation(K.softmax)(X)
    
    model = Model(inputs=[binary_string], outputs=[X])
    return model

# Test 1: Fixed Training Size
A basic LSTM (hidden unit size 2) is trained with fixed training input sequence length of 50.
This model achieves 100% accuracy within 6-8 epochs of training with batch gradient descent.

In [14]:
model = XOR_Model(input_shape=(max_len, 2))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 50, 2)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 1)             16        
_________________________________________________________________
dense_2 (Dense)              (None, 50, 2)             4         
_________________________________________________________________
activation_2 (Activation)    (None, 50, 2)             0         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
np.random.seed(1)
X_train, Y_train = load_data_v1(training_size_v1, max_len, True)
X_test, Y_test = load_data_v1(test_size, max_len, True)
print('input shape: {} // output shape: {}'.format(X_train.shape, Y_train.shape))

input shape: (100000, 50, 2) // output shape: (100000, 50, 2)


In [17]:
# should perform close to 100%
model.fit(X_train, Y_train, epochs=7, validation_data=(X_test, Y_test), batch_size=batch_size)

Train on 100000 samples, validate on 10000 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f8a70ab5a20>

# Test 2: Variable Training Size
The same model is trained, but with variable training input sequence length between 1 and 50 inclusive (uniform distribution).
Randomly selected input doesn't allow us to take advantage of vectorization, and therefore this model is trained with stochastic gradient descent with epoch size of 5 for each example.
This model achieves 100% accuracy in both training/test sets after training on ~2000 examples with 5 epoch.

In [21]:
model_v2 = XOR_Model(input_shape=(None, 2))
model_v2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, None, 2)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 1)           16        
_________________________________________________________________
dense_4 (Dense)              (None, None, 2)           4         
_________________________________________________________________
activation_4 (Activation)    (None, None, 2)           0         
Total params: 20
Trainable params: 20
Non-trainable params: 0
_________________________________________________________________


In [27]:
model_v2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
# Hack to handle variable length sequence in arbitrary order
for i, (X_train_v2, Y_train_v2) in enumerate(load_data_v2(training_size_v2, max_len, False)):
    if i % 2000 == 0:
        print('{} examples trained (each with 5 epoch)'.format(i+1))
        model_v2.fit(X_train_v2, Y_train_v2, epochs=5, batch_size=1)
    else:
        model_v2.fit(X_train_v2, Y_train_v2, epochs=5, batch_size=1, verbose=0)

1 examples trained (each with 5 epoch)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2001 examples trained (each with 5 epoch)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
4001 examples trained (each with 5 epoch)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
6001 examples trained (each with 5 epoch)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
8001 examples trained (each with 5 epoch)
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
model_v2.evaluate(X_test, Y_test, batch_size=32)



[0.3132616877555847, 1.0]

# For Debugging

## Display Layer Weights

In [None]:
for layer in model.layers:
    weights = layer.get_weights()
    print(weights)

## Test Arbitrary Example

In [None]:
i = np.random.randint(0, test_size)
x = X_test[i]
y = Y_test[i]
x = x.reshape((1, max_len, 2))
print('y: {} // y_hat: {}'.format(y[0],model.predict(x)[0,0]))