In [1]:
import pandas as pd
import numpy as np

In [3]:
import keras

In [None]:
from sklearn.model_selection import train_test_split

**Prepare Data**

In [4]:
def batch_generator(inputs, outputs, sizes=range(1, 205)):
    """Endlessly yield one ``(batch, targets)`` pair per sequence length.

    Sequences are bucketed by ``len(sequence)`` once, before the first batch
    is produced, so the data is not rescanned for every length on every pass.
    Each yielded batch only contains sequences of a single length, which lets
    them be stacked into a dense array without padding.

    Parameters
    ----------
    inputs : iterable of sequences
        Variable-length input sequences.
    outputs : iterable
        One scalar target per input sequence, paired with ``inputs`` by
        position (extra items on the longer side are ignored, as with ``zip``).
    sizes : iterable of int, optional
        Sequence lengths to cycle through, in order. Defaults to 1..204.
        Lengths for which no sequence exists are skipped instead of failing.

    Yields
    ------
    batch : numpy.ndarray
        Inputs of one length, shaped ``(n_sequences, length, 1)``.
    output_batch : numpy.ndarray
        Matching targets, shaped ``(n_sequences, 1)``.

    Raises
    ------
    ValueError
        If no input sequence has a length listed in ``sizes``.
    """
    # length -> (sequences, targets), filled in order of appearance.
    buckets = {}
    for sequence, target in zip(inputs, outputs):
        xs, ys = buckets.setdefault(len(sequence), ([], []))
        xs.append(sequence)
        ys.append(target)

    populated_sizes = [size for size in sizes if size in buckets]
    if not populated_sizes:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no input sequence has a length listed in `sizes`")

    while True:
        for size in populated_sizes:
            xs, ys = buckets[size]
            # Arrays are rebuilt on every pass rather than cached, so only one
            # dense batch is alive at a time.
            batch = np.array(xs).reshape((len(xs), size, 1))
            output_batch = np.array(ys).reshape((len(ys), 1))

            yield batch, output_batch


def format_sequences_to_input(sequences):
    """Expand every row of ``sequences`` into all of its prefixes.

    The ``"opponent"`` and ``"gene_204"`` columns are dropped. For each prefix
    length ``k`` from 1 up to the number of remaining columns, the first ``k``
    values of every row are emitted: all rows for ``k == 1`` first, then all
    rows for ``k == 2``, and so on.

    Parameters
    ----------
    sequences : pandas.DataFrame
        Must contain the ``"opponent"`` and ``"gene_204"`` columns.

    Returns
    -------
    numpy.ndarray
        1-D object array with ``n_rows * n_columns`` entries; each entry is a
        1-D slice of one row. An empty frame gives an empty array.
    """
    inputs = sequences.drop(columns=["opponent", "gene_204"]).values
    # ``.values`` is always 2-D, so every row has exactly ``max_length`` items.
    n_rows, max_length = inputs.shape

    # The prefixes have different lengths. ``np.array`` on such a ragged list
    # raises on recent NumPy releases, so the object array is filled by hand.
    prep_X_train = np.empty(n_rows * max_length, dtype=object)
    position = 0
    for histories in range(1, max_length + 1):
        for sequence in inputs:
            prep_X_train[position] = sequence[:histories]
            position += 1

    return prep_X_train


def format_sequences_to_output(sequences):
    """Flatten the target table into one scalar target per (step, row) pair.

    The ``"opponent"`` and ``"gene_0"`` columns are dropped. The remaining
    values are returned column by column: every row's value from the first
    remaining column, then every row's value from the second, and so on.

    Parameters
    ----------
    sequences : pandas.DataFrame
        Must contain the ``"opponent"`` and ``"gene_0"`` columns.

    Returns
    -------
    numpy.ndarray
        1-D array with ``n_rows * n_columns`` entries. An empty frame gives an
        empty array.
    """
    targets = sequences.drop(columns=["opponent", "gene_0"]).values
    # Column-major flattening gives the "for each step, for each row" order.
    # ``flatten`` always copies, so the result never aliases the frame's data.
    return targets.flatten(order="F")

In [6]:
# Load the target table (the first CSV column becomes the index) and turn it
# into the target array with format_sequences_to_output.
outputs = pd.read_csv('data/targets.csv', index_col=0)
y = format_sequences_to_output(outputs)

In [8]:
# Number of target entries produced by format_sequences_to_output.
len(y)

1122612

In [10]:
# Load the sequence table (the first CSV column becomes the index) and expand
# each row into its prefixes with format_sequences_to_input.
sequences = pd.read_csv('data/sequences.csv', index_col=0)
inputs = format_sequences_to_input(sequences)

In [16]:
# Spot-check a single expanded input prefix.
inputs[10000]

array([1, 1])

In [18]:
# Hold out 20% of the (input, target) pairs; the fixed random_state makes the
# split repeatable across runs.
input_train, input_test, output_train, output_test = train_test_split(
    inputs, y, test_size=0.2, random_state=0
)

In [19]:
# Endless batch generators over the training and held-out pairs; each batch
# groups sequences of one length.
trainGen = batch_generator(input_train, output_train)
testGen = batch_generator(input_test, output_test)

In [28]:
# Pull one batch to inspect its layout.
# NOTE: this rebinds `y`, which until now held the full target array built by
# format_sequences_to_output; re-running the split cell afterwards would use
# the batch instead.
x, y = next(trainGen)

In [29]:
# Input batch drawn from trainGen.
x

array([[[1],
        [1],
        [1]],

       [[1],
        [1],
        [1]],

       [[1],
        [1],
        [1]],

       ...,

       [[1],
        [1],
        [1]],

       [[1],
        [1],
        [1]],

       [[1],
        [1],
        [1]]])

In [30]:
# Target batch drawn from trainGen.
y

array([[1],
       [1],
       [1],
       ...,
       [1],
       [1],
       [1]])

**Creating the model**

In [31]:
import tensorflow as tf
import numpy as np

In [32]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Dropout, CuDNNLSTM

In [34]:
# One-unit CuDNN LSTM over variable-length, single-feature sequences that
# emits an output at every time step, followed by dropout and a sigmoid unit.
# NOTE(review): with return_sequences=True the model output is 3-D, while the
# batch_generator defined in the "Prepare Data" section yields 2-D targets of
# shape (n, 1) - confirm which generator this model is meant to train on.
model = Sequential(
    [
        CuDNNLSTM(1, return_sequences=True, input_shape=(None, 1)),
        Dropout(rate=0.2),
        Dense(1, activation='sigmoid'),
    ]
)

In [36]:
# Binary targets: train with binary cross-entropy using Adam and report
# accuracy alongside the loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [37]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_2 (CuDNNLSTM)     (None, None, 1)           16        
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 1)           0         
_________________________________________________________________
dense_2 (Dense)              (None, None, 1)           2         
Total params: 18
Trainable params: 18
Non-trainable params: 0
_________________________________________________________________


In [39]:
# NOTE(review): training is disabled here, yet the plotting cells further down
# read `history`, which is only assigned by this call. They also read
# 'val_acc' / 'val_loss', and this call passes no validation data - confirm
# the intended arguments before re-enabling.
# history = model.fit_generator(trainGen,
#                               steps_per_epoch=202,
#                               epochs=1,
#                               verbose=1,
#                               )

**Predict**

In [None]:
# Predict for one sequence of three steps, reshaped to (batch=1, steps=3, features=1).
model.predict(np.array([0, 1, 1]).reshape(1, 3, 1))

In [None]:
# Predict for one sequence of seven steps, reshaped to (batch=1, steps=7, features=1).
model.predict(np.array([0, 1, 1, 1, 0, 0, 0]).reshape(1, 7, 1))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training vs. validation accuracy, saved as a PDF.
# NOTE(review): `history` is only assigned by the fit_generator call that is
# currently commented out, so this cell fails on a fresh kernel until that
# call is restored (with validation data, for the 'val_acc' curve).
fig, ax = plt.subplots()

# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes" state.
ax.plot(history.history['acc'], label='accuracy', color='red', linestyle='--')
ax.plot(history.history['val_acc'], label=' validation accuracy')

ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.legend()

fig.savefig('accuracy_plot.pdf')

In [None]:
# Training vs. validation loss.
# NOTE(review): `history` is only assigned by the fit_generator call that is
# currently commented out, so this cell fails on a fresh kernel until that
# call is restored (with validation data, for the 'val_loss' curve).
fig, ax = plt.subplots()

# Draw on the axes created above instead of relying on pyplot's implicit
# "current axes" state.
ax.plot(history.history['loss'], label='loss', color='red', linestyle='--')
ax.plot(history.history['val_loss'], label=' validation loss')

ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.legend();

**LSTM**

In [40]:

def batch_generator(inputs, outputs, sizes=range(1, 205)):
    """Endlessly yield one ``(batch, targets)`` pair per sequence length.

    Sequences are bucketed by ``len(sequence)`` once, before the first batch
    is produced, so the data is not rescanned for every length on every pass.
    Each yielded batch only contains sequences of a single length, which lets
    them be stacked into a dense array without padding.

    Parameters
    ----------
    inputs : iterable of sequences
        Variable-length input sequences.
    outputs : iterable
        One target per input sequence, paired with ``inputs`` by position
        (extra items on the longer side are ignored, as with ``zip``).
        Targets within one bucket must stack into a regular array.
    sizes : iterable of int, optional
        Input lengths to cycle through, in order. Defaults to 1..204.
        Lengths for which no sequence exists are skipped instead of failing.

    Yields
    ------
    batch : numpy.ndarray
        Inputs of one length, shaped ``(n_sequences, length, 1)``.
    output_batch : numpy.ndarray
        Matching targets, shaped ``(n_sequences, target_steps, 1)``.

    Raises
    ------
    ValueError
        If no input sequence has a length listed in ``sizes``.
    """
    # length -> (sequences, targets), filled in order of appearance.
    buckets = {}
    for sequence, target in zip(inputs, outputs):
        xs, ys = buckets.setdefault(len(sequence), ([], []))
        xs.append(sequence)
        ys.append(target)

    populated_sizes = [size for size in sizes if size in buckets]
    if not populated_sizes:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("no input sequence has a length listed in `sizes`")

    while True:
        for size in populated_sizes:
            xs, ys = buckets[size]
            # Arrays are rebuilt on every pass rather than cached, so only one
            # dense batch is alive at a time.
            batch = np.array(xs).reshape((len(xs), size, 1))

            output_batch = np.array(ys)
            # Scalar targets stack into a 1-D array; treat them as one step.
            target_steps = output_batch.shape[1] if output_batch.ndim > 1 else 1
            output_batch = output_batch.reshape((len(ys), target_steps, 1))

            yield batch, output_batch


def format_sequences_to_input(sequences):
    """Expand every row of ``sequences`` into all of its prefixes.

    The ``"opponent"`` and ``"gene_204"`` columns are dropped. For each prefix
    length ``k`` from 1 up to the number of remaining columns, the first ``k``
    values of every row are emitted: all rows for ``k == 1`` first, then all
    rows for ``k == 2``, and so on.

    Parameters
    ----------
    sequences : pandas.DataFrame
        Must contain the ``"opponent"`` and ``"gene_204"`` columns.

    Returns
    -------
    numpy.ndarray
        1-D object array with ``n_rows * n_columns`` entries; each entry is a
        1-D slice of one row. An empty frame gives an empty array.
    """
    inputs = sequences.drop(columns=["opponent", "gene_204"]).values
    # ``.values`` is always 2-D, so every row has exactly ``max_length`` items.
    n_rows, max_length = inputs.shape

    # The prefixes have different lengths. ``np.array`` on such a ragged list
    # raises on recent NumPy releases, so the object array is filled by hand.
    prep_X_train = np.empty(n_rows * max_length, dtype=object)
    position = 0
    for histories in range(1, max_length + 1):
        for sequence in inputs:
            prep_X_train[position] = sequence[:histories]
            position += 1

    return prep_X_train


def format_sequences_to_output(sequences):
    """Expand every row of the target table into all of its prefixes.

    The ``"opponent"`` and ``"gene_0"`` columns are dropped. For each prefix
    length ``k`` from 1 up to the number of remaining columns, the first ``k``
    values of every row are emitted: all rows for ``k == 1`` first, then all
    rows for ``k == 2``, and so on.

    Parameters
    ----------
    sequences : pandas.DataFrame
        Must contain the ``"opponent"`` and ``"gene_0"`` columns.

    Returns
    -------
    numpy.ndarray
        1-D object array with ``n_rows * n_columns`` entries; each entry is a
        1-D slice of one row. An empty frame gives an empty array.
    """
    targets = sequences.drop(columns=["opponent", "gene_0"]).values
    # ``.values`` is always 2-D, so every row has exactly ``max_length`` items.
    n_rows, max_length = targets.shape

    # The prefixes have different lengths. ``np.array`` on such a ragged list
    # raises on recent NumPy releases, so the object array is filled by hand.
    prep_y_train = np.empty(n_rows * max_length, dtype=object)
    position = 0
    for histories in range(1, max_length + 1):
        for sequence in targets:
            prep_y_train[position] = sequence[:histories]
            position += 1

    return prep_y_train

In [60]:
# Reload the target table (the first CSV column becomes the index) and expand
# it with the format_sequences_to_output defined in this section.
outputs = pd.read_csv('data/targets.csv', index_col=0)
y = format_sequences_to_output(outputs)

In [61]:
# Reload the sequence table, expand each row into its prefixes, and hold out
# 20% of the (input, target) pairs with a fixed random_state.
sequences = pd.read_csv('data/sequences.csv', index_col=0)
inputs = format_sequences_to_input(sequences)
input_train, input_test, output_train, output_test = train_test_split(
    inputs, y, test_size=0.2, random_state=0
)

In [43]:
# Endless batch generators over the training and held-out pairs; each batch
# groups sequences of one length.
trainGen = batch_generator(input_train, output_train)
testGen = batch_generator(input_test, output_test)

In [44]:
# Pull one batch to inspect its layout.
# NOTE: this rebinds `y`, which until now held the full target array built by
# format_sequences_to_output; re-running the split cell afterwards would use
# the batch instead.
x, y = next(trainGen)

In [64]:
# Spot-check one target entry.
# NOTE(review): the recorded output is a flat 204-element vector, which looks
# like an entry of the full target array rather than of the batch bound to `y`
# by the preceding cell; the execution counts suggest the cells were run out
# of order - confirm on a fresh kernel.
y[-4]

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [66]:
# Spot-check the expanded input at the same position.
inputs[-4]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])