In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [30]:
X_train = pd.read_csv('../data/processed/rain_X_train.csv', header=None, nrows=200000)
y_train = pd.read_csv('../data/processed/rain_y_train.csv', header=None, nrows=200000)

In [31]:
X_train.rename({0: 'oid'}, axis=1, inplace=True)
y_train.rename({0: 'oid'}, axis=1, inplace=True)

In [32]:
np.max(X_train.oid)

302922.0

In [33]:
cts = pd.DataFrame(X_train.groupby('oid')['oid'].value_counts().droplevel(0))
cts.columns = ['counts']
cts

Unnamed: 0_level_0,counts
oid,Unnamed: 1_level_1
1.0,1
2.0,25
3.0,2
4.0,3
5.0,2
...,...
302914.0,21
302918.0,4
302919.0,3
302921.0,16


In [34]:
seq_lengths = cts.reset_index().groupby('counts')['counts'].value_counts().droplevel(0)
seq_lengths

counts
1      3199
2      3763
3      1975
4      1350
5       933
       ... 
443       1
460       1
478       1
556       1
621       1
Name: counts, Length: 224, dtype: int64

In [35]:
sizes = np.array(seq_lengths.index)

# randomize the order of the sizes for batching
np.random.choice(sizes, sizes.shape[0], replace=False)

array([103,   6, 272, 138, 294,  12, 235, 621,  75,   5,  49,  54,  64,
        57,  61, 108, 176,  96,  77, 146, 162, 172,   3, 153, 136, 141,
       298,  98, 107,  62,  38,  87,  66, 322, 152,  31, 120,  35, 311,
       150, 125,  16, 326,  10,  19, 167,  85,   4, 201,  33,  56,  37,
       149,  47,  52, 137, 202,  29, 158, 117, 119, 135, 143,  45, 268,
        82, 229,  92, 160, 224,  78, 312, 203, 282,  46,  42, 110, 130,
        22,  97,  13, 126, 286, 164,  41, 123, 262,   1, 102,  58,  32,
       183, 157, 290,  71,  83,  72, 274,  39,  27, 173, 144, 154, 168,
        26, 247, 165,  20, 200, 163, 127, 192, 236,  63, 264,   7,  14,
       460, 196, 364,  73,  50,  59,  99,  81,  60,  86, 443, 113,   2,
       198, 185, 139,  34, 133, 118,  65, 242, 205, 187, 145, 556,  18,
        89,  48,  23,  79, 116, 100, 478, 227,  68,  69,  40, 166,  21,
       111,  15, 101, 134, 112, 207,  94, 233, 184, 174,  95,  74, 131,
       208, 115, 105, 191,  28, 169,  11,  44,  70,  90, 175, 14

In [36]:
# use for batching with the same size
oids_by_size_dict = {
    size: cts.query('counts == @size').index.tolist() for size in sizes
}
oids_by_size_dict

{1: [1.0,
  10.0,
  13.0,
  23.0,
  24.0,
  32.0,
  37.0,
  38.0,
  40.0,
  41.0,
  44.0,
  55.0,
  76.0,
  79.0,
  89.0,
  95.0,
  96.0,
  97.0,
  124.0,
  129.0,
  142.0,
  153.0,
  163.0,
  168.0,
  171.0,
  186.0,
  201.0,
  216.0,
  239.0,
  254.0,
  271.0,
  272.0,
  273.0,
  285.0,
  295.0,
  297.0,
  324.0,
  325.0,
  326.0,
  334.0,
  340.0,
  348.0,
  353.0,
  359.0,
  360.0,
  363.0,
  366.0,
  388.0,
  391.0,
  393.0,
  412.0,
  413.0,
  415.0,
  426.0,
  427.0,
  435.0,
  445.0,
  447.0,
  452.0,
  457.0,
  461.0,
  467.0,
  478.0,
  480.0,
  483.0,
  489.0,
  491.0,
  497.0,
  498.0,
  507.0,
  514.0,
  516.0,
  538.0,
  561.0,
  575.0,
  583.0,
  586.0,
  589.0,
  591.0,
  596.0,
  606.0,
  613.0,
  630.0,
  635.0,
  640.0,
  648.0,
  652.0,
  656.0,
  657.0,
  670.0,
  684.0,
  689.0,
  693.0,
  705.0,
  707.0,
  708.0,
  711.0,
  713.0,
  714.0,
  718.0,
  743.0,
  749.0,
  753.0,
  755.0,
  769.0,
  784.0,
  794.0,
  815.0,
  820.0,
  822.0,
  824.0,
  847.0,
  851.0,

In [80]:
def split_at_idx(a: np.ndarray, idx: int):
    '''Splits an array into two at the index given.
    
    Example:
    x = np.arange(20.0)
    while len(x) > 0:
        x_new, x = split_at_idx(x, 3)
        print(x_new)    
    '''
    return a[:idx], a[idx:]

x = np.arange(20.0)
while x.any():
    x_new, x = split_at_idx(x, 3)
    print(x_new)

[0. 1. 2.]
[3. 4. 5.]
[6. 7. 8.]
[ 9. 10. 11.]
[12. 13. 14.]
[15. 16. 17.]
[18. 19.]


In [81]:
def prepare_batch(X, y, oids):
    '''Takes X and y data frames and a list of oids and returns X_batch and y_batch, which
       are (batch_size, time_steps, n_features) shaped numpy arrays
    '''
    X = np.array(X.query('oid in @oids').drop('oid', axis=1))
    y = np.array(y.query('oid in @oids').drop('oid', axis=1))
    n = len(oids)
    time_steps = int(X.shape[0] / n)
    n_features = X.shape[1]

    return np.reshape(X, (n, time_steps, n_features)), np.reshape(y, (n, time_steps, 2))


In [9]:
def train_epoch(X, y, oids_by_size, max_batch_size=32):
    
    # randomize sizes
    rand_sizes = np.random.choice(sizes, sizes.shape[0], replace=False)
    
    for size in sizes:
        oids = oids_by_size[size]
        n_oids = len(oids)
        rand_oids = np.random.choice(oids, n_oids, replace=False)

        while rand_oids.any():
            batch_oids, rand_oids = split_at_idx(rand_oids, max_batch_size)
            X_batch, y_batch = prepare_batch(X, y, batch_oids)
    
            # use Keras train_on_batch here
    
    

SyntaxError: unexpected EOF while parsing (<ipython-input-9-2f7af28c9a68>, line 17)

In [None]:
# from keras.layers import Input, Dense, Lambda, Reshape, InputLayer, concatenate, Flatten
# from keras.layers.normalization import BatchNormalization
# from keras.layers.convolutional import Conv2D, Deconv2D, Conv2DTranspose
# from keras.models import Model, Sequential
# from keras.preprocessing.image import ImageDataGenerator
# from keras.layers.advanced_activations import LeakyReLU
# from keras import backend as K
# from keras import metrics
# from keras.datasets import mnist
# from keras.utils import np_utils

from keras.layers import Input, Dense, TimeDistributed, LSTM, Dropout
from keras.models import Model


In [84]:
dropout_prob = 0.5
dense_sizes = (20, 10)
lstm_size = 30
n_features = X_train.shape[1] - 1
n_features

44

In [75]:
x_in = Input(batch_shape=(None, None, n_features))

layers = Sequential()
layers.add(TimeDistributed(Dense(dense_sizes[0], activation='relu'), input_shape=(None, n_features)))
layers.add(Dropout(dropout_prob))
layers.add(TimeDistributed(Dense(dense_sizes[1], activation='relu'), input_shape=(None, 20)))
layers.add(Dropout(dropout_prob))
layers.add(LSTM(lstm_size, return_sequences=True))
layers.add(LSTM(lstm_size, return_sequences=True))
layers.add(Dropout(dropout_prob))
layers.add(TimeDistributed(Dense(1, activation='sigmoid')))

calculate_r = Dense(1, activation='linear', kernel_regularizer=) 

933


32

In [None]:
# functional programming
x = x_in(X_batch)
y_hat = layers(x)

In [115]:
ids = oids_by_size_dict[5.][:6]
y_example = np.reshape(np.array(y_train.query('oid in @ids').drop('oid', axis=1)), (6, 5, 2))

# because each batch has the same length, we know if it's censored if the last time_step has 
# a 0 for 'event_occurred'
# to create the mask, convert the latest 0's to -1
y_true = y_example[:, :, 0].copy()
y_true[y_true[:, -1] == 0, -1] = -1
# then mask the -1 values
y_true

# mask = K.cast(K.not_equal(y_true, -1), K.floatx())
# K.binary_crossentropy(y_true * mask, y_pred * mask)

array([[ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  0, -1],
       [ 0,  0,  0,  0, -1],
       [ 0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  1]])