In [1]:
import pandas as pd
# Read the raw training table from the working directory into a DataFrame.
train = pd.read_csv("train.csv")

In [2]:
import numpy as np
# Downcast every column except the first one to float32; the first column
# keeps whatever dtype read_csv gave it.
float_cols = train.columns[1:]
train = train.astype({col: np.float32 for col in float_cols})

train.shape

(13765201, 24)

In [3]:
# An Id is "good" when at least one of its rows has a non-missing Ref value.
# Keep every row of the good Ids and drop all rows of the others.
ref_present = train['Ref'].notna()
good_ids = set(train.loc[ref_present, 'Id'])
train = train.loc[train['Id'].isin(good_ids)]

train.shape

(9125329, 24)

In [4]:
# Replace the remaining NaNs with 0.0 and renumber the rows 0..n-1
# (the preceding Id filter left gaps in the index).
train = train.fillna(0.0).reset_index(drop=True)
train.head()

Unnamed: 0,Id,minutes_past,radardist_km,Ref,Ref_5x5_10th,Ref_5x5_50th,Ref_5x5_90th,RefComposite,RefComposite_5x5_10th,RefComposite_5x5_50th,...,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th,Expected
0,2,1.0,2.0,9.0,5.0,7.5,10.5,15.0,10.5,16.5,...,0.998333,0.375,-0.125,0.3125,0.875,1.059998,-1.410004,-0.350006,1.059998,1.016001
1,2,6.0,2.0,26.5,22.5,25.5,31.5,26.5,26.5,28.5,...,1.005,0.0625,-0.1875,0.25,0.6875,0.0,0.0,0.0,1.409988,1.016001
2,2,11.0,2.0,21.5,15.5,20.5,25.0,26.5,23.5,25.0,...,1.001667,0.3125,-0.0625,0.3125,0.625,0.349991,0.0,-0.350006,1.759995,1.016001
3,2,16.0,2.0,18.0,14.0,17.5,21.0,20.5,18.0,20.5,...,1.001667,0.25,0.125,0.375,0.6875,0.349991,-1.059998,0.0,1.059998,1.016001
4,2,21.0,2.0,24.5,16.5,21.0,24.5,24.5,21.0,24.0,...,0.998333,0.25,0.0625,0.1875,0.5625,-0.350006,-1.059998,-0.350006,1.759995,1.016001


In [5]:
# Exclusive upper bound on the target: rows whose Expected value is at or
# above it are removed from the training frame.
THRESHOLD = 80
below_threshold = train['Expected'].lt(THRESHOLD)
train = train.loc[below_threshold]

# Histogram of the targets that survived the cut.
train['Expected'].plot.hist(alpha=0.5, bins=450)

train.shape

(8931892, 24)

In [7]:
# One training sequence per Id: all rows sharing an Id form one sample.
train_groups = train.groupby("Id")
train_size = len(train_groups)

N_FEATURES = 22
MAX_SEQ_LENGTH = train_groups.size().max()

# Both arrays start as zeros, so samples with fewer than MAX_SEQ_LENGTH rows
# stay zero-padded at the end of the time axis.
y_train = np.zeros(train_size, dtype=np.float32)
X_train = np.zeros((train_size, MAX_SEQ_LENGTH, N_FEATURES), dtype=np.float32)

for row, (_group_id, frame) in enumerate(train_groups):
    block = frame.values
    n_steps = block.shape[0]
    # Columns 1..22 are the features (column 0 is Id); column 23 is Expected,
    # read from the first row of the group.
    X_train[row, :n_steps, :] = block[:, 1:23]
    y_train[row] = block[0, 23]

del train_groups
X_train.shape, y_train.shape

((715302, 19, 22), (715302,))

##### Data Augmentation - Let's triple the data set size and lengthen each sequence by 5 time steps (from MAX_SEQ_LENGTH to MAX_SEQ_LENGTH + 5) by repeating randomly chosen time steps.

In [8]:
import random

# Augmentation settings.
#   AUG_COPIES      - number of augmented variants produced per original sequence.
#   AUG_EXTRA_STEPS - number of distinct time steps repeated in each variant,
#                     so each variant is AUG_EXTRA_STEPS rows longer than its source.
#   AUG_SEED        - seed of the private RNG, so the augmentation is reproducible.
AUG_COPIES = 3
AUG_EXTRA_STEPS = 5
AUG_SEED = 42

# A private generator keeps the module-level `random` state untouched.
aug_rng = random.Random(AUG_SEED)

# Read the sequence length from the data instead of hard-coding it, so the
# sampling range always matches the time axis of X_train.
n_time_steps = X_train.shape[1]

aug_X_train = []
for sam in X_train:
    for copy_idx in range(AUG_COPIES):
        # Choose AUG_EXTRA_STEPS distinct time steps; each chosen row is emitted
        # twice in a row, every other row once, and the original order is kept.
        picked = aug_rng.sample(range(n_time_steps), AUG_EXTRA_STEPS)
        repeats = np.ones(n_time_steps, dtype=np.intp)
        repeats[picked] = 2
        # Rows stay NumPy arrays in X_train's dtype rather than being expanded
        # into nested Python lists of floats.
        aug_X_train.append(np.repeat(sam, repeats, axis=0))

In [9]:
# Stack the list of augmented samples into a single ndarray.
aug_X_train = np.asarray(aug_X_train)

In [10]:
# Display the dimensions of the augmented array.
aug_X_train.shape

(2145906, 24, 22)

In [11]:
# Each target is repeated 3 times back-to-back (y0, y0, y0, y1, y1, y1, ...),
# matching the three consecutive augmented variants built per training sample.
aug_y_train = np.repeat(y_train, 3)

Test Set

In [15]:
test = pd.read_csv("test.csv")
test[test.columns[1:]] = test[test.columns[1:]].astype(np.float32)

# Convert all NaNs to zero
test = test.fillna(0.0)
test = test.reset_index(drop=True)

test_groups = test.groupby("Id")
test_size = len(test_groups)

# Fixed time-axis length of the test tensor; this is the same literal 19 that
# the model's input shape is built with. A group with more than 19 rows would
# make the assignment in the loop below fail.
MAX_SEQ_LENGTH = 19

X_test = np.zeros((test_size, MAX_SEQ_LENGTH, N_FEATURES), dtype=np.float32)

# Record each Id while its row of X_test is filled. groupby iterates in sorted
# key order, whereas Series.unique() returns order of first appearance, so
# taking the Ids from unique() would only line up with X_test when test.csv
# happens to be sorted by Id.
test_ids = np.empty(test_size, dtype=test['Id'].dtype)

i = 0
for group_id, group in test_groups:
    X = group.values
    seq_len = X.shape[0]
    # Column 0 is Id; the next 22 columns are the features.
    X_test[i,:seq_len,:] = X[:,1:23]
    test_ids[i] = group_id
    i += 1
    del X
    
del test_groups
X_test.shape

(717625, 19, 22)

In [16]:
from keras.layers import (
    Input,
    Dense,
    LSTM,
    AveragePooling1D,
    TimeDistributed,
    Flatten,
    Bidirectional,
    Dropout
)
from keras.models import Model

from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Training hyper-parameters.
# NOTE(review): 2028 is an unusual batch size - confirm 2048 was not intended.
BATCH_SIZE = 2028
N_EPOCHS = 30

# Stop training once val_loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    min_delta=0,
)

# Multiply the learning rate by 0.1 when val_loss has not improved by at least
# 0.01 for 3 consecutive epochs.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=3,
    factor=0.1,
    min_delta=0.01,
)

def get_model_deep(shape=(19, 22)):
    """Build the (uncompiled) sequence-regression network.

    Architecture, in order:
      * Dense(16) applied to the input sequence,
      * two identical stages, each a Bidirectional LSTM(32) that returns the
        full sequence followed by a TimeDistributed Dense(32),
      * AveragePooling1D with its default settings, Flatten, Dropout(0.5),
      * a final Dense(1) producing one value per sample.

    No activation argument is passed to any Dense layer.

    Parameters
    ----------
    shape : tuple
        Input shape without the batch axis; defaults to (19, 22).

    Returns
    -------
    Model
        Keras model mapping the input tensor to the Dense(1) output.
    """
    n_recurrent_stages = 2

    inp = Input(shape)
    hidden = Dense(16)(inp)

    # Layers are created in the same order as writing the two stages out by hand.
    for _stage in range(n_recurrent_stages):
        hidden = Bidirectional(LSTM(32, return_sequences=True))(hidden)
        hidden = TimeDistributed(Dense(32))(hidden)

    hidden = AveragePooling1D()(hidden)
    hidden = Flatten()(hidden)
    hidden = Dropout(0.5)(hidden)
    out = Dense(1)(hidden)

    return Model(inp, out)

# Build the network for 19-step, 22-feature sequences and train it with
# mean absolute error as the loss.
model = get_model_deep(shape=(19, 22))
model.compile(loss='mae', optimizer='adam')
model.summary()

# NOTE(review): training uses X_train / y_train; the aug_X_train / aug_y_train
# arrays built earlier in the notebook are not passed to fit().
# The last 20% of the samples are held out for validation.
fit_callbacks = [early_stopping, reduce_lr]
model.fit(
    x=X_train,
    y=y_train,
    epochs=N_EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    callbacks=fit_callbacks,
)

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 19, 22)            0         
_________________________________________________________________
dense_7 (Dense)              (None, 19, 16)            368       
_________________________________________________________________
bidirectional_3 (Bidirection (None, 19, 64)            12544     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 19, 32)            2080      
_________________________________________________________________
bidirectional_4 (Bidirection (None, 19, 64)            16640     
_________________________________________________________________
time_distributed_4 (TimeDist (None, 19, 32)            2080      
_________________________________________________________________
average_pooling1d_3 (Average (None, 9, 32)             0   

<keras.callbacks.History at 0x123be6198>

In [17]:
# Predict on the test tensor, flatten the predictions to 1-D, and write them
# next to their Ids as a two-column CSV without the index column.
y_pred = model.predict(X_test, batch_size=BATCH_SIZE)
flat_pred = y_pred.ravel()
submission = pd.DataFrame({'Id': test_ids, 'Expected': flat_pred})
submission.to_csv('submission.csv', index=False)

In [18]:
# Display (rows, columns) of the submission frame.
submission.shape

(717625, 2)