In [1]:
import pandas as pd
import pandas_profiling as pp


In [2]:
import os
data_path = '/home/srgrace/genericContest_data/cat-in-the-dat-ii'

train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))

In [3]:
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [4]:
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,F,Y,Blue,Polygon,Axolotl,Finland,...,ca9ad1d4b,fced9e114,3.0,Novice,Boiling Hot,f,U,oU,3.0,9.0
1,600001,0.0,0.0,0.0,F,Y,Red,Circle,Lion,Russia,...,060a21580,7ca8775da,1.0,Novice,Cold,n,N,,2.0,8.0
2,600002,0.0,0.0,0.0,F,Y,Blue,Circle,Axolotl,Russia,...,165e81a00,5940334c9,1.0,Expert,Warm,i,N,DN,2.0,6.0
3,600003,1.0,0.0,0.0,F,N,Red,Polygon,Axolotl,Costa Rica,...,77d41330d,6fbdeefc8,1.0,Expert,Hot,m,B,AG,1.0,6.0
4,600004,0.0,0.0,1.0,F,Y,Red,Circle,,Finland,...,2218d9dfe,2a27c8fde,1.0,Contributor,Lava Hot,o,J,DT,3.0,3.0


In [5]:
import warnings
warnings.filterwarnings("ignore")

import gc
import joblib
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils



In [6]:
test['target'] = -1
data = pd.concat([train, test]).reset_index(drop=True)

features = [x for x in train.columns if x not in ['id', 'target']]

for feat in features:
    label_enc = preprocessing.LabelEncoder()
    data[feat] = label_enc.fit_transform(data[feat].fillna('-1').astype(str).values)

In [7]:
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [8]:
train.sample(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
45976,45976,1,1,1,1,2,3,6,3,6,...,1782,2,2,5,14,5,98,3,9,0
185790,185790,1,1,2,1,1,3,2,5,3,...,1995,1,4,2,8,25,181,6,5,0
81693,81693,1,1,1,1,2,1,2,5,3,...,856,2,5,3,8,15,3,5,9,0
89677,89677,1,1,2,0,1,3,4,6,6,...,1332,2,1,3,3,20,150,3,5,0
183136,183136,1,1,1,2,1,1,5,5,6,...,599,3,3,5,4,5,34,5,6,0


In [9]:
pp.ProfileReport(train)



In [9]:
def auc(y_true, y_pred):
    def fallback_auc(y_true, y_pred):
        try:
            return metrics.roc_auc_score(y_true, y_pred)
        except:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

In [10]:
def create_model(data, feats):
    inps = []
    outs = []
    for feat in feats:
        num_unique_vals = int(data[feat].nunique())
        embed_dim = int(min(np.ceil(num_unique_vals/2), 50))
        
        inp = layers.Input(shape=(1, ))
        out = layers.Embedding(num_unique_vals + 1, embed_dim, name=feat)(inp)
        
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        
        inps.append(inp)
        outs.append(out)
    
    x = layers.Concatenate()(outs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    y = layers.Dense(2, activation='softmax')(x)
    
    model = Model(inputs=inps, outputs=y)
    return model

In [11]:
test_data = [test.loc[:, features].values[:, k] for k in range(test.loc[:, features].values.shape[1])]
oof_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))


skf = StratifiedKFold(n_splits=50)
for train_idx, test_idx in skf.split(train, train.target.values):
    
    x_train, x_test = train.iloc[train_idx, :], train.iloc[test_idx, :]
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    
    y_train, y_test = x_train.target.values, x_test.target.values
    
    model = create_model(data, features)
    model.compile(loss='binary_crossentropy', optimizers='adam', metrics=[auc])
    
    x_train = [x_train.loc[:, features].values[:, k] for k in range(x_train.loc[:, features].values.shape[1])]
    x_test = [x_test.loc[:, features].values[:, k] for k in range(x_test.loc[:, features].values.shape[1])]
    
    early_stop = callbacks.EarlyStopping(monitor='val_auc', min_delta=0.001, patience=5, verbose=1, mode='max',
                                         baseline=None, restore_best_weights=True)
    
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=3, min_lr=1e-6, 
                                           mode='max', verbose=1)
    
    model.fit(x_train, utils.to_categorical(y_train),
             validation_data=(x_test, utils.to_categorical(y_test)),
             verbose=1, batch_size=1024, callbacks=[early_stop, reduce_lr], epochs=100)
    
    valid_fold_preds = model.predict(x_test)[:, 1]
    test_fold_preds = model.predict(test_data)[:, 1]
    oof_preds[test_idx] = valid_fold_preds.ravel()
    test_preds += test_fold_preds.ravel()
    print(metrics.roc_auc_score(y_test, valid_fold_preds))
    K.clear_session()

Train on 587999 samples, validate on 12001 samples
Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
0.7815381208161356
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00008: early stopping
0.7956545436975224
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
0.7923290562433095
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
0.7867165105384173
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7797881284128958
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7855468832341009
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.78926363

Epoch 6/100
Epoch 00006: early stopping
0.7848010547679412
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00009: early stopping
0.7874888478192371
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00007: early stopping
0.7954765103157615
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.7860586037346494
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 

Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.7860930515058512
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 00007: early stopping
0.7882098784527504
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
0.7881175766764043
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7785014471257737
Train on 587999 samples, va

0.7881146337873413
Train on 587999 samples, validate on 12001 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 00007: early stopping
0.7886696763524673
Train on 588000 samples, validate on 12000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00007: early stopping
0.785922199007426
Train on 588000 samples, validate on 12000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00007: early stopping
0.7924471381560748
Train on 588000 samples, validate on 12000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7864923694569399
Train on 588

Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
0.7801357145720126
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00008: early stopping
0.7805285886416755
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7873955306945307
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopp

Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.7851076988983183
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7894080402139434
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
0.7932847842146249
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.783528373442005
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/

Epoch 00008: early stopping
0.781378590819237
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00006: early stopping
0.7810242919981056
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
0.7836817842380894
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
0.7861212236087094
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 00007: early stopping
0.7831266430430

Epoch 00008: early stopping
0.7931852874641216
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 00007: early stopping
0.7886830994486341
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
0.7803017707454263
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/100
Epoch 00006: early stopping
0.7876741170308217
Train on 588001 samples, validate on 11999 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.000500000

In [12]:
print("Overall AUC={}".format(metrics.roc_auc_score(train.target.values, oof_preds)))

Overall AUC=0.782480002734998


In [13]:
test_preds /= 50
test_ids = test.id.values
print("Saving submission file")
submission = pd.DataFrame.from_dict({
    'id': test_ids,
    'target': test_preds
})
submission.to_csv("data/submission-23.csv", index=False)

Saving submission file
