<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

import keras.layers as KL
import keras.models as KM
import keras.callbacks as cb

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
data = pd.read_csv('../dataset/train.csv', index_col=0)
test = pd.read_csv('../dataset/test.csv', index_col=0)
print('data.shape: ',data.shape)
print('test.shape: ',test.shape)

data.shape:  (200000, 201)
test.shape:  (200000, 200)


In [3]:
x_data = data.drop('target', axis=1)
y_data = data.target

In [4]:
class DAE:
    def __init__(self, col_size):
        self.col_size = col_size
        self.inp = KL.Input((self.col_size,))
        en = self.encoder()
        de = self.decoder(en)
        
        self.model = KM.Model(self.inp, de)
        
    
    def encoder(self):
        x = KL.Dense(250)(self.inp)
        x = KL.Dropout(0.4)(x)
        x = KL.BatchNormalization()(x)
        x = KL.Activation('relu')(x)
        x = KL.Dense(250)(x)
        x = KL.BatchNormalization()(x)
        x = KL.Activation('tanh')(x)
        x = KL.Dense(250)(x)
        x = KL.BatchNormalization()(x)
        en = KL.Activation('linear')(x)

        return en

    def decoder(self, en_output):
        x = KL.Dense(250)(en_output)
        x = KL.BatchNormalization()(x)
        x = KL.Activation('relu')(x)
        x = KL.Dense(250)(x)
        x = KL.BatchNormalization()(x)
        x = KL.Activation('tanh')(x)
        x = KL.Dense(250)(x)
        x = KL.BatchNormalization()(x)
        x = KL.Activation('relu')(x)
        x = KL.Dense(self.col_size)(x)
        x = KL.BatchNormalization()(x)
        de = KL.Activation('linear')(x)

        return de

    def feature_output(self):
        en = self.encoder()
        encoder = KM.Model(self.inp, en)
        encoder.set_weights(self.model.get_weights())
        return encoder

In [17]:
# scaler = MinMaxScaler()
# scaler.fit(x_data)
# x_data_s = scaler.transform(x_data)
# test_s = scaler.transform(test)
# x_data_s = pd.DataFrame(x_data_s, columns=x_data.columns)
# test_s = pd.DataFrame(test_s, columns=test.columns)
# data_s = pd.concat([x_data_s, test_s], axis=0)
# data_s.describe()

In [18]:
data = pd.concat([x_data, test], axis=0)

In [28]:
dae = DAE(data_s.shape[1])
dae.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
batch_normalization_25 (Batc (None, 100)               400       
_________________________________________________________________
activation_25 (Activation)   (None, 100)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 50)                5050      
_________________________________________________________________
batch_normalization_26 (Batc (None, 50)                200       
__________

In [31]:
dae.model.compile(optimizer='adam',loss='mse')
dae.model.fit(data, data,
             batch_size=1024,
             epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x14685e6b320>

In [23]:
prediction = dae.model.predict(data)
print('prediction.shape: ',prediction.shape)
print('mse: ',np.mean(np.mean(np.square(prediction-data))))

prediction.shape:  (400000, 200)
mse:  132.71506227028726


In [24]:
new_train = dae.model.predict(x_data)
new_test = dae.model.predict(test)
print('train mse: ',np.mean(np.mean(np.square(new_train-x_data))))
print('test mse: ',np.mean(np.mean(np.square(new_test-test))))

train mse:  132.75108942047132
test mse:  132.67903512010326


In [32]:
encoder = dae.feature_output()
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_32 (Dense)             (None, 100)               20100     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
batch_normalization_32 (Batc (None, 100)               400       
_________________________________________________________________
activation_32 (Activation)   (None, 100)               0         
_________________________________________________________________
dense_33 (Dense)             (None, 50)                5050      
_________________________________________________________________
batch_normalization_33 (Batc (None, 50)                200       
__________

In [33]:
new_train = encoder.predict(x_data)
new_test = encoder.predict(test)

In [34]:
colnames = ['val_{}'.format(i) for i in range(new_train.shape[1])]
new_train = pd.DataFrame(new_train, columns=colnames)
new_test = pd.DataFrame(new_test, columns=colnames)

In [35]:
print('new_train.shape: ',new_train.shape)
print('new_test.shape: ',new_test.shape)

new_train.shape:  (200000, 25)
new_test.shape:  (200000, 25)


In [36]:
config = {
    "seed":2019,
    "k_folds":5,
    "early_stopping_rounds":100
}

params = {
    "learning_rate": 0.1,
    "n_estimators": 10000,
    "max_depth": 3,
    "min_child_weight": 5,
    "subsample": 1.0,
    "colsample_bytree": 0.5,
    "colsample_bylevel": 0.5,
    "alpha": 0,
    "lambda": 10,
    "objective": "gpu:binary:logistic",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric":"auc"
}

In [37]:
folds = StratifiedKFold(n_splits=config['k_folds'], random_state=config['seed'], shuffle=True)

In [38]:
auc_list = list()
probs = np.zeros(len(test))

for i, (train_idx, valid_idx) in enumerate(folds.split(X=new_train, y=y_data)):
    print('='*25)
    print('{} fold'.format(i))
    x_train, y_train = new_train.iloc[train_idx, :], y_data.iloc[train_idx]
    x_valid, y_valid = new_train.iloc[valid_idx, :], y_data.iloc[valid_idx]

    watchlist = [(x_train,y_train),(x_valid, y_valid)]
    model = xgb.XGBClassifier(**params)
    model.fit(x_train, y_train,
            eval_set=watchlist,
            early_stopping_rounds=config['early_stopping_rounds'],
            verbose=100)

    val_prob = model.predict_proba(x_valid, ntree_limit=model.best_iteration)[:,1]
    prob = model.predict_proba(new_test, ntree_limit=model.best_iteration)[:,1]
    
    val_auc = metrics.roc_auc_score(y_valid, val_prob)
    auc_list.append(val_auc)
    print('val AUC: ',val_auc)

    probs = prob/folds.get_n_splits()

print('='*100)
print('AUC_LIST')
print(auc_list)

print('-'*100)
print('Mean AUC: {}'.format(np.mean(auc_list)))

0 fold
[0]	validation_0-auc:0.556974	validation_1-auc:0.557523
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 100 rounds.
[100]	validation_0-auc:0.655844	validation_1-auc:0.634132
[200]	validation_0-auc:0.674539	validation_1-auc:0.641617
[300]	validation_0-auc:0.685321	validation_1-auc:0.643275
[400]	validation_0-auc:0.693993	validation_1-auc:0.644145
[500]	validation_0-auc:0.702629	validation_1-auc:0.645078
[600]	validation_0-auc:0.710386	validation_1-auc:0.645364
[700]	validation_0-auc:0.718005	validation_1-auc:0.64624
[800]	validation_0-auc:0.725268	validation_1-auc:0.645944
Stopping. Best iteration:
[726]	validation_0-auc:0.719885	validation_1-auc:0.646483

val AUC:  0.6464259329239685
1 fold
[0]	validation_0-auc:0.558929	validation_1-auc:0.549559
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't impro