<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

import keras.layers as KL
import keras.models as KM
import keras.callbacks as cb

from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the train and test CSVs, using the first CSV column as the row index,
# then report the dimensions of each frame.
data = pd.read_csv('../dataset/train.csv', index_col=0)
test = pd.read_csv('../dataset/test.csv', index_col=0)
for label, frame in (('data.shape: ', data), ('test.shape: ', test)):
    print(label, frame.shape)

data.shape:  (200000, 201)
test.shape:  (200000, 200)


In [3]:
# Separate the label column from the feature columns of the training frame.
y_data = data['target']
x_data = data.drop(columns=['target'])

In [4]:
class DAE:
    """Fully-connected autoencoder built with the Keras functional API.

    The network maps ``col_size`` inputs through three 250-unit encoder layers
    and a mirrored decoder back to ``col_size`` outputs. The only stochastic
    layer is a ``Dropout(0.4)`` after the first dense layer (presumably the
    "denoising" part of the name -- confirm with the author).

    Attributes:
        col_size: Number of input (and reconstructed output) features.
        inp: The shared Keras ``Input`` tensor.
        model: ``keras.models.Model`` mapping ``inp`` to the reconstruction.
    """

    def __init__(self, col_size):
        """Build the encoder/decoder graph for ``col_size`` input features."""
        self.col_size = col_size
        self.inp = KL.Input((self.col_size,))
        en = self.encoder()
        de = self.decoder(en)

        self.model = KM.Model(self.inp, de)

    @staticmethod
    def _dense_block(x, units, activation, dropout=None):
        """Apply Dense -> (optional Dropout) -> BatchNormalization -> Activation."""
        x = KL.Dense(units)(x)
        if dropout is not None:
            x = KL.Dropout(dropout)(x)
        x = KL.BatchNormalization()(x)
        return KL.Activation(activation)(x)

    def encoder(self):
        """Create a fresh stack of encoder layers on ``self.inp``.

        Every call instantiates new (untrained) layers.

        Returns:
            The 250-wide encoded tensor (linear activation).
        """
        x = self._dense_block(self.inp, 250, 'relu', dropout=0.4)
        x = self._dense_block(x, 250, 'tanh')
        en = self._dense_block(x, 250, 'linear')

        return en

    def decoder(self, en_output):
        """Create the decoder layers on top of ``en_output``.

        Args:
            en_output: Tensor produced by :meth:`encoder`.

        Returns:
            The reconstruction tensor with ``self.col_size`` units.
        """
        x = self._dense_block(en_output, 250, 'relu')
        x = self._dense_block(x, 250, 'tanh')
        x = self._dense_block(x, 250, 'relu')
        de = self._dense_block(x, self.col_size, 'linear')

        return de

    def feature_output(self, model):
        """Return an encoder-only model initialised from a trained autoencoder.

        A new encoder graph is built and receives the *leading* weights of
        ``model``. This relies on ``model`` having the same architecture as
        ``self.model``, so that its weight list starts with the encoder's
        weights in the same order.

        Args:
            model: A trained autoencoder exposing ``get_weights()``.

        Returns:
            A ``keras.models.Model`` mapping ``self.inp`` to the encoding.

        Raises:
            ValueError: If ``model`` has fewer weight arrays than the encoder.
        """
        en = self.encoder()
        encoder = KM.Model(self.inp, en)

        # The full autoencoder also carries the decoder's weights. Passing the
        # whole list to ``set_weights`` only works on Keras versions that
        # silently ignore the surplus; stricter versions raise on the length
        # mismatch. Slice explicitly so the intent is unambiguous.
        n_encoder_weights = len(encoder.get_weights())
        trained_weights = model.get_weights()
        if len(trained_weights) < n_encoder_weights:
            raise ValueError(
                'model provides {} weight arrays but the encoder needs {}'.format(
                    len(trained_weights), n_encoder_weights))
        encoder.set_weights(trained_weights[:n_encoder_weights])
        return encoder

In [5]:
# Disabled experiment: MinMax-scale the train/test features before feeding the
# autoencoder. With this cell commented out, the following cells use the
# unscaled `x_data` and `test` frames directly.
# scaler = MinMaxScaler()
# scaler.fit(x_data)
# x_data_s = scaler.transform(x_data)
# test_s = scaler.transform(test)
# x_data_s = pd.DataFrame(x_data_s, columns=x_data.columns)
# test_s = pd.DataFrame(test_s, columns=test.columns)
# data_s = pd.concat([x_data_s, test_s], axis=0)
# data_s.describe()

In [6]:
# Stack the training features on top of the test features (row-wise).
# NOTE(review): this rebinds `data`, which In [2] loaded as the raw training
# frame including 'target'. After this cell runs, re-executing In [3]
# (`data.drop('target', ...)`) fails until In [2] is re-run.
data = pd.concat([x_data, test], axis=0)

In [7]:
# Build the autoencoder with one input unit per feature column and list its layers.
dae = DAE(col_size=len(data.columns))
dae.model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 250)               1000      
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)               62750     
_________________________________________________________________
batch_normalization_2 (Batch (None, 250)               1000      
__________

In [8]:
# Disabled: trains the autoencoder to reconstruct its own input (MSE loss).
# The notebook instead loads a previously saved model from '../dae_model.h5'
# in In [10]; uncomment this cell (and the save cell) to retrain from scratch.
# dae.model.compile(optimizer='adam',loss='mse')
# dae.model.fit(data, data,
#              batch_size=1024,
#              epochs=200)

In [9]:
# dae.model.save('../dae_model.h5')

In [10]:
# Restore the previously trained autoencoder from disk.
dae_model = KM.load_model(filepath='../dae_model.h5')

In [11]:
# Disabled: reconstruction error of the loaded autoencoder on train+test.
# The output displayed under this cell comes from an earlier execution.
# prediction = dae_model.predict(data)
# print('prediction.shape: ',prediction.shape)
# print('mse: ',np.mean(np.mean(np.square(prediction-data))))

prediction.shape:  (400000, 200)
mse:  13.547362402899564


In [12]:
# Disabled: reconstruction error computed separately for train and test.
# The output displayed under this cell comes from an earlier execution.
# Note that `new_train` / `new_test` here would hold reconstructions, whereas
# In [14] assigns the encoder features to the same names.
# new_train = dae_model.predict(x_data)
# new_test = dae_model.predict(test)
# print('train mse: ',np.mean(np.mean(np.square(new_train-x_data))))
# print('test mse: ',np.mean(np.mean(np.square(new_test-test))))

train mse:  13.543876897062244
test mse:  13.55084790873696


In [13]:
# Derive an encoder-only model carrying the trained autoencoder's weights,
# then list its layers.
encoder = dae.feature_output(model=dae_model)
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 250)               50250     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
batch_normalization_8 (Batch (None, 250)               1000      
_________________________________________________________________
activation_8 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 250)               62750     
_________________________________________________________________
batch_normalization_9 (Batch (None, 250)               1000      
__________

In [14]:
# Encode the train and test features with the extracted encoder.
new_train, new_test = (encoder.predict(frame) for frame in (x_data, test))

In [15]:
# Wrap the encoded arrays in DataFrames with generated column names val_0, val_1, ...
colnames = list(map('val_{}'.format, range(new_train.shape[1])))
new_train = pd.DataFrame(data=new_train, columns=colnames)
new_test = pd.DataFrame(data=new_test, columns=colnames)

In [16]:
# Report the dimensions of the encoded train/test frames.
for label, frame in (('new_train.shape: ', new_train), ('new_test.shape: ', new_test)):
    print(label, frame.shape)

new_train.shape:  (200000, 250)
new_test.shape:  (200000, 250)


In [17]:
# Cross-validation settings shared by the splitter and the training loop.
config = {
    "seed": 2019,                  # random_state for the stratified splitter
    "k_folds": 5,                  # number of CV folds
    "early_stopping_rounds": 100   # patience passed to XGBClassifier.fit
}

# XGBoost hyper-parameters (forwarded to xgb.XGBClassifier).
params = {
    "learning_rate": 0.1,
    "n_estimators": 10000,  # upper bound; early stopping decides the real count
    "max_depth": 3,
    "min_child_weight": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 0.8,
    "alpha": 0,
    "lambda": 10,
    # The "gpu:"-prefixed objectives were deprecated and later removed from
    # XGBoost; the plain objective is used together with tree_method=gpu_hist.
    "objective": "binary:logistic",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric": "auc"
}

In [18]:
# Shuffled, seeded stratified splitter used by the cross-validation loop.
folds = StratifiedKFold(
    n_splits=config['k_folds'],
    shuffle=True,
    random_state=config['seed'],
)

In [None]:
# Stratified K-fold training of XGBoost on the encoded features.
# Per fold: fit with early stopping, score the validation split (AUC),
# add this fold's share to the averaged test prediction, and record
# gain-based feature importances.
auc_list = []
n_splits = folds.get_n_splits()
probs = np.zeros(len(test))  # running average of test probabilities over folds

fold_scores = []  # one importance frame per fold; concatenated once after the loop
for i, (train_idx, valid_idx) in enumerate(folds.split(X=new_train, y=y_data)):
    print('='*25)
    print('{} fold'.format(i))
    # Positional indexing: new_train has a fresh RangeIndex while y_data keeps
    # the CSV index, so .iloc is required to keep rows aligned.
    x_train, y_train = new_train.iloc[train_idx, :], y_data.iloc[train_idx]
    x_valid, y_valid = new_train.iloc[valid_idx, :], y_data.iloc[valid_idx]

    watchlist = [(x_train,y_train),(x_valid, y_valid)]
    model = xgb.XGBClassifier(**params)
    model.fit(x_train, y_train,
            eval_set=watchlist,
            early_stopping_rounds=config['early_stopping_rounds'],
            verbose=100)

    # best_iteration is a 0-based round index, whereas ntree_limit is a tree
    # *count* (and 0 means "use all trees"). best_ntree_limit is the count.
    best_ntree = model.best_ntree_limit
    val_prob = model.predict_proba(x_valid, ntree_limit=best_ntree)[:,1]
    prob = model.predict_proba(new_test, ntree_limit=best_ntree)[:,1]

    val_auc = metrics.roc_auc_score(y_valid, val_prob)
    auc_list.append(val_auc)
    print('val AUC: ',val_auc)

    # Accumulate (not overwrite) so every fold contributes to the average.
    probs += prob / n_splits

    feature_score = model.get_booster().get_score(importance_type='gain')
    fold_scores.append(pd.DataFrame({
        'feature': list(feature_score.keys()),
        'importance': list(feature_score.values()),
    }))

# Single concat instead of growing a frame inside the loop.
fold_score_df = pd.concat(fold_scores)

print('='*100)
print('AUC_LIST')
print(auc_list)

print('-'*100)
print('Mean AUC: {}'.format(np.mean(auc_list)))

0 fold
