# Generate data from Single_xgb kernel for stacking
Single_xgb kernel:  https://www.kaggle.com/kueipo/base-on-froza-pascal-single-xgb-lb-0-284/code

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from multiprocessing import *
import gc
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb
#### Load Data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

### 
y = train['target'].values
testid= test['id'].values

train.drop(['id','target'],axis=1,inplace=True)
test.drop(['id'],axis=1,inplace=True)


######################################## Feature engineering ##########################################

### Drop calc
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(unwanted, axis=1)  
test = test.drop(unwanted, axis=1)

### Reconstruct the feature 'ps_reg_03', discovered by Pascal https://www.kaggle.com/pnagel/reconstruction-of-ps-reg-03
def recon(reg):
    integer = int(np.round((40*reg)**2)) 
    for a in range(32):
        if (integer - a) % 31 == 0:
            A = a
    M = (integer - A)//31
    return A, M
train['ps_reg_A'] = train['ps_reg_03'].apply(lambda x: recon(x)[0])
train['ps_reg_M'] = train['ps_reg_03'].apply(lambda x: recon(x)[1])
train['ps_reg_A'].replace(19,-1, inplace=True)
train['ps_reg_M'].replace(51,-1, inplace=True)
test['ps_reg_A'] = test['ps_reg_03'].apply(lambda x: recon(x)[0])
test['ps_reg_M'] = test['ps_reg_03'].apply(lambda x: recon(x)[1])
test['ps_reg_A'].replace(19,-1, inplace=True)
test['ps_reg_M'].replace(51,-1, inplace=True)

In [6]:
train.columns.values

array(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02',
       'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
       'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat',
       'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat',
       'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15',
       'ps_reg_A', 'ps_reg_M'], dtype=object)

In [7]:
### Froza's baseline
### Froza's baseline

d_median = train.median(axis=0)
d_mean = train.mean(axis=0)
d_skew = train.skew(axis=0)
one_hot = {c: list(train[c].unique()) for c in train.columns if c not in ['id','target']}

def transform_df(df):
    df = pd.DataFrame(df)
    dcol = [c for c in df.columns if c not in ['id','target']]
    df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
    df['negative_one_vals'] = np.sum((df[dcol]==-1).values, axis=1)
    for c in dcol:
        if '_bin' not in c: #standard arithmetic
            df[c+str('_median_range')] = (df[c].values > d_median[c]).astype(np.int)
            df[c+str('_mean_range')] = (df[c].values > d_mean[c]).astype(np.int)

    for c in one_hot:
        if len(one_hot[c])>2 and len(one_hot[c]) < 7:
            for val in one_hot[c]:
                df[c+'_oh_' + str(val)] = (df[c].values == val).astype(np.int)
    return df

def multi_transform(df):
    print('Init Shape: ', df.shape)
    p = Pool(cpu_count())
    df = p.map(transform_df, np.array_split(df, cpu_count()))
    df = pd.concat(df, axis=0, ignore_index=True).reset_index(drop=True)
    p.close(); p.join()
    print('After Shape: ', df.shape)
    return df

train = multi_transform(train)
test = multi_transform(test)

Init Shape:  (595212, 39)
After Shape:  (595212, 136)
Init Shape:  (892816, 39)
After Shape:  (892816, 136)


In [83]:
from sklearn import metrics
def ginic(actual, pred):
    actual = np.asarray(actual) 
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n
 
def gini_normalized(a, p):
    if p.ndim == 2:
        p = p[:,1] 
    return ginic(a, p) / ginic(a, a)
    

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score
gini_sklearnf = metrics.make_scorer(gini_normalized, True, True)

In [64]:
print(type(train),type(test))

<class 'numpy.ndarray'> <class 'pandas.core.frame.DataFrame'>


# Use xgboost to stack

In [139]:
class Clf4Stack_xgb_2(object):
    def __init__(self, params, feval, verbose_eval, early_stopping_rounds=70, n_splits=5):
        self.params = params
        self.feval = feval
        self.verbose_eval = verbose_eval
        self.early_stopping_rounds = early_stopping_rounds
        self.n_splits = n_splits

    def fit_predict(self, trainX, trainy, testX):

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=99)

        for i, (train_index,test_index) in enumerate(skf.split(trainX, trainy)):
            print("=====Round {0}/{1}=====".format(i+1,self.n_splits))
            x_train, x_valid = trainX[train_index], trainX[test_index]
            y_train, y_valid = trainy[train_index], trainy[test_index]

            watchlist = [(xgb.DMatrix(x_train,y_train), 'train'), (xgb.DMatrix(x_valid, y_valid), 'valid')]
            model = xgb.train(self.params, xgb.DMatrix(x_train,y_train), 1100, watchlist, feval=self.feval,
                             verbose_eval=self.verbose_eval,maximize=True,
                             early_stopping_rounds=self.early_stopping_rounds)
            
            y_pred = model.predict(xgb.DMatrix(x_valid),ntree_limit=model.best_ntree_limit)
            self.train4stack[test_index] = y_pred
            self.test4stack += model.predict(xgb.DMatrix(testX),ntree_limit=model.best_ntree_limit)
        
        self.test4stack /= self.n_splits
            
    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)

In [135]:
params = {'eta': 0.025, 'max_depth': 4, 
          'subsample': 0.9, 'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
            'min_child_weight':100,
            'alpha':4,
            'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}

In [141]:
C4S_2 = Clf4Stack_xgb_2(params,
                     gini_xgb, 
                     verbose_eval=50,
                    early_stopping_rounds=70,
                    n_splits=5)
C4S_2.fit_predict(train, y, test)



=====Round 1/5=====
[0]	train-gini:0.172903	valid-gini:0.163643
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[50]	train-gini:0.245237	valid-gini:0.235502
[100]	train-gini:0.254422	valid-gini:0.24268
[150]	train-gini:0.268719	valid-gini:0.256267
[200]	train-gini:0.280859	valid-gini:0.267175
[250]	train-gini:0.289594	valid-gini:0.273548
[300]	train-gini:0.296573	valid-gini:0.277897
[350]	train-gini:0.302025	valid-gini:0.280804
[400]	train-gini:0.306832	valid-gini:0.282987
[450]	train-gini:0.311262	valid-gini:0.28421
[500]	train-gini:0.315629	valid-gini:0.28546
[550]	train-gini:0.319885	valid-gini:0.286366
[600]	train-gini:0.323709	valid-gini:0.287508
[650]	train-gini:0.327197	valid-gini:0.288452
[700]	train-gini:0.330292	valid-gini:0.288608
[750]	train-gini:0.333387	valid-gini:0.288954
[800]	train-gini:0.336368	valid-gini:0.289356
[850]	train-gini:0.339448	valid-gini:0.289535
[900]	train-g

In [168]:
C4S_2.output()

In [169]:
df_test4stack = pd.read_csv('test4stack.csv')

In [170]:
df_test4stack.head()

Unnamed: 0,F4stack
0,0.025915
1,0.023498
2,0.025396
3,0.014122
4,0.033667


In [171]:
sub = pd.DataFrame()
sub['id'] = testid
sub['target'] =df_test4stack['F4stack']
sub.head()

Unnamed: 0,id,target
0,0,0.025915
1,1,0.023498
2,2,0.025396
3,3,0.014122
4,4,0.033667


In [172]:
sub.to_csv('xgb4stack.csv',index=False)

In [153]:
df_train4stack = pd.read_csv('train4stack.csv')
df_train4stack = np.array(df_train4stack)

In [78]:
sub = pd.DataFrame()
sub['id'] = testid
params = {'eta': 0.025, 'max_depth': 4, 
          'subsample': 0.9, 'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
            'min_child_weight':100,
            'alpha':4,
            'objective': 'binary:logistic', 'eval_metric': 'auc', 'seed': 99, 'silent': True}
x1, x2, y1, y2 = train_test_split(train, y, test_size=0.25, random_state=99)



watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(params, xgb.DMatrix(x1, y1), 5000,  watchlist, feval=gini_xgb, maximize=True, 
                  verbose_eval=100, early_stopping_rounds=70)


sub['target'] = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)


[0]	train-gini:0.184662	valid-gini:0.177318
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-gini:0.256703	valid-gini:0.247427
[200]	train-gini:0.283564	valid-gini:0.262223
[300]	train-gini:0.299508	valid-gini:0.272256
[400]	train-gini:0.311043	valid-gini:0.27657
[500]	train-gini:0.319699	valid-gini:0.279318
[600]	train-gini:0.327429	valid-gini:0.280662
[700]	train-gini:0.334044	valid-gini:0.281427
[800]	train-gini:0.339485	valid-gini:0.281925
[900]	train-gini:0.344713	valid-gini:0.282627
[1000]	train-gini:0.350165	valid-gini:0.282678
Stopping. Best iteration:
[1007]	train-gini:0.350472	valid-gini:0.282758



# Use xgbclassifier to stack

In [166]:
class Clf4Stack_xgb(object):
    def __init__(self, model, metric, early_stopping_rounds=10, test_size=0.25, verbose=False, n_splits=5):
        self.n_splits = n_splits
        self.model = model
        self.metric = metric
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.verbose = verbose

    def fit_predict(self, trainX, trainy, testX):

        self.train4stack = np.zeros(len(trainX))
        self.test4stack = np.zeros(len(testX))

        skf = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=44)

        for i, (train_index,test_index) in enumerate(skf.split(trainX, trainy)):
            print("=====Round {0}/{1}=====".format(i+1,self.n_splits))
            X_train, X_test = trainX[train_index], trainX[test_index]
            y_train, y_test = trainy[train_index], trainy[test_index]

            x1, x2, y1, y2 = train_test_split(X_train, y_train, test_size=self.test_size, random_state=99)
            
            self.model.fit(x1, y1, 
                           eval_set=[(x1,y1),(x2,y2)], 
                           eval_metric=self.metric,
                           early_stopping_rounds=self.early_stopping_rounds,
                           verbose=self.verbose)
            
            y_pred = self.model.predict_proba(X_test,ntree_limit=self.model.best_ntree_limit)[:,1]
            self.train4stack[test_index] = y_pred
            self.test4stack += self.model.predict_proba(testX,ntree_limit=self.model.best_ntree_limit)[:,1]
        
        self.test4stack /= self.n_splits
            
    def output(self,train_file_name='train4stack.csv',
                    test_file_name='test4stack.csv',
                    col_name='F4stack'):

        pd.DataFrame({col_name:self.train4stack}).to_csv(train_file_name,index=False) 
        pd.DataFrame({col_name:self.test4stack}).to_csv(test_file_name,index=False)

In [167]:

from xgboost import XGBClassifier

params = {'learning_rate': 0.025, 
          'max_depth': 4, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':100,
          'objective': 'binary:logistic', 
          'seed': 99, 
          'silent': True}

xgb = XGBClassifier()
xgb.set_params(**params)

XGBClassifier(base_score=0.5, colsample_bylevel=0.7, colsample_bytree=0.7,
       gamma=0, learning_rate=0.025, max_delta_step=0, max_depth=4,
       min_child_weight=100, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=99, silent=True, subsample=0.9)

In [175]:
C4S = Clf4Stack_xgb(xgb, 
                    gini_xgb, 
                    early_stopping_rounds=10, 
                    test_size=0.25, 
                    verbose=50, 
                    n_splits=5)

In [180]:
C4S.fit_predict(train, y, test)

=====Round 1/5=====
[0]	validation_0-gini:0.195231	validation_1-gini:0.174655
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 10 rounds.
Stopping. Best iteration:
[0]	validation_0-gini:0.195231	validation_1-gini:0.174655

=====Round 2/5=====
[0]	validation_0-gini:0.190588	validation_1-gini:0.199714
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 10 rounds.
Stopping. Best iteration:
[0]	validation_0-gini:0.190588	validation_1-gini:0.199714

=====Round 3/5=====
[0]	validation_0-gini:0.18936	validation_1-gini:0.165823
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 10 rounds.
Stopping. Best iteration:
[0]	validation_0-gini:0.18936	validation_1-gini:0.165823

=====Round 4/5=====
[0]	validation_0