Based on [olivier's script](https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283)

In [1]:
# Patience, in boosting rounds, handed to `model.fit(..., early_stopping_rounds=...)`
# by the cross-validation code further down.
EARLY_STOPPING_ROUNDS = 50  
# Note: I set EARLY_STOPPING_ROUNDS high so that (when OPTIMIZE_ROUNDS is set)
#       I will get lots of information to make my own judgment.  You should probably
#       reduce EARLY_STOPPING_ROUNDS if you want to do actual early stopping.

I recommend initially setting <code>MAX_ROUNDS</code> fairly high and using <code>OPTIMIZE_ROUNDS</code> to get an idea of the appropriate number of rounds (which, in my judgment, should be close to the maximum value of <code>best_ntree_limit</code> among all folds, maybe even a bit higher if your model is adequately regularized... or alternatively, you could set <code>verbose=True</code> and look at the details to try to find a number of rounds that works well for all folds). Then I would turn off <code>OPTIMIZE_ROUNDS</code> and set <code>MAX_ROUNDS</code> to the appropriate number of total rounds.

The problem with "early stopping" by choosing the best round for each fold is that it overfits to the validation data.    It's therefore liable not to produce the optimal model for predicting test data, and if it's used to produce validation data for stacking/ensembling with other models, it would cause this one to have too much weight in the ensemble.  Another possibility (and the default for XGBoost, it seems) is to use the round where the early stop actually happens (with the lag that verifies lack of improvement) rather than the best round.  That solves the overfitting problem (provided the lag is long enough), but so far it doesn't seem to have helped.  (I got a worse validation score with 20-round early stopping per fold than with a constant number of rounds for all folds, so the early stopping actually seemed to underfit.)


In [58]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from numba import jit
import time
import gc

In [3]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Score how well ``y_prob`` ranks the labels in ``y_true``.

    The labels are reordered by ascending ``y_prob`` and walked from the
    highest score down.  For every label the loop adds ``label * delta``,
    where ``delta`` is the accumulated ``1 - label`` of the samples already
    visited, and the total is rescaled to ``1 - 2 * total / (ntrue * (n - ntrue))``.

    NOTE(review): the arithmetic assumes binary 0/1 labels -- confirm with
    callers.  With such labels the denominator is zero when every label is
    the same, so the result is undefined in that case.

    y_true : array-like of labels
    y_prob : array-like of scores, same length as ``y_true``
    """
    y_true = np.asarray(y_true)
    # Reorder the labels by ascending score.
    y_true = y_true[np.argsort(y_prob)]
    # ntrue: running sum of the labels; gini: accumulated label * delta;
    # delta: running sum of (1 - label) over the samples already visited.
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    # Visit samples from the highest score to the lowest.
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [104]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini coefficient of ``pred`` against ``actual``.

    Observations are ordered by descending prediction, ties being broken by
    original position.  The score is the mean cumulative share of ``actual``
    captured along that ordering, minus ``(n + 1) / 2``, divided by ``n``.

    actual : sequence of observed values
    pred : sequence of predictions, same length as ``actual``
    cmpcol, sortcol : unused; kept so existing call sites keep working

    The result is nan/inf when ``actual`` sums to zero (division by zero).
    """
    n_obs = len(actual)
    # The builtin `float` replaces the `np.float` alias, which NumPy deprecated
    # in 1.20 and removed in 1.24 (it now raises AttributeError).
    table = np.asarray(np.c_[actual, pred, np.arange(n_obs)], dtype=float)
    # lexsort treats its LAST key as primary: descending prediction first,
    # then ascending original position.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total
    gini_sum -= (n_obs + 1) / 2.
    return gini_sum / n_obs
 
def gini_normalized(a, p):
    """Gini of predictions ``p`` against ``a``, scaled by the Gini of ``a`` against itself."""
    achieved = gini(a, p)
    # Scoring the actual values against themselves gives the reference value.
    reference = gini(a, a)
    return achieved / reference

In [5]:
def getData(train_path='./data/train.csv', test_path='./data/test.csv'):
    """Load the train and test CSV files, keeping only the selected features.

    Both files are read with ``id`` as the index and the literal ``-1``
    treated as a missing value.

    train_path : path of the training CSV (must contain a ``target`` column)
    test_path : path of the test CSV

    Returns ``[train_df, test_df]``: ``train_df`` holds the selected features
    followed by ``target``; ``test_df`` holds the selected features only.
    """
    # from olivier
    # The trailing comments are the scores recorded in olivier's kernel
    # (presumably feature importance vs. shadow-feature importance).
    train_features = [
        "ps_car_13",  #            : 1571.65 / shadow  609.23
        "ps_reg_03",  #            : 1408.42 / shadow  511.15
        "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
        "ps_ind_03",  #            : 1219.47 / shadow  230.55
        "ps_ind_15",  #            :  922.18 / shadow  242.00
        "ps_reg_02",  #            :  920.65 / shadow  267.50
        "ps_car_14",  #            :  798.48 / shadow  549.58
        "ps_car_12",  #            :  731.93 / shadow  293.62
        "ps_car_01_cat",  #        :  698.07 / shadow  178.72
        "ps_car_07_cat",  #        :  694.53 / shadow   36.35
        "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
        "ps_car_03_cat",  #        :  611.73 / shadow   50.67
        "ps_reg_01",  #            :  598.60 / shadow  178.57
        "ps_car_15",  #            :  593.35 / shadow  226.43
        "ps_ind_01",  #            :  547.32 / shadow  154.58
        "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
        "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
        "ps_car_06_cat",  #        :  398.02 / shadow  212.43
        "ps_car_04_cat",  #        :  376.87 / shadow   76.98
        "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
        "ps_car_09_cat",  #        :  214.12 / shadow   81.38
        "ps_car_02_cat",  #        :  203.03 / shadow   26.67
        "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
        "ps_car_11",  #            :  173.28 / shadow   76.45
        "ps_car_05_cat",  #        :  172.75 / shadow   62.92
        "ps_calc_09",  #           :  169.13 / shadow  129.72
        "ps_calc_05",  #           :  148.83 / shadow  120.68
        "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
        "ps_car_08_cat",  #        :  120.87 / shadow   28.82
        "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
        "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
        "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
        "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
        "ps_ind_14",  #            :   37.37 / shadow   16.65
    ]

    # Read data
    train_df = pd.read_csv(train_path, na_values="-1", index_col='id')
    test_df = pd.read_csv(test_path, na_values="-1", index_col='id')

    # Keep the selected features (plus the label for the training frame).
    train_df = train_df.loc[:, train_features + ["target"]]
    test_df = test_df.loc[:, train_features]
    return [train_df, test_df]

In [6]:
# Load the feature-selected training and test frames.
train_df, test_df = getData()

In [7]:
# Peek at the first rows of the training frame (selected features plus `target`).
train_df.head()

Unnamed: 0_level_0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,ps_calc_09,ps_calc_05,ps_ind_08_bin,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.883679,0.71807,0.0,5,11,0.2,0.37081,0.4,10.0,1.0,...,1,1,0,0,0,1.0,0,0,0,0
9,0.618817,0.766078,0.0,7,3,0.4,0.388716,0.316228,11.0,1.0,...,1,1,1,1,0,0.0,1,0,0,0
13,0.641586,,0.0,9,12,0.0,0.347275,0.316228,7.0,1.0,...,2,2,1,1,0,1.0,0,0,0,0
16,0.542949,0.580948,0.0,2,8,0.2,0.294958,0.374166,7.0,1.0,...,4,4,0,1,0,0.0,0,0,0,0
17,0.565832,0.840759,0.0,0,9,0.6,0.365103,0.31607,11.0,1.0,...,2,2,0,1,0,1.0,0,0,0,0


In [8]:
# Peek at the first rows of the test frame (selected features only, no `target`).
test_df.head()

Unnamed: 0_level_0,ps_car_13,ps_reg_03,ps_ind_05_cat,ps_ind_03,ps_ind_15,ps_reg_02,ps_car_14,ps_car_12,ps_car_01_cat,ps_car_07_cat,...,ps_car_05_cat,ps_calc_09,ps_calc_05,ps_ind_08_bin,ps_car_08_cat,ps_ind_09_bin,ps_ind_04_cat,ps_ind_18_bin,ps_ind_12_bin,ps_ind_14
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.669556,0.610328,0.0,8,12,0.3,0.352136,0.316228,7.0,1.0,...,,2,1,0,1,0,1.0,0,0,0
1,0.60632,0.771362,0.0,5,5,0.5,0.358329,0.316228,4.0,1.0,...,0.0,2,3,0,1,1,1.0,0,0,0
2,0.896239,0.916174,0.0,3,10,0.0,0.398497,0.4,11.0,1.0,...,,3,3,0,1,1,0.0,0,0,0
3,0.65211,,0.0,6,4,0.2,0.381445,0.374166,7.0,1.0,...,,1,1,0,1,0,0.0,0,0,0
4,0.812914,0.817771,0.0,7,4,0.4,0.385097,0.374166,11.0,1.0,...,,4,4,0,1,1,0.0,0,0,0


In [9]:
class featureCombination(object):
    """
    Provide the usual "fit-transform" interface for pairwise feature combinations.

    For every ``(f1, f2)`` pair a column named ``f1 + "_plus_" + f2`` is
    appended.  It holds the string ``str(f1 value) + "_" + str(f2 value)``,
    label-encoded to integers.  Encoders are learned in ``fit_transform`` and
    re-used unchanged in ``transform``, so a given combination receives the
    same code in every frame.
    """
    def __init__(self, combinationList):
        """
        combinationList : iterable of ``(f1, f2)`` column-name pairs
        """
        # The attribute name is misspelled; it is kept as-is because outside
        # code may already read it.
        self.comninationList = combinationList
        # Maps each combined column name to its fitted LabelEncoder.
        self.encoders = dict()

    @staticmethod
    def _combine(X, f1, f2):
        """Append the raw string combination of ``f1`` and ``f2`` to ``X``; return its column name."""
        fName = f1 + "_plus_" + f2
        X.insert(X.shape[1], fName, X[f1].astype('str') + "_" + X[f2].astype('str'))
        return fName

    def fit_transform(self, df):
        """
        Learn one encoder per combination from ``df`` and return an encoded copy.

        ``df`` itself is left untouched.
        """
        X = df.copy()

        for f1, f2 in self.comninationList:
            fName = self._combine(X, f1, f2)

            lbl = LabelEncoder()
            X[fName] = lbl.fit_transform(X[fName])
            self.encoders[fName] = lbl

        return X

    def transform(self, df):
        """
        Return a copy of ``df`` encoded with the encoders learned in ``fit_transform``.

        Combinations that were not seen during fitting are encoded as ``-1``.

        Raises ValueError when a combination has no fitted encoder.
        """
        X = df.copy()

        for f1, f2 in self.comninationList:
            fName = self._combine(X, f1, f2)

            lbl = self.encoders.get(fName)
            if lbl is None:
                raise ValueError("No fitted encoder for '%s'; call fit_transform first" % fName)

            # Re-use the fitted classes instead of re-fitting (re-fitting would
            # assign codes that do not match the training frame).  An explicit
            # lookup also lets unseen combinations map to -1 instead of raising.
            codes = dict((label, code) for code, label in enumerate(lbl.classes_))
            X[fName] = X[fName].map(codes).fillna(-1).astype(int)

        return X

In [10]:
# Process data
#id_test = test_df['id'].values
#id_train = train_df['id'].values
#y = train_df['target']


# Pairs of raw columns that are merged into extra label-encoded features.
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),
    ('ps_reg_01', 'ps_car_04_cat'),
]

fComb = featureCombination(combinationList=combs)

# Fit the encoders on the training frame, then apply them to the test frame.
# Both frames are rebound to the augmented copies.
train_df = fComb.fit_transform(df=train_df)
test_df = fComb.transform(df=test_df)

In [14]:
# Fix NumPy's global random state.
np.random.seed(0)

# Shuffled K-fold splitter with its own fixed seed.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=1)

In [15]:
# Flag mentioned in the notes at the top of the notebook.
# NOTE(review): no code visible in this notebook reads it -- confirm whether it is still needed.
OPTIMIZE_ROUNDS = True

In [16]:
# Columns that will be target-encoded: every test column whose name ends in '_cat'.
# If the feature-combination cell has already run, this also matches the
# '<f1>_plus_<f2>' columns it added, because both pairs end in a '_cat' feature.
f_cats = [x for x in test_df.columns if x.endswith('_cat')]

In [28]:
# Baseline XGBoost hyper-parameters, unpacked into XGBClassifier by getXGBModel.
xgbParams = dict(
    n_estimators=500,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=0.07,
    subsample=0.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)

In [29]:
# Classifier factory
def getXGBModel(params, seed = 43):
    """Build an unfitted XGBClassifier.

    params : dict of keyword arguments forwarded to ``XGBClassifier``
    seed : value passed as the classifier's ``seed`` argument
    """
    classifier = XGBClassifier(seed = seed, **params)
    return classifier

In [31]:
# Display the repr of a classifier built from the baseline parameters with seed 0.
getXGBModel(xgbParams, 0)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=10, learning_rate=0.07, max_delta_step=0, max_depth=4,
       min_child_weight=6, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=8, reg_lambda=1.3,
       scale_pos_weight=1.6, seed=0, silent=True, subsample=0.8)

In [18]:
def submit(pred, ind, fName = 'submission.csv'):
    """
    Save predictions as a two-column CSV submission (``id``, ``target``).

    pred  : predictions (array)
    ind   : ids used as the index of the written table (array)
    fName : output file name (string)

    Values are written with six decimal places.
    """
    submission = pd.DataFrame(data=pred, index=ind, columns=['target'])
    submission.to_csv(fName, float_format='%.6f', index_label='id')

In [23]:
xgbParams = {'n_estimators': 500,
             'max_depth': 4,
             'objective': "binary:logistic",
             'learning_rate': 0.07,
             'subsample': 0.8,
             'min_child_weight': 6,
             'colsample_bytree': .8,
             'scale_pos_weight': 1.6,
             'gamma':10,
             'reg_alpha':8,
             'reg_lambda':1.3}

# Run CV
# NOTE: this cell needs target_encode, getXGBModel, kf, K and f_cats to have
# been defined already; target_encode lives in a cell further down.
localScore = []

# The accumulators must exist before the loop; previously they were only
# available as leftovers in the kernel.  A float buffer is used so the
# out-of-fold probabilities are stored without an integer cast.
y_valid_pred = pd.Series(0.0, index=train_df.index)
y_test_pred = 0

# Hoisted: the feature matrix is the same for every fold.
all_features = train_df.drop("target", axis= 1)

for i, (train_index, test_index) in enumerate(kf.split(train_df)):

    print("\nFold  %d" % i)

    # Create data for this fold (copies, so adding columns is safe)
    X_train = all_features.iloc[train_index,:].copy()
    X_valid = all_features.iloc[test_index,:].copy()
    y_train = train_df.target.iloc[train_index]
    y_valid = train_df.target.iloc[test_index]
    X_test = test_df.copy()

    # Encode data: statistics come from the training part of the fold only
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )

    # getXGBModel takes (params, seed); the fold index is used as the seed.
    model = getXGBModel(xgbParams, i)

    model.fit( X_train, y_train,
               eval_set= [(X_valid,y_valid)],
               eval_metric= 'auc',
               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
               verbose=False)

    # Generate validation predictions for this fold
    y_valid_pred.iloc[test_index] = model.predict_proba(X_valid)[:,1]
    ls = 2*roc_auc_score(y_valid,y_valid_pred.iloc[test_index])-1
    localScore.append(ls)

    print("  Best N trees =  %s" % model.best_ntree_limit)
    print("  Best ROC-AUC =  %s" % (2*model.best_score-1))
    print('Validation score is %f'%ls)

    # Accumulate test set predictions
    y_test_pred += model.predict_proba(X_test)[:,1]

    del X_test, X_train, X_valid, y_train

y_test_pred /= K  # Average test set predictions
localScore =np.array(localScore)

print("\nGini for full training set: %f"%(2*roc_auc_score(train_df.target, y_valid_pred)-1))

print("\nLocal score is%f, std is %f"%(localScore.mean(), localScore.std()))


Fold  0
  Best N trees =  135
  Best ROC-AUC =  0.28454
Validation score is 0.283673

Fold  1
  Best N trees =  186
  Best ROC-AUC =  0.282074
Validation score is 0.281578

Fold  2
  Best N trees =  229
  Best ROC-AUC =  0.276004
Validation score is 0.275151

Fold  3
  Best N trees =  294
  Best ROC-AUC =  0.299516
Validation score is 0.298347

Fold  4
  Best N trees =  217
  Best ROC-AUC =  0.286548
Validation score is 0.286207

Gini for full training set: 0.284829

Local score is0.284991, std is 0.007617


In [24]:
# Write the fold-averaged test predictions of the CV run above to 'xgb4.csv'.
submit(pred = y_test_pred, ind = test_df.index, fName= 'xgb4.csv')

xgbParams = {'n_estimators': 500,
             'max_depth': 6,
             'objective': "binary:logistic",
             'learning_rate': 0.07,
             'Subsample': 0.8,
             'min_child_weight': 6,
             'colsample_bytree': .8,
             'scale_pos_weight': 1.6,
             'gamma':10,
             'reg_alpha':8,
             'reg_lambda':1.3, 
              seed = seed}
              
LB score 0.282
local score 0.284829   

In [105]:
def _addTargetEncodings(X_train, X_valid, X_test, y_train):
    """Add an '<f>_avg' target-encoded column for every feature in f_cats, in place.

    X_valid may be None when there is no validation part.
    """
    for f in f_cats:
        trn_avg, val_avg, tst_avg = target_encode(
            trn_series=X_train[f],
            val_series=None if X_valid is None else X_valid[f],
            tst_series=X_test[f],
            target=y_train,
            min_samples_leaf=200,
            smoothing=10,
            noise_level=0
            )
        X_train[f + "_avg"] = trn_avg
        if X_valid is not None:
            X_valid[f + "_avg"] = val_avg
        X_test[f + "_avg"] = tst_avg


def trainXGB(params, varSeed = True, folds = 5):
    """Train XGBoost on the global train_df and predict the global test_df.

    params  : dict of XGBClassifier keyword arguments (see getXGBModel)
    varSeed : if True each fold's model is seeded with the fold index,
              otherwise every model uses seed 0
    folds   : number of stratified CV folds; 1 means a single fit on the
              whole training set with no validation or early stopping

    Returns a dict with
        'test_prediction' : predicted probabilities for test_df (averaged
                            over the folds when folds > 1)
        'localScore'      : array of per-fold 2*AUC-1 scores, or 0 when
                            folds == 1
    """
    all_features = train_df.drop("target", axis= 1)
    all_targets = train_df.target

    if folds == 1:
        X_train = all_features.copy()
        X_test = test_df.copy()

        # Encode data
        _addTargetEncodings(X_train, None, X_test, all_targets)

        # A single fit is treated as fold 0, so the seed is 0 whatever varSeed
        # is.  (The old code read the fold index `i` here before it was ever
        # assigned, which raised UnboundLocalError when varSeed was True.)
        model = getXGBModel(params, 0)
        model.fit(X_train, all_targets, eval_metric= 'auc', verbose= True)
        return {'test_prediction': model.predict_proba(X_test)[:,1], 'localScore': 0}

    # Run CV
    localScore = []
    # Float buffer, so out-of-fold probabilities are stored without an integer cast.
    y_valid_pred = pd.Series(0.0, index=train_df.index)
    y_test_pred  = 0

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(skf.split(all_features, all_targets)):

        print("\nFold  %d" % i)

        # Create data for this fold (copies, so adding columns is safe)
        X_train = all_features.iloc[train_index,:].copy()
        X_valid = all_features.iloc[test_index,:].copy()
        y_train = all_targets.iloc[train_index]
        y_valid = all_targets.iloc[test_index]
        X_test = test_df.copy()

        # Encode data: statistics come from the training part of the fold only
        _addTargetEncodings(X_train, X_valid, X_test, y_train)

        model = getXGBModel(params, i if varSeed else 0)

        model.fit( X_train, y_train,
                   eval_set= [(X_valid,y_valid)],
                   eval_metric= 'auc',
                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                   verbose=False)

        # Generate validation predictions for this fold
        y_valid_pred.iloc[test_index] = model.predict_proba(X_valid)[:,1]
        ls = 2*roc_auc_score(y_valid,y_valid_pred.iloc[test_index])-1
        localScore.append(ls)

        print("  Best N trees =  %s" % model.best_ntree_limit)
        print("  Best ROC-AUC =  %s" % (2*model.best_score-1))
        print("  Validation score is %f"%ls)
        print("  Gini score is %f"%gini_normalized(y_valid,y_valid_pred.iloc[test_index]))

        # Accumulate test set predictions
        y_test_pred += model.predict_proba(X_test)[:,1]

        del X_test, X_train, X_valid, y_train

    # Average over the folds actually run (previously divided by the global K,
    # which is only right when folds == K).
    y_test_pred /= folds
    localScore =np.array(localScore)

    print("\nGini for full training set (AUC): %f"%(2*roc_auc_score(all_targets, y_valid_pred)-1))
    print("\nGini for full training set: %f"% gini_normalized(all_targets, y_valid_pred))

    print("\nLocal score is%f, std is %f"%(localScore.mean(), localScore.std()))

    return {'test_prediction':y_test_pred, 'localScore':localScore}

In [106]:
# Functions from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    """Custom evaluation callback: report the negated eval_gini as ``('gini', value)``.

    preds  : model predictions
    dtrain : object exposing ``get_label()`` for the true labels

    The score is negated -- presumably so that "lower is better" holds for
    the consumer; confirm against the training call that uses it.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return [('gini', -score)]


def add_noise(series, noise_level):
    """Scale each element of ``series`` by ``1 + noise_level * z``, with ``z`` drawn from a standard normal.

    One draw per element is taken from NumPy's global random state.
    """
    n_values = series.shape[0]
    jitter = np.random.randn(n_values)
    return series * (1 + noise_level * jitter)


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Replace a categorical feature by a smoothed mean of the target per category.

    Smoothing follows the paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series; the only data
                 used to compute the per-category statistics
    val_series : optional validation feature as a pd.Series (same name)
    tst_series : test categorical feature as a pd.Series (same name)
    target : target data as a pd.Series, aligned with trn_series
    min_samples_leaf (int) : category count at which the category mean and the
                 prior are weighted equally
    smoothing (int) : controls how quickly the weight moves from the prior to
                 the category mean as the count grows
    noise_level : relative noise passed to add_noise for every output

    Returns a tuple (train, validation, test) of encoded series named
    '<feature>_mean', each carrying the index of its input.  Categories absent
    from trn_series (and missing values) receive the prior.  When val_series
    is None the second element is a one-element zero array.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    feature = trn_series.name

    # Per-category mean and count of the target on the training data
    stats = pd.concat([trn_series, target], axis=1) \
              .groupby(by=feature)[target.name].agg(["mean", "count"])

    # Sigmoid weight in (0, 1): larger counts trust the category mean more
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))

    # Blend the global mean (prior) with the category mean
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats.drop(["mean", "count"], axis=1, inplace=True)

    def _lookup(series):
        # Left-join the blended averages onto the series; unmatched rows get the prior
        encoded = pd.merge(
            series.to_frame(series.name),
            stats.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
            on=series.name,
            how='left')['average'].rename(feature + '_mean').fillna(prior)
        # pd.merge does not keep the index so restore it
        encoded.index = series.index
        return encoded

    ft_trn_series = _lookup(trn_series)
    if val_series is None:
        ft_val_series = np.zeros([1])
    else:
        ft_val_series = _lookup(val_series)
    ft_tst_series = _lookup(tst_series)

    # Noise is drawn in the order train, validation, test
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)


In [2]:
class targetEncode(object):
    """
    Work-in-progress wrapper meant to offer target encoding through a
    fit/transform interface.  Only the configuration is stored so far.
    """

    def __init__(self,features, min_samples_leaf=1, smoothing=1, noise_level=0):
        """
        features         : list of feature names to encode
        min_samples_leaf : stored for later use by the encoding
        smoothing        : stored for later use by the encoding
        noise_level      : stored for later use by the encoding
        """
        self.features = features
        # Store the caller's value (it used to be hard-coded to 1, silently
        # discarding the argument).
        self.min_samples_leaf = min_samples_leaf
        self.smoothing = smoothing
        self.noise_level = noise_level
        self.encoders = list()

    def fit_transform(self, df, target):
        """
        Fit to feature list and transform them

        Not implemented yet: currently does nothing and returns None.
        """
        pass

In [3]:
# Smoke-test the constructor; the bare name on the last line displays the instance repr.
c = targetEncode(['a','b','c'])
c

<__main__.targetEncode instance at 0x7f1788ed94d0>

In [43]:
# Same settings as the baseline but with a much higher cap on boosting rounds.
xgbParams2 = dict(
    n_estimators=2000,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=0.07,
    subsample=0.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)

In [44]:
# Cross-validated run with the defaults: varSeed=True (seed = fold index), 5 folds.
model2 = trainXGB(xgbParams2)


Fold  0
  Best N trees =  402
  Best ROC-AUC =  0.28698
Validation score is 0.286129

Fold  1
  Best N trees =  528
  Best ROC-AUC =  0.281884
Validation score is 0.281567

Fold  2
  Best N trees =  388
  Best ROC-AUC =  0.276336
Validation score is 0.276192

Fold  3
  Best N trees =  257
  Best ROC-AUC =  0.29802
Validation score is 0.297513

Fold  4
  Best N trees =  380
  Best ROC-AUC =  0.284062
Validation score is 0.283714

Gini for full training set: 0.284735

Local score is0.285023, std is 0.007055


In [46]:
# Write model2's averaged test predictions to 'xgb5.csv'.
submit(pred = model2['test_prediction'], ind = test_df.index, fName= 'xgb5.csv')

LB score is 0.282

In [55]:
# Same parameters, but every fold's model uses seed 0 (varSeed=False).
model3 = trainXGB(xgbParams2, varSeed= False)


Fold  0
  Best N trees =  402
  Best ROC-AUC =  0.28698
Validation score is 0.286129

Fold  1
  Best N trees =  337
  Best ROC-AUC =  0.283558
Validation score is 0.283459

Fold  2
  Best N trees =  417
  Best ROC-AUC =  0.277246
Validation score is 0.276661

Fold  3
  Best N trees =  480
  Best ROC-AUC =  0.301944
Validation score is 0.301707

Fold  4
  Best N trees =  424
  Best ROC-AUC =  0.28714
Validation score is 0.286832

Gini for full training set: 0.286757

Local score is0.286957, std is 0.008204


In [56]:
# Write model3's averaged test predictions.
# NOTE(review): this re-uses the file name 'xgb5.csv' from the model2 submission,
# so that file is overwritten -- confirm this is intended.
submit(pred = model3['test_prediction'], ind = test_df.index, fName= 'xgb5.csv')

LB score is 0.283

Add stratified K fold

In [64]:
# Fixed-seed run repeated after the switch to stratified folds (see the note above).
model4 = trainXGB(xgbParams2, varSeed= False)


Fold  0
  Best N trees =  415
  Best ROC-AUC =  0.270144
Validation score is 0.270101

Fold  1
  Best N trees =  326
  Best ROC-AUC =  0.294042
Validation score is 0.293594

Fold  2
  Best N trees =  292
  Best ROC-AUC =  0.285326
Validation score is 0.284719

Fold  3
  Best N trees =  518
  Best ROC-AUC =  0.29544
Validation score is 0.294903

Fold  4
  Best N trees =  233
  Best ROC-AUC =  0.288662
Validation score is 0.288352

Gini for full training set: 0.286299

Local score is0.286334, std is 0.008903


In [90]:
# Fixed budget of 400 rounds for a single fit without validation.
xgbParams3 = dict(
    n_estimators=400,
    max_depth=4,
    objective="binary:logistic",
    learning_rate=0.07,
    subsample=0.8,
    min_child_weight=6,
    colsample_bytree=.8,
    scale_pos_weight=1.6,
    gamma=10,
    reg_alpha=8,
    reg_lambda=1.3,
)

# folds=1: train once on the whole training set and predict the test set.
c = trainXGB(xgbParams3, varSeed=False, folds=1)

In [93]:
# Write the single-fit test predictions to 'xgb6.csv'.
submit(pred = c['test_prediction'], ind = test_df.index, fName= 'xgb6.csv')

LB score is  0.280

In [101]:
# Repeat of the fixed-seed stratified run; the recorded output matches the earlier one.
model4 = trainXGB(xgbParams2, varSeed= False)


Fold  0
  Best N trees =  415
  Best ROC-AUC =  0.270144
Validation score is 0.270101

Fold  1
  Best N trees =  326
  Best ROC-AUC =  0.294042
Validation score is 0.293594

Fold  2
  Best N trees =  292
  Best ROC-AUC =  0.285326
Validation score is 0.284719

Fold  3
  Best N trees =  518
  Best ROC-AUC =  0.29544
Validation score is 0.294903

Fold  4
  Best N trees =  233
  Best ROC-AUC =  0.288662
Validation score is 0.288352

Gini for full training set: 0.286299

Local score is0.286334, std is 0.008903


In [107]:
# Same run once more; the recorded output now includes the per-fold "Gini score" line,
# which matches the 2*AUC-1 validation score.
model4 = trainXGB(xgbParams2, varSeed= False)


Fold  0
  Best N trees =  415
  Best ROC-AUC =  0.270144
  Validation score is 0.270101
  Gini score is 0.270101

Fold  1
  Best N trees =  326
  Best ROC-AUC =  0.294042
  Validation score is 0.293594
  Gini score is 0.293594

Fold  2
  Best N trees =  292
  Best ROC-AUC =  0.285326
  Validation score is 0.284719
  Gini score is 0.284719

Fold  3
  Best N trees =  518
  Best ROC-AUC =  0.29544
  Validation score is 0.294903
  Gini score is 0.294903

Fold  4
  Best N trees =  233
  Best ROC-AUC =  0.288662
  Validation score is 0.288352
  Gini score is 0.288352

Gini for full training set (AUC): 0.286299

Gini for full training set: 0.286299

Local score is0.286334, std is 0.008903
