Based on [olivier's script](https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283)

In [1]:
EARLY_STOPPING_ROUNDS = 50  
# Note: I set EARLY_STOPPING_ROUNDS high so that (when OPTIMIZE_ROUNDS is set)
#       I will get lots of information to make my own judgment.  You should probably
#       reduce EARLY_STOPPING_ROUNDS if you want to do actual early stopping.

I recommend initially setting <code>MAX_ROUNDS</code> fairly high and using <code>OPTIMIZE_ROUNDS</code> to get an idea of the appropriate number of rounds (which, in my judgment, should be close to the maximum value of  <code>best_ntree_limit</code> among all folds, maybe even a bit higher if your model is adequately regularized...or alternatively, you could set <code>verbose=True</code> and look at the details to try to find a number of rounds that works well for all folds).  Then I would turn off <code>OPTIMIZE_ROUNDS</code> and set <code>MAX_ROUNDS</code> to the appropraite number of total rounds.  

The problem with "early stopping" by choosing the best round for each fold is that it overfits to the validation data.    It's therefore liable not to produce the optimal model for predicting test data, and if it's used to produce validation data for stacking/ensembling with other models, it would cause this one to have too much weight in the ensemble.  Another possibility (and the default for XGBoost, it seems) is to use the round where the early stop actually happens (with the lag that verifies lack of improvement) rather than the best round.  That solves the overfitting problem (provided the lag is long enough), but so far it doesn't seem to have helped.  (I got a worse validation score with 20-round early stopping per fold than with a constant number of rounds for all folds, so the early stopping actually seemed to underfit.)


In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from numba import jit
import time
import gc



In [3]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [4]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [14]:
# Functions from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]


def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(series.shape[0]))

In [5]:
def getData():
        # from olivier
    train_features = [
        "ps_car_13",  #            : 1571.65 / shadow  609.23
        "ps_reg_03",  #            : 1408.42 / shadow  511.15
        "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
        "ps_ind_03",  #            : 1219.47 / shadow  230.55
        "ps_ind_15",  #            :  922.18 / shadow  242.00
        "ps_reg_02",  #            :  920.65 / shadow  267.50
        "ps_car_14",  #            :  798.48 / shadow  549.58
        "ps_car_12",  #            :  731.93 / shadow  293.62
        "ps_car_01_cat",  #        :  698.07 / shadow  178.72
        "ps_car_07_cat",  #        :  694.53 / shadow   36.35
        "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
        "ps_car_03_cat",  #        :  611.73 / shadow   50.67
        "ps_reg_01",  #            :  598.60 / shadow  178.57
        "ps_car_15",  #            :  593.35 / shadow  226.43
        "ps_ind_01",  #            :  547.32 / shadow  154.58
        "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
        "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
        "ps_car_06_cat",  #        :  398.02 / shadow  212.43
        "ps_car_04_cat",  #        :  376.87 / shadow   76.98
        "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
        "ps_car_09_cat",  #        :  214.12 / shadow   81.38
        "ps_car_02_cat",  #        :  203.03 / shadow   26.67
        "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
        "ps_car_11",  #            :  173.28 / shadow   76.45
        "ps_car_05_cat",  #        :  172.75 / shadow   62.92
        "ps_calc_09",  #           :  169.13 / shadow  129.72
        "ps_calc_05",  #           :  148.83 / shadow  120.68
        "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
        "ps_car_08_cat",  #        :  120.87 / shadow   28.82
        "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
        "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
        "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
        "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
        "ps_ind_14",  #            :   37.37 / shadow   16.65
    ]
    
    # Read data
    train_df = pd.read_csv('../data/train.csv', na_values="-1", index_col='id') # .iloc[0:200,:]
    test_df = pd.read_csv('../data/test.csv', na_values="-1", index_col='id')
    
    train_df = train_df.loc[:, train_features + ["target"]]
    test_df  = test_df.loc[:,train_features]
    return [train_df, test_df]

In [6]:
[train_df, test_df] = getData()

In [7]:
class featureCombination:
    """
    Class provide usual "fit-tranform" functionality for feature combinations 
    """
    def __init__(self, combinationList):
        self.comninationList = combinationList
        self.encoders = dict()
    
    def fit_transform(self,df):
        X = df.copy() 
        
        for n_c, (f1, f2) in enumerate(self.comninationList):
            fName = f1 + "_plus_" + f2
            # add new feature
            X.insert(X.shape[1], fName, X[f1].astype('str')+"_"+X[f2].astype('str'))
                     
            # encode new feature         
            lbl = LabelEncoder()
            X[fName] = lbl.fit_transform(X[fName])
            self.encoders.update({fName:lbl})
               
        return X
    
    def transform(self,df):
        X = df.copy() 
        
        for n_c, (f1, f2) in enumerate(self.comninationList):
            fName = f1 + "_plus_" + f2
            # add new feature
            X.insert(X.shape[1], fName, X[f1].astype('str')+"_"+X[f2].astype('str'))
                     
            # encode new feature         
            lbl = self.encoders.get(fName)
            X[fName] = lbl.fit_transform(X[fName])
            self.encoders.update({fName:lbl})
               
        return X

In [27]:
class targetEncode():
    
    
    def __init__(self, features, min_samples_leaf=1, smoothing=1, fillna = True):
        self.suffix = '_tarEncd'
        
        self.features = features
        
        self.min_samples_leaf = 1
        self.smoothing = smoothing
        
        self.encoders = dict()
        self.prior = 1.0
        self.fillna = fillna
        
        
    def fit_transform(self, inpDf, targetColumn = 'target'):
        """
        Fit to features and transform them
        params:
            df - dataframe to process
            targetColumn  - name of columns which will be used as a target variable 
        """

        for colName in self.features:
            
            if not (colName in inpDf.columns):
                raise Exception('Columns name mismatch', 'Column %s was not found in dataframe'%colName)
            
            self.prior = inpDf[targetColumn].mean()
            averages = inpDf.groupby(by= colName)[targetColumn].agg(["mean", "count"])
            smoothing = 1 / (1 + np.exp(-(averages["count"] - self.min_samples_leaf) / self.smoothing))
            averages['target'] = self.prior * (1 - smoothing) + averages["mean"] * smoothing
            self.encoders.update({colName: averages.to_dict(orient= 'dict')['target']})
            
        return self.transform(inpDf)
    
    def transform(self, inpDf):
        """
        Transform features
        params:
            df - dataframe to process
        """
        
        df = inpDf.copy()    
        for colName in self.features:
            if colName+self.suffix in df.columns:
                df.drop(colName+self.suffix, axis = 1, inplace= True)
            
            ts = df[colName].map(self.encoders[colName], na_action= 'ignore')
            
            if self.fillna:
                ts.fillna(self.prior, inplace = True)
            
            df.insert(df.shape[1], colName+self.suffix, ts)
        return df    

In [9]:
# Set up classifier
def getXGBModel(params,seed = 43):
    return XGBClassifier(seed = seed, **params)

In [10]:
# add combinations
combs = [('ps_reg_01', 'ps_car_02_cat'),  
         ('ps_reg_01', 'ps_car_04_cat')]

fComb = featureCombination(combs)

train_df = fComb.fit_transform(train_df)
test_df = fComb.transform(test_df)

In [11]:
OPTIMIZE_ROUNDS = True

In [12]:
f_cats = [x for x in test_df.columns if x.endswith('_cat')]

In [13]:
def submit(predDf, fName = 'submission.csv'):
    """
    Write submission file.
    params:
        pred - predictions (array)
        ind - index for prediction (array)
        fName - name of file (string)
    """
    predDf.rename(columns={'prediction':'target'},inplace=True)
    predDf.to_csv(fName,index_label='id',float_format='%.6f')

In [43]:
def trainXGB(params, varSeed= True, folds= 5, encode_target= True):
    # Run CV
    localScore = []
    
    valDf = pd.DataFrame(0, index = train_df.index, columns = ['fold','prediction'])
    testDf = pd.DataFrame(0, index = test_df.index, columns = ['prediction'])

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    for i, (train_index, test_index) in enumerate(skf.split(train_df.drop("target", axis= 1), train_df.target)):

        print "\nFold ", i

        # Create data for this fold
        if encode_target:
            te = targetEncode(features= f_cats, min_samples_leaf= 200, smoothing= 100)
            X_train = te.fit_transform(train_df.iloc[train_index,:])
            X_valid = te.transform(train_df.iloc[test_index,:])
            X_test  = te.transform(test_df)
        else:
            X_train = train_df.iloc[train_index,:]
            X_valid = train_df.iloc[test_index,:]
            X_test  = test_df

        model = getXGBModel(params, i if varSeed else 0)
        model.fit(X_train.drop("target", axis= 1), X_train.target, 
                   eval_set= [(X_valid.drop("target", axis= 1), X_valid.target)],
                   eval_metric= 'auc',
                   early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                   verbose=False)

        # Generate validation predictions for this fold
        pred = model.predict_proba(X_valid.drop("target", axis= 1))[:,1]
        valDf.prediction.iloc[test_index] = pred
        valDf.fold.iloc[test_index] = i

        ls = 2*roc_auc_score(X_valid.target, pred)-1
        localScore.append(ls)

        print "  Best N trees = ", model.best_ntree_limit
        print "  Best ROC-AUC = ", 2*model.best_score-1
        print "  Validation score is %f"%ls

        testDf['prediction']+= model.predict_proba(X_test)[:,1]

    testDf['prediction'] /= folds  # Average test set predictions
    localScore =np.array(localScore)

    print "\nGini for full training set (AUC): %f"%(2*roc_auc_score(train_df.target, valDf.prediction)-1)
    print "\nLocal score is %f, std is %f"%(localScore.mean(), localScore.std())
        
    return {'test_prediction':testDf, 'localScore':localScore, 'train_prediction': valDf}

In [41]:
xgbParams = {'n_estimators': 1000,
             'max_depth': 4,
             'objective': "binary:logistic",
             'learning_rate': 0.07,
             'subsample': 0.8,
             'min_child_weight': 6,
             'colsample_bytree': .8,
             'scale_pos_weight': 1.6,
             'gamma': 10,
             'reg_alpha': 8,
             'reg_lambda': 1.3}

In [44]:
model1 = trainXGB(xgbParams, encode_target= True)


Fold  0
  Best N trees =  482
  Best ROC-AUC =  0.270762
  Validation score is 0.270367

Fold  1
  Best N trees =  356
  Best ROC-AUC =  0.294946
  Validation score is 0.294701

Fold  2
  Best N trees =  226
  Best ROC-AUC =  0.28695
  Validation score is 0.286391

Fold  3
  Best N trees =  462
  Best ROC-AUC =  0.296286
  Validation score is 0.296028

Fold  4
  Best N trees =  284
  Best ROC-AUC =  0.28902
  Validation score is 0.288700

Gini for full training set (AUC): 0.287122

Local score is 0.287237, std is 0.009170


In [45]:
model2 = trainXGB(xgbParams, encode_target= False)


Fold  0
  Best N trees =  567
  Best ROC-AUC =  0.2709
  Validation score is 0.270641

Fold  1
  Best N trees =  324
  Best ROC-AUC =  0.29263
  Validation score is 0.292186

Fold  2
  Best N trees =  383
  Best ROC-AUC =  0.286054
  Validation score is 0.285971

Fold  3
  Best N trees =  302
  Best ROC-AUC =  0.290518
  Validation score is 0.290311

Fold  4
  Best N trees =  178
  Best ROC-AUC =  0.28382
  Validation score is 0.283716

Gini for full training set (AUC): 0.284444

Local score is 0.284565, std is 0.007585


In [47]:
model2['test_prediction']

Unnamed: 0_level_0,prediction
id,Unnamed: 1_level_1
0,0.042243
1,0.043655
2,0.038021
3,0.022985
4,0.055887
5,0.068505
6,0.024037
8,0.055165
10,0.077307
11,0.090465


In [48]:
submit(model2['test_prediction'],'xgb5.csv')

0.282

In [55]:
submit(model1['test_prediction'],'xgb5.csv')

0.281

In [56]:
summ = 0.5*(model1['test_prediction']+model2['test_prediction'])

In [57]:
submit(summ,'xgb6.csv')

In [58]:
0.283

0.283

In [60]:
import joblib

In [61]:
joblib.dump(model1,'xgbModel1.dmp')

['xgbModel1.dmp']

In [62]:
joblib.dump(model2,'xgbModel2.dmp')

['xgbModel2.dmp']