In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

from pandas.core.common import array_equivalent
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
from sklearn.grid_search import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import train_test_split

# Data Exploration

Let us have a peek at some of this data!



In [2]:
# Column names used throughout the notebook: the row identifier and the label.
idCol, targetCol = "ID", "TARGET"

# Load the training set (it carries the label column) and the test set.
trainData = pd.read_csv('data/train.csv')
testData = pd.read_csv('data/test.csv')

In [3]:
# Report the size of the training set, then preview its first rows.
# A single pre-formatted argument prints the same text on Python 2 and 3.
print('Number of training rows: %d' % len(trainData))

trainData.head()

Number of training rows: 76020


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [4]:
# Report the size of the test set, then preview its first rows.
# A single pre-formatted argument prints the same text on Python 2 and 3.
print('Number of test rows: %d' % len(testData))

testData.head()

Number of test rows: 75818


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40532.1
1,5,2,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45486.72
2,6,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46993.95
3,7,2,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,187898.61
4,9,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,73649.73


Many of the columns above look constant or nearly identical (mostly zeros). Let's check the standard deviation of each column and remove any column that never varies, since such a column carries no information.

In [5]:
colCount = trainData.shape[1]

# A column whose standard deviation is zero holds one value in every training
# row.  Collect the names first and drop them in a single call: this avoids
# mutating the frame while it is being iterated and rebuilds each frame once
# instead of once per removed column.
constantCols = [column for column in trainData.columns
                if trainData[column].std() == 0]  # or len(pd.unique(..)) < 2

# The same labels are removed from the test set so both frames stay aligned.
trainData = trainData.drop(constantCols, axis=1)
testData = testData.drop(constantCols, axis=1)

print('%d columns removed from test/train data.' % (colCount - trainData.shape[1]))

34 columns removed from test/train data.


Let's remove any duplicate columns as well.

In [6]:
# stackoverflow.com/questions/python-pandas-remove-duplicate-columns

def duplicate_columns(frame):
    """Return the labels of columns that are exact copies of a later column.

    Columns are only compared with other columns of the same dtype.  Within a
    dtype group, a column is reported when some column to its right holds the
    same values row for row (NaNs in matching positions count as equal).  The
    right-most member of each set of identical columns is therefore never
    reported, so dropping the returned labels keeps one copy of the data.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    list
        Column labels, ordered by dtype group and then by column position.
    """
    # Bucket labels by dtype up front: columns of different dtypes are never
    # compared, which also keeps the pairwise work smaller.
    labelsByDtype = {}
    for label, dtype in zip(frame.columns, frame.dtypes):
        labelsByDtype.setdefault(dtype, []).append(label)

    dups = []

    for labels in labelsByDtype.values():
        block = frame[labels]
        # Pull each column out once instead of once per comparison.
        columns = [block.iloc[:, k] for k in range(len(labels))]

        for i in range(len(labels)):
            for j in range(i + 1, len(labels)):
                # Series.equals is the public, NaN-aware equality check; it
                # replaces the private pandas.core.common.array_equivalent.
                if columns[i].equals(columns[j]):
                    dups.append(labels[i])
                    break

    return dups

In [7]:
colCount = trainData.shape[1]

# Remove every training column reported as a duplicate, and drop the same
# labels from the test set so the two frames stay aligned.
dupCols = duplicate_columns(trainData)
trainData = trainData.drop(dupCols, axis=1)
testData = testData.drop(dupCols, axis=1)

print('%d columns removed from test/train data.' % (colCount - trainData.shape[1]))

29 columns removed from test/train data.


# Training

### XGB

In [None]:
# Settings reused by the tuning cells: early-stopping patience and CV fold count.
maxRounds = 50
folds = 5

# Label vector, and the feature matrix without the identifier and label columns.
Y = trainData[targetCol]
X = trainData.drop([idCol, targetCol], axis=1)

In [None]:
# Tuning hyperparams using GridSearch.

clf = XGBClassifier()

# Candidate values per XGBClassifier argument.  Single-element lists pin a
# value while still carrying it through to `best_parameters`.
# NOTE(review): 'missing' presumably matches a sentinel used in the data -- confirm.
params = {'objective':['binary:logistic'],
          'learning_rate': [0.01,0.02,0.03], 
          'max_depth': [5,6,7], 
          'min_child_weight': [5,6,7],
          'subsample': [0.8],
          'colsample_bytree': np.linspace(0.5,0.7,5).tolist(),
          'n_estimators': [1000, 5000], 
          'seed': [27],
          'reg_lambda':[1e-5],
          'missing': [9999999999]}

gs = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=-2,verbose=1, refit=True, cv=folds)

gs.fit(X, Y)

# best_params_ / best_score_ describe the candidate with the highest mean
# cross-validated AUC, i.e. the entry a manual max() over grid_scores_ would
# pick, without depending on the deprecated grid_scores_ attribute.
best_parameters, score = gs.best_params_, gs.best_score_

# Each print takes one pre-formatted string so Python 2 does not print a tuple.
print('Raw AUC score: %s' % score)
for p in sorted(best_parameters.keys()):
    print("%s: %r" % (p, best_parameters[p]))

# Scored on the same rows the refitted model was trained on, so this figure is
# optimistic and is not an estimate of out-of-sample performance.
print('Overall AUC: %s' % roc_auc_score(Y, gs.predict_proba(X)[:,1]))

In [None]:
# Tuning hyperparams using HyperOpt (GridSearch alt).
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Hold out 20% of the training rows; every trial is scored on this split.
xTrain, xValid, yTrain, yValid = train_test_split(X, Y, test_size=0.2, random_state=37)

def objective(space):
    """Train one XGBClassifier for a sampled configuration and score it.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space defined in this cell, keyed by
        hyper-parameter name.

    Returns
    -------
    dict
        ``{'loss': 1 - auc, 'status': STATUS_OK}`` where ``auc`` is the ROC AUC
        on the held-out split; fmin minimises ``loss``.
    """
    # hp.quniform yields floats (e.g. 5.0); the count-like parameters must be
    # cast to int before they reach xgboost.
    clf = XGBClassifier(n_estimators = int(space['n_estimators']), 
                        objective = 'binary:logistic',
                        seed=27,
                        learning_rate = space['learning_rate'],
                        max_depth = int(space['max_depth']),
                        min_child_weight = space['min_child_weight'],
                        colsample_bytree = space['colsample_bytree'],
                        subsample = space['subsample'])

    eval_set  = [( xTrain, yTrain), ( xValid, yValid)]

    # early_stopping_rounds lets training halt once the evaluation AUC has not
    # improved for maxRounds rounds, instead of always building n_estimators trees.
    clf.fit(xTrain, yTrain,
            eval_set=eval_set, eval_metric="auc", 
            early_stopping_rounds=maxRounds,verbose=False)

    pred = clf.predict_proba(xValid)[:,1]
    auc = roc_auc_score(yValid, pred)
    print("SCORE: %s" % auc)

    return{'loss':1-auc, 'status': STATUS_OK }


# Search space: each entry is sampled uniformly and rounded to a multiple of
# its last argument.
space = {
         'n_estimators' : hp.quniform('n_estimators', 100, 10000, 100),
         'learning_rate' : hp.quniform('learning_rate', 0.02, 0.05, 0.01),
         'max_depth' : hp.quniform('max_depth', 3, 10, 1),
         'min_child_weight' : hp.quniform('min_child_weight', 3, 8, 1),
         'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
         'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
         }

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

print(best)

In [None]:
# Set the best parameters from GridSearch to initial Classifier. 
# Not using refitted GridSearch classifier..
clf.set_params(**best_parameters)

# Fallback configuration for when the grid search above has not been run:
# clf = XGBClassifier(
#     max_depth=5, 
#     min_child_weight=5,
#     n_estimators=5000,
#     objective = 'binary:logistic',
#     learning_rate=0.01,  
#     subsample=0.8, 
#     colsample_bytree=0.65, 
#     seed=27)

xgbParams = clf.get_xgb_params()

# Build the matrix with the classifier's own missing-value marker so that
# xgb.cv interprets the data the same way clf.fit() does; without it the
# DMatrix falls back to its own default marker.
trainMatrix = xgb.DMatrix(X.values, label=Y.values, missing=clf.missing)

# Cross-validate with early stopping, using n_estimators as the upper bound
# on boosting rounds.
# NOTE(review): `show_progress` is believed to have been renamed `verbose_eval`
# in later xgboost releases -- confirm against the installed version.
cvResult = xgb.cv(xgbParams, trainMatrix, num_boost_round=clf.get_params()['n_estimators'], nfold=folds,
    metrics=['auc'], early_stopping_rounds=maxRounds, show_progress=False)

# One row per boosting round kept by xgb.cv; use that count for the final fit.
clf.set_params(n_estimators=cvResult.shape[0])

In [None]:
# Fit the configured classifier on the full training set.
clf.fit(X, Y, eval_metric='auc')

# Checking best fitted classifier predictions on training data.
# These are in-sample figures; parenthesised single-argument prints produce
# the same text on Python 2 and 3.
print("Accuracy : %.4g" % accuracy_score(Y.values, clf.predict(X)))
print("AUC Score: %f" % roc_auc_score(Y, clf.predict_proba(X)[:,1]))

## Prediction

In [None]:
# Score the test set with the same feature layout used for training
# (everything except the identifier column).
testX = testData.drop([idCol], axis=1)
testY = clf.predict_proba(testX)   

# Keep the second predict_proba column, the same one scored with roc_auc_score
# earlier in the notebook.  The column order is stated explicitly so it does
# not depend on dict ordering.
submission = pd.DataFrame({idCol: testData[idCol], targetCol: testY[:,1]},
                          columns=[idCol, targetCol])

# Create the output directory if needed so a missing folder does not fail the
# write after training has finished.
outputPath = "output/submission.csv"
outputDir = os.path.dirname(outputPath)
if not os.path.isdir(outputDir):
    os.makedirs(outputDir)

submission.to_csv(outputPath, index=False)