In [25]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb

from pandas.core.common import array_equivalent
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost.sklearn import XGBClassifier

## Data Exploration

Let us have a peek at some of this data!



In [4]:
# Names of the identifier and label columns, reused by the cells below.
idCol, targetCol = "ID", "TARGET"

# Read the training and test sets from the local data/ directory.
trainData = pd.read_csv('data/train.csv')
testData = pd.read_csv('data/test.csv')

In [5]:
# Report the size of the training set. The single-argument, parenthesised
# print produces the same text on Python 2 and Python 3.
print('Number of training rows: %d' % len(trainData))

# Last expression of the cell: rich display of the first rows.
trainData.head()

Number of training rows: 76020


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,39205.17,0
1,3,2,34,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,49278.03,0
2,4,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,67333.77,0
3,8,2,37,0,195,195,0,0,0,0,...,0,0,0,0,0,0,0,0,64007.97,0
4,10,2,39,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,117310.979016,0


In [6]:
# Report the size of the test set. The single-argument, parenthesised
# print produces the same text on Python 2 and Python 3.
print('Number of test rows: %d' % len(testData))

# Last expression of the cell: rich display of the first rows.
testData.head()

Number of test rows: 75818


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38
0,2,2,32,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,40532.1
1,5,2,35,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,45486.72
2,6,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46993.95
3,7,2,24,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,187898.61
4,9,2,23,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,73649.73


Many of the columns above are mostly zeros and look very similar to one another. Let's check the standard deviation of each column and remove any column that is constant (zero variance), since it carries no information.

In [7]:
# Find feature columns whose standard deviation in the training data is zero.
# The ID and TARGET columns are never candidates: later cells need both.
featureCols = [column for column in trainData.columns
               if column not in (idCol, targetCol)]

# Collect the columns first and drop them in one step, rather than mutating
# the frame while iterating over it.
constantCols = [column for column in featureCols
                if trainData[column].std() == 0]  # or len(pd.unique(..)) < 2

# The same columns are removed from both frames so their features stay aligned.
trainData = trainData.drop(constantCols, axis=1)
testData = testData.drop(constantCols, axis=1)

print('%d columns removed from test/train data.' % len(constantCols))

34 columns removed from test/train data.


Let's remove any duplicate columns as well.

In [8]:
# stackoverflow.com/questions/python-pandas-remove-duplicate-columns

def duplicate_columns(frame):
    """Return the labels of columns that are exact copies of a later column.

    Columns are only compared with other columns of the same dtype. Within a
    set of identical columns every member except the last one is reported, so
    dropping the returned labels keeps exactly one copy of each.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns are checked for duplicates.

    Returns
    -------
    list
        Labels of the columns that duplicate a later same-dtype column.
        Empty when there are no duplicates.
    """
    # Bucket column *positions* by dtype; positions (rather than labels) keep
    # the lookup unambiguous even if a label occurs more than once.
    positionsByDtype = {}
    for position, dtype in enumerate(frame.dtypes):
        positionsByDtype.setdefault(dtype, []).append(position)

    dups = []
    for positions in positionsByDtype.values():
        # Extract each column once instead of re-slicing inside the inner loop.
        columns = [frame.iloc[:, position] for position in positions]

        for offset, candidate in enumerate(columns):
            for later in columns[offset + 1:]:
                # Series.equals is public pandas API and treats NaNs in the
                # same positions as equal, so it replaces the private
                # pandas.core.common.array_equivalent helper.
                if candidate.equals(later):
                    dups.append(frame.columns[positions[offset]])
                    break

    return dups

In [9]:
# Look for duplicated columns among the features only: ID and TARGET are
# excluded from the search so they can never be dropped by this step.
dupCols = duplicate_columns(trainData.drop([idCol, targetCol], axis=1))

# Duplicates are detected on the training data and removed from both frames
# so that train and test keep the same feature columns.
trainData = trainData.drop(dupCols, axis=1)
testData = testData.drop(dupCols, axis=1)

print('%d columns removed from test/train data.' % len(dupCols))

29 columns removed from test/train data.


## Training

In [37]:
folds = 5        # number of cross-validation folds (also used by the grid search)
maxRounds = 50   # passed to xgb.cv as early_stopping_rounds

clf = XGBClassifier(
    max_depth=5, 
    min_child_weight=5,
    n_estimators=5000,
    objective = 'binary:logistic',
    learning_rate=0.01,  
    subsample=0.8, 
    colsample_bytree=0.65, 
    seed=27)

# Features are every column except the identifier and the label.
X = trainData.drop([idCol, targetCol], axis=1)
Y = trainData[targetCol]

# Cross-validate with early stopping; n_estimators above only acts as the
# upper bound on the number of boosting rounds tried.
xgbParams = clf.get_xgb_params()
trainMatrix = xgb.DMatrix(X.values, label=Y.values)

cvResult = xgb.cv(xgbParams, trainMatrix, num_boost_round=clf.get_params()['n_estimators'], nfold=folds,
    metrics=['auc'], early_stopping_rounds=maxRounds, show_progress=False)

# Refit on the full training set using the number of rows in the CV result
# as the number of estimators.
clf.set_params(n_estimators=cvResult.shape[0])

clf.fit(X, Y, eval_metric='auc')

# Both scores below are computed on the training data itself, so they are
# optimistic compared with the cross-validated figures.
print("\nModel Report")

predictions = clf.predict(X)
# accuracy_score / roc_auc_score are imported directly from sklearn.metrics;
# the previous `metrics.` prefix referred to a name that was never imported.
print("Accuracy : %.4g" % accuracy_score(Y.values, predictions))

predictionProbs = clf.predict_proba(X)
print("AUC Score (Train): %f" % roc_auc_score(Y, predictionProbs[:,1]))

Will train until cv error hasn't decreased in 50 rounds.
Stopping. Best iteration: 821



Model Report
Accuracy : 0.9609
AUC Score (Train): 0.871702


In [36]:
# Grid search over reg_lambda, scored by ROC AUC with `folds`-fold CV.
paramsToTest = {'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]}

# Estimator whose remaining hyper-parameters stay fixed during the search.
baseEstimator = XGBClassifier(
    max_depth=5,
    min_child_weight=5,
    n_estimators=158,
    objective='binary:logistic',
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.65,
    seed=27,
)

gs = GridSearchCV(
    estimator=baseEstimator,
    param_grid=paramsToTest,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=folds,
)
gs.fit(X, Y)

# Last expression of the cell: per-setting scores, best setting, best score.
gs.grid_scores_, gs.best_params_, gs.best_score_

([mean: 0.84105, std: 0.00944, params: {'reg_lambda': 1e-05},
  mean: 0.84117, std: 0.00948, params: {'reg_lambda': 0.01},
  mean: 0.84103, std: 0.00980, params: {'reg_lambda': 0.1},
  mean: 0.84137, std: 0.00974, params: {'reg_lambda': 1},
  mean: 0.83812, std: 0.00854, params: {'reg_lambda': 100}],
 {'reg_lambda': 1},
 0.84137355189602603)

## Prediction

In [39]:
# Score the test set with the fitted classifier; the ID column is not a feature.
testX = testData.drop([idCol], axis=1)
testY = clf.predict_proba(testX)

# The second column of the predict_proba output is written as the target.
# Column order is pinned explicitly instead of relying on dict ordering.
submission = pd.DataFrame({idCol: testData[idCol], targetCol: testY[:,1]},
                          columns=[idCol, targetCol])

# Create the output directory if needed so the final write cannot fail on a
# fresh checkout after the long training step has already run.
submissionPath = "output/submission.csv"
outputDir = os.path.dirname(submissionPath)
if not os.path.isdir(outputDir):
    os.makedirs(outputDir)

submission.to_csv(submissionPath, index=False)