In [6]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg") #Needed to save figures
from sklearn import cross_validation
import xgboost as xgb
from sklearn.metrics import roc_auc_score

training = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)

print(training.shape)
print(test.shape)

print('Percentage unhappy {:04.2f}'.format(100*training.TARGET.sum()/training.shape[0]))

# Replace -999999 in var3 column with most common value 2 
# See https://www.kaggle.com/cast42/santander-customer-satisfaction/debugging-var3-999999
# for details
training = training.replace(-999999,2)

# Replace 9999999999 with NaN
# See https://www.kaggle.com/c/santander-customer-satisfaction/forums/t/19291/data-dictionary/111360#post111360
# training = training.replace(9999999999, np.nan)
# training.dropna(inplace=True)
# Leads to validation_0-auc:0.839577
#
# Not dropping the 9999999999 rows but adding missing=9999999999 in XGBClassifier
# leads to validation_0-auc:0.847446

X = training.iloc[:,:-1]
y = training.TARGET

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selectK = SelectKBest(f_classif, k=220)
selectK.fit(X, y)
X_sel = selectK.transform(X)

features = X.columns[selectK.get_support()]
print (features)

X_train, X_test, y_train, y_test = \
   cross_validation.train_test_split(X_sel, y, random_state=1301, stratify=y, test_size=0.1)
   
print('Percentage unhappy in train {:04.2f}'.format(100*y_train.sum()/y_train.shape[0]))
print('Percentage unhappy in test {:04.2f}'.format(100*y_test.sum()/y_test.shape[0]))

clf = xgb.XGBClassifier(missing=9999999999,
                max_depth = 4,
                n_estimators=5000,
                learning_rate=0.001, 
                nthread=4,
                subsample=0.75,
                colsample_bytree=0.75, 
                seed=4242)
clf.fit(X_train, y_train, early_stopping_rounds=50, eval_metric="auc",
        eval_set=[(X_train, y_train),(X_test, y_test)])
        
print('Overall AUC:', roc_auc_score(y, clf.predict_proba(X_sel, ntree_limit=clf.best_iteration)[:,1]))
    
sel_test = selectK.transform(test)    
y_pred = clf.predict_proba(sel_test, ntree_limit=clf.best_iteration)

submission = pd.DataFrame({"ID":test.index, "TARGET":y_pred[:,1]})
submission.to_csv("submission.csv", index=False)

mapFeat = dict(zip(["f"+str(i) for i in range(len(features))],features))
ts = pd.Series(clf.booster().get_fscore())
ts.index = ts.reset_index()['index'].map(mapFeat)
ts.sort_values()[-15:].plot(kind="barh", title=("features importance"))

featp = ts.sort_values()[-15:].plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
fig_featp = featp.get_figure()
fig_featp.savefig('feature_importance_xgb.png', bbox_inches='tight', pad_inches=1)


(76020, 370)
(75818, 369)
Percentage unhappy 3.00
Index([u'var3', u'var15', u'imp_op_var39_comer_ult1',
       u'imp_op_var39_comer_ult3', u'imp_op_var40_comer_ult1',
       u'imp_op_var40_efect_ult1', u'imp_op_var40_efect_ult3',
       u'imp_op_var40_ult1', u'imp_op_var41_comer_ult1',
       u'imp_op_var41_comer_ult3',
       ...
       u'saldo_medio_var13_largo_ult3', u'saldo_medio_var33_hace2',
       u'saldo_medio_var33_hace3', u'saldo_medio_var33_ult1',
       u'saldo_medio_var33_ult3', u'saldo_medio_var44_hace2',
       u'saldo_medio_var44_hace3', u'saldo_medio_var44_ult1',
       u'saldo_medio_var44_ult3', u'var38'],
      dtype='object', length=220)
Percentage unhappy in train 3.00

 189 192 220 222 234 238 244 248 261 262 303 307 315 319 327 349] are constant.
Will train until validation_1 error hasn't decreased in 50 rounds.
[0]	validation_0-auc:0.702667	validation_1-auc:0.723083
[1]	validation_0-auc:0.762937	validation_1-auc:0.777355
[2]	validation_0-auc:0.773225	validation_1-auc:0.787462
[3]	validation_0-auc:0.778162	validation_1-auc:0.793720
[4]	validation_0-auc:0.797747	validation_1-auc:0.815604
[5]	validation_0-auc:0.800003	validation_1-auc:0.816877
[6]	validation_0-auc:0.803384	validation_1-auc:0.820940
[7]	validation_0-auc:0.805517	validation_1-auc:0.823984
[8]	validation_0-auc:0.812600	validation_1-auc:0.830186
[9]	validation_0-auc:0.815226	validation_1-auc:0.834194
[10]	validation_0-auc:0.816074	validation_1-auc:0.835266
[11]	validation_0-auc:0.816384	validation_1-auc:0.836877
[12]	validation_0-auc:0.814541	validation_1-auc:0.834174
[13]	validation_0-auc:0.816201	validation_1-auc:0.835365
[14]	validation_0-auc:0.816909	validation_1-auc:0.836357
[15]	val


Percentage unhappy in test 3.00
('Overall AUC:', 0.83023551724519784)


[72]	validation_0-auc:0.815138	validation_1-auc:0.832063
Stopping. Best iteration:
[22]	validation_0-auc:0.820897	validation_1-auc:0.840473

