In [1]:
import os, sys
from datetime import datetime
import numpy as np, pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import auc, roc_auc_score

import xgboost as xgb

sys.path.append('C:\\Users\\Pawel\\PycharmProjects\\Springleaf Marketing Response\\src')
sys.path.append('C:\\Users\\Pawel\\PycharmProjects\\Springleaf Marketing Response\\src\\model_evaluation')
from xgboost_tester import XGBoostTester
import utilities

%matplotlib inline

In [2]:
# Root folder of the competition data. The other folders are built from it,
# so the location only has to be edited in one place.
DATA_DIR = "F:\\Pawel\\Kaggle\\Springleaf Marketing Response\\Data"

# Sub-folders of DATA_DIR: engineered features, model predictions, model results.
FEATURES_DATA_DIR = DATA_DIR + "\\Train_Standard_With_Dates_And_One_Hot"
PREDICTIONS_DIR = DATA_DIR + "\\Model Predictions"
RESULTS_DIR = DATA_DIR + "\\Model Results"

In [3]:
# Read the train / validation feature frames and their target frames from
# HDF5 files (all stored under the key 'data').
X_train, X_validation = (
    pd.read_hdf(os.path.join(FEATURES_DATA_DIR, file_name), 'data')
    for file_name in ('train_train_features.hf5', 'train_validation_features.hf5')
)
Y_train, Y_validation = (
    pd.read_hdf(os.path.join(DATA_DIR, file_name), 'data')
    for file_name in ('train_train_y.hf5', 'train_validation_y.hf5')
)

In [4]:
# Sanity check: both frames must expose the same columns in the same order.
# Index.equals is used instead of an element-wise `==` so that a differing
# number of columns yields False rather than raising.
X_train.columns.equals(X_validation.columns)

True

In [5]:
# Replace every missing value with the sentinel -1, modifying both frames in place.
for feature_frame in (X_train, X_validation):
    feature_frame.fillna(-1, inplace=True)

In [6]:
# Normalise the feature names: strip '_', '-' and spaces from every label and
# apply the same cleaned names to both frames.
# NOTE(review): the column at position 54 is renamed by hand before cleaning,
# presumably to keep it distinct from another column once the separators are
# removed. Confirm, and prefer a name-based lookup over the magic index.
original_names = X_train.columns.tolist()
original_names[54] = 'VAR_0044_kurwa'
cols = [
    label.replace('_', '').replace('-', '').replace(' ', '')
    for label in original_names
]
X_train.columns = cols
X_validation.columns = cols

In [7]:
# Wrap the full feature frames as DMatrix objects, using the 'target' column
# of the corresponding Y frame as label.
dtrain = xgb.DMatrix(
    X_train,
    label=Y_train['target'],
    feature_names=list(X_train.columns),
)
dtest = xgb.DMatrix(
    X_validation,
    label=Y_validation['target'],
    feature_names=list(X_validation.columns),
)

In [8]:
# Train on all features, reporting AUC on the train and validation sets
# after every boosting round.
#
# The tree parameters use their canonical names ('max_depth', 'eta'). The
# 'bst:' prefix is a legacy spelling; NOTE(review): newer XGBoost releases
# are not expected to honour it, in which case depth and learning rate would
# fall back to their defaults. Verify against the installed version.
param = {
    'max_depth': 12,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'min_child_weight': 10,
    'colsample_bytree': 0.1,
    'subsample': 0.5,
    'nthread': 3,
    'eval_metric': 'auc',
    'silent': 1,
    'seed': 0,
}
eval_list = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 200
evals_result = {}  # passed to xgb.train to collect the evaluation history
bst = xgb.train(param, dtrain, num_round, eval_list, evals_result=evals_result)

[0]	train-auc:0.742734	eval-auc:0.697484
[1]	train-auc:0.769576	eval-auc:0.717014
[2]	train-auc:0.786696	eval-auc:0.730088
[3]	train-auc:0.798137	eval-auc:0.737483
[4]	train-auc:0.804429	eval-auc:0.740915
[5]	train-auc:0.807297	eval-auc:0.743523
[6]	train-auc:0.809825	eval-auc:0.745867
[7]	train-auc:0.813050	eval-auc:0.748262
[8]	train-auc:0.814991	eval-auc:0.749908
[9]	train-auc:0.815780	eval-auc:0.751016
[10]	train-auc:0.817411	eval-auc:0.752659
[11]	train-auc:0.819092	eval-auc:0.752991
[12]	train-auc:0.821062	eval-auc:0.754247
[13]	train-auc:0.822407	eval-auc:0.754730
[14]	train-auc:0.823272	eval-auc:0.755609
[15]	train-auc:0.823801	eval-auc:0.755901
[16]	train-auc:0.824625	eval-auc:0.756289
[17]	train-auc:0.826189	eval-auc:0.757239
[18]	train-auc:0.826822	eval-auc:0.757600
[19]	train-auc:0.828021	eval-auc:0.757754
[20]	train-auc:0.828919	eval-auc:0.758346
[21]	train-auc:0.829777	eval-auc:0.758529
[22]	train-auc:0.830489	eval-auc:0.759125
[23]	train-auc:0.831215	eval-auc:0.759628
[2

In [9]:
# Per-feature scores of the trained booster, keyed by feature name; the
# feature-selection cell further down filters columns on these values.
# NOTE(review): features the booster never used are presumably absent from
# this dict (the selection cell guards for missing keys) - confirm.
feat_importances = bst.get_fscore()

In [20]:
# Display the dimensions of the training feature frame.
X_train.shape

(116184, 2473)

In [23]:
# Minimum score (from feat_importances) a feature needs to be kept.
FEATURE_THRESHOLD = 20

# Keep, in the original column order, every feature whose score reaches the
# threshold. Columns without an entry in feat_importances are dropped.
features_to_keep = [
    column
    for column in X_train.columns
    if column in feat_importances and feat_importances[column] >= FEATURE_THRESHOLD
]

# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(len(features_to_keep))

1068


In [24]:
# Rebuild the DMatrix objects using only the selected feature columns, again
# labelled with the 'target' column of the corresponding Y frame.
dtrain = xgb.DMatrix(
    X_train[features_to_keep],
    label=Y_train['target'],
    feature_names=features_to_keep,
)
dtest = xgb.DMatrix(
    X_validation[features_to_keep],
    label=Y_validation['target'],
    feature_names=features_to_keep,
)

In [26]:
# Train again on the current dtrain / dtest, reporting AUC on the train and
# validation sets after every boosting round.
#
# The tree parameters use their canonical names ('max_depth', 'eta'). The
# 'bst:' prefix is a legacy spelling; NOTE(review): newer XGBoost releases
# are not expected to honour it, in which case depth and learning rate would
# fall back to their defaults. Verify against the installed version.
param = {
    'max_depth': 12,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'min_child_weight': 10,
    'colsample_bytree': 0.2,
    'subsample': 0.5,
    'nthread': 3,
    'eval_metric': 'auc',
    'silent': 1,
    'seed': 0,
}
eval_list = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 200
evals_result = {}  # passed to xgb.train to collect the evaluation history
bst = xgb.train(param, dtrain, num_round, eval_list, evals_result=evals_result)

[0]	train-auc:0.749397	eval-auc:0.691848
[1]	train-auc:0.782517	eval-auc:0.723052
[2]	train-auc:0.795729	eval-auc:0.734712
[3]	train-auc:0.804052	eval-auc:0.741010
[4]	train-auc:0.810505	eval-auc:0.746023
[5]	train-auc:0.814571	eval-auc:0.749720
[6]	train-auc:0.818307	eval-auc:0.752114
[7]	train-auc:0.820095	eval-auc:0.753276
[8]	train-auc:0.821748	eval-auc:0.754544
[9]	train-auc:0.822699	eval-auc:0.756303
[10]	train-auc:0.824590	eval-auc:0.757370
[11]	train-auc:0.825920	eval-auc:0.757883
[12]	train-auc:0.827260	eval-auc:0.758427
[13]	train-auc:0.828713	eval-auc:0.758966
[14]	train-auc:0.829545	eval-auc:0.759856
[15]	train-auc:0.830345	eval-auc:0.759673
[16]	train-auc:0.831331	eval-auc:0.759852
[17]	train-auc:0.832105	eval-auc:0.760144
[18]	train-auc:0.833293	eval-auc:0.760879
[19]	train-auc:0.834007	eval-auc:0.761160
[20]	train-auc:0.834881	eval-auc:0.761610
[21]	train-auc:0.835698	eval-auc:0.762010
[22]	train-auc:0.836398	eval-auc:0.762389
[23]	train-auc:0.837366	eval-auc:0.762712
[2

KeyboardInterrupt: 