In [19]:
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score

In [20]:
warnings.filterwarnings('ignore')
# Set features name
trainSetName = []
testSetName = []
for index in range(32):
    trainSetName.append('F' + str(index+1))
    testSetName.append('F' + str(index+1))
    
trainSetName.append('label')

dataPath = '../dataSet/'
# Load DataSet
trainData = pd.read_csv(dataPath+'trainSet.csv', names=trainSetName)
# Drop header
trainData = trainData.drop([0])

trainLabel = trainData['label'].astype(int)
trainLabel = np.array(trainLabel.tolist())
del trainData['label']

testData = pd.read_csv(dataPath+'testSet.csv', names=testSetName)
testData = testData.drop([0])

# Scale datas
scaler = MinMaxScaler(feature_range=(-1,1))
trainData = scaler.fit_transform(trainData)
testData = scaler.transform(testData)

In [21]:
# Randomized CV for XGBoost parameters tuning
param_test = {
    'max_depth': range(3, 10, 1)
}

XGBModel = XGBClassifier(
    learning_rate = 0.01,
    n_estimators = 2000,
    booster = 'gblinear',
    objective = 'binary:logistic',
    min_child_weight = 1,
    gamma = 0,
    subsample = 0.8,
    colsample_bytree = 0.8,
    scale_pos_weight = 1,
    nthread = 8
)

trainSet = xgb.DMatrix(trainData, label=trainLabel)

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=7, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)

print(rSearch.best_params_)

XGBModel.set_params(max_depth= rSearch.best_params_['max_depth'])

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.681599, total= 1.2min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.8min remaining:    0.0s
[CV] max_depth=3 .....................................................
[CV] ...................... max_depth=3, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.2min remaining:    0.0s
[CV] max_depth=3 ...................................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [22]:
param_test = {
    'min_child_weight': range(1, 8, 1)
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=7, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(min_child_weight= rSearch.best_params_['min_child_weight'])

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] min_child_weight=1 ..............................................
[CV] ............... min_child_weight=1, score=0.681599, total= 1.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[CV] min_child_weight=1 ..............................................
[CV] ............... min_child_weight=1, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[CV] min_child_weight=1 ..............................................
[CV] ............... min_child_weight=1, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s
[CV] min_child_weight=1 ..............................................
[CV] ............... min_child_weight=1, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.3min remaining:    0.0s
[CV] min_child_weight=1 ............................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=2000,
       n_jobs=1, nthread=8, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8)

In [23]:
param_test = {
    'gamma': [i/10.0 for i in range(0, 5)]
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=5, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(gamma= rSearch.best_params_['gamma'])

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] gamma=0.0 .......................................................
[CV] ........................ gamma=0.0, score=0.681599, total= 1.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[CV] gamma=0.0 .......................................................
[CV] ........................ gamma=0.0, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[CV] gamma=0.0 .......................................................
[CV] ........................ gamma=0.0, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s
[CV] gamma=0.0 .......................................................
[CV] ........................ gamma=0.0, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.4min remaining:    0.0s
[CV] gamma=0.0 .....................................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=8, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

In [24]:
param_test = {
    'subsample': [i/10.0 for i in range(6, 10)]
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=4, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(subsample= rSearch.best_params_['subsample'])

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] subsample=0.6 ...................................................
[CV] .................... subsample=0.6, score=0.681599, total= 1.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s
[CV] subsample=0.6 ...................................................
[CV] .................... subsample=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[CV] subsample=0.6 ...................................................
[CV] .................... subsample=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s
[CV] subsample=0.6 ...................................................
[CV] .................... subsample=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.4min remaining:    0.0s
[CV] subsample=0.6 .................................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=8, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6)

In [25]:
param_test = {
    'colsample_bytree': [i/10.0 for i in range(6, 10)]
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=4, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(colsample_bytree= rSearch.best_params_['colsample_bytree'])

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] colsample_bytree=0.6 ............................................
[CV] ............. colsample_bytree=0.6, score=0.681599, total= 1.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[CV] colsample_bytree=0.6 ............................................
[CV] ............. colsample_bytree=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[CV] colsample_bytree=0.6 ............................................
[CV] ............. colsample_bytree=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s
[CV] colsample_bytree=0.6 ............................................
[CV] ............. colsample_bytree=0.6, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.3min remaining:    0.0s
[CV] colsample_bytree=0.6 ..........................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=8, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6)

In [26]:
param_test = {
    'reg_alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=6, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(reg_alpha= rSearch.best_params_['reg_alpha'])

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] reg_alpha=0.0001 ................................................
[CV] ................. reg_alpha=0.0001, score=0.681620, total= 1.3min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.3min remaining:    0.0s
[CV] reg_alpha=0.0001 ................................................
[CV] ................. reg_alpha=0.0001, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.7min remaining:    0.0s
[CV] reg_alpha=0.0001 ................................................
[CV] ................. reg_alpha=0.0001, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.0min remaining:    0.0s
[CV] reg_alpha=0.0001 ................................................
[CV] ................. reg_alpha=0.0001, score=0.680499, total= 1.3min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.3min remaining:    0.0s
[CV] reg_alpha=0.0001 ..............................

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.0, learning_rate=0.01,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=8, objective='binary:logistic',
       random_state=0, reg_alpha=0.0001, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6)

In [35]:
param_test = {
    'reg_lambda': [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
}

rSearch = RandomizedSearchCV(XGBModel, param_test, n_iter=6, scoring='f1', iid=False, cv=5, verbose=50)
rSearch.fit(trainData, trainLabel)
print(rSearch.best_params_)

XGBModel.set_params(reg_lambda= rSearch.best_params_['reg_lambda'])

cvResult = xgb.cv(XGBModel.get_xgb_params(), trainSet, num_boost_round=2000, nfold=5,
                    metrics='auc', early_stopping_rounds=20)

print(cvResult.shape[0])
XGBModel.set_params(n_estimators=cvResult.shape[0])

print(XGBModel.get_xgb_params())

params = XGBModel.get_xgb_params()

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] reg_lambda=0.0001 ...............................................
[CV] ................ reg_lambda=0.0001, score=0.027331, total= 1.2min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s
[CV] reg_lambda=0.0001 ...............................................
[CV] ................ reg_lambda=0.0001, score=0.590370, total= 1.3min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.5min remaining:    0.0s
[CV] reg_lambda=0.0001 ...............................................
[CV] ................ reg_lambda=0.0001, score=0.676439, total= 1.5min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.9min remaining:    0.0s
[CV] reg_lambda=0.0001 ...............................................
[CV] ................ reg_lambda=0.0001, score=0.653667, total= 1.6min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  5.6min remaining:    0.0s
[CV] reg_lambda=0.0001 .............................

In [55]:

params = {
    'learning_rate': 0.01,
    'booster': 'gblinear',
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'max_depth': 4,
    'gamma': 0.0,
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'nthread': 8
}


n_folds = 10
rounds = 500
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=None)
total_accuracy = 0

for index, (trainIndex, testIndex) in enumerate(kf.split(trainData, trainLabel)):
    tr_x = trainData[trainIndex]
    tr_y = trainLabel[trainIndex]
    te_x = trainData[testIndex]
    te_y = trainLabel[testIndex]
    
    trainSet = xgb.DMatrix(tr_x, label=tr_y)
    testSet = xgb.DMatrix(te_x)
    
    model = xgb.train(params, trainSet, rounds, verbose_eval=10)
    preds = model.predict(testSet)
    pred = []
    for ele in preds:
        if ele < 0.5:
            pred.append(0)
        else:
            pred.append(1)
    
    accuracy = f1_score(pred, te_y)
    total_accuracy += accuracy
    # Decide KFold scores
    print('Iteration %d, F1_Score: %.7f' % (index, accuracy))

print('Mean F1_Score: %.7f' % (total_accuracy / n_folds))

Iteration 0, F1_Score: 0.6110688
Iteration 1, F1_Score: 0.6197490
Iteration 2, F1_Score: 0.6141312
Iteration 3, F1_Score: 0.6144596
Iteration 4, F1_Score: 0.6150114
Iteration 5, F1_Score: 0.6114547
Iteration 6, F1_Score: 0.6133409
Iteration 7, F1_Score: 0.6128110
Iteration 8, F1_Score: 0.6145007
Iteration 9, F1_Score: 0.6092475
Mean F1_Score: 0.6135775


In [56]:
# Output final prediction
trainSet = xgb.DMatrix(trainData, label=trainLabel)
testSet = xgb.DMatrix(testData)

model = xgb.train(params, trainSet, rounds, verbose_eval=10)

preds = model.predict(testSet)
pred = []
for ele in preds:
    if ele < 0.5:
        pred.append(0)
    else:
        pred.append(1)

result = pd.DataFrame(columns=['ID', 'Predicted'])
result['ID'] = [x+1 for x in range(len(testData))]
result['Predicted'] = pred
print(result)

result.to_csv('../result/submission.csv', index=False)

            ID  Predicted
0            1          1
1            2          1
2            3          1
3            4          1
4            5          1
5            6          1
6            7          1
7            8          1
8            9          1
9           10          1
10          11          1
11          12          1
12          13          1
13          14          1
14          15          1
15          16          1
16          17          1
17          18          1
18          19          1
19          20          1
20          21          1
21          22          1
22          23          1
23          24          1
24          25          1
25          26          1
26          27          1
27          28          1
28          29          1
29          30          1
...        ...        ...
183237  183238          1
183238  183239          1
183239  183240          1
183240  183241          1
183241  183242          1
183242  183243          1
183243  1832