In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train_converted.csv")

In [3]:
test = pd.read_csv('test_converted.csv')

In [4]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import random
from operator import itemgetter
import time
import copy

random.seed(2)




In [5]:
random_state = 0

In [6]:
def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()

def intersect(a, b):
    return list(set(a) & set(b))




In [7]:
eta = 0.1
max_depth = 10
subsample = 0.8
colsample_bytree = 0.8
start_time = time.time()

print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "tree_method": 'exact',
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": 0
}

early_stopping_rounds = 5
test_size = 0.1



XGBoost params. ETA: 0.1, MAX_DEPTH: 10, SUBSAMPLE: 0.8, COLSAMPLE_BY_TREE: 0.8


In [8]:
num_boost_round = 4000

In [9]:
train.columns

Index(['people_id', 'activity_id', 'activity_category', 'char_1_x', 'char_2_x',
       'char_3_x', 'char_4_x', 'char_5_x', 'char_6_x', 'char_7_x', 'char_8_x',
       'char_9_x', 'char_10_x', 'outcome', 'year_x', 'month_x', 'day_x',
       'char_1_y', 'group_1', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
       'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y', 'char_10_y', 'char_11',
       'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17',
       'char_18', 'char_19', 'char_20', 'char_21', 'char_22', 'char_23',
       'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29',
       'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35',
       'char_36', 'char_37', 'char_38', 'year_y', 'month_y', 'day_y'],
      dtype='object')

In [10]:
features = ['char_1_x', 'char_10_x',
       'char_3_x', 'char_4_x', 'char_5_x',  'char_7_x', 'char_8_x',
       'char_9_x', 'year_x', 'year_x', 'month_x', 'day_x',
       'char_1_y', 'group_1', 'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
       'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y', 'char_10_y', 'char_11',
       'char_12', 'char_13', 'char_14', 'char_15', 'char_16', 'char_17',
       'char_18', 'char_19', 'char_20', 'char_21', 'char_22', 'char_23',
       'char_24', 'char_25', 'char_26', 'char_27', 'char_28', 'char_29',
       'char_30', 'char_31', 'char_32', 'char_33', 'char_34', 'char_35',
       'char_36', 'char_37', 'char_38', 'year_y', 'month_y', 'day_y']

In [11]:
target = 'outcome'
X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
print('Length train:', len(X_train.index))
print('Length valid:', len(X_valid.index))
y_train = X_train[target]
y_valid = X_valid[target]
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)

print("Validating...")
check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration+1)
score = roc_auc_score(X_valid[target].values, check)
print('Check error value: {:.6f}'.format(score))

imp = get_importance(gbm, features)
print('Importance array: ', imp)

print("Predict test set...")
test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration+1)

print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
results  = test_prediction.tolist()
print(score)



Length train: 1977561
Length valid: 219730
Will train until eval error hasn't decreased in 5 rounds.
[0]	train-auc:0.944994	eval-auc:0.944689
[1]	train-auc:0.953886	eval-auc:0.953286
[2]	train-auc:0.955561	eval-auc:0.955085
[3]	train-auc:0.959028	eval-auc:0.958473
[4]	train-auc:0.960424	eval-auc:0.959810
[5]	train-auc:0.962354	eval-auc:0.961816
[6]	train-auc:0.962845	eval-auc:0.962320
[7]	train-auc:0.963587	eval-auc:0.963021
[8]	train-auc:0.964650	eval-auc:0.964077
[9]	train-auc:0.964968	eval-auc:0.964395
[10]	train-auc:0.965440	eval-auc:0.964864
[11]	train-auc:0.966182	eval-auc:0.965628
[12]	train-auc:0.966683	eval-auc:0.966104
[13]	train-auc:0.967375	eval-auc:0.966811
[14]	train-auc:0.967881	eval-auc:0.967321
[15]	train-auc:0.968534	eval-auc:0.967931
[16]	train-auc:0.969040	eval-auc:0.968448
[17]	train-auc:0.969572	eval-auc:0.968934
[18]	train-auc:0.970227	eval-auc:0.969601
[19]	train-auc:0.970500	eval-auc:0.969887
[20]	train-auc:0.971262	eval-auc:0.970646
[21]	train-auc:0.971819	eva

In [12]:

def create_submission(score, test, prediction):
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('activity_id,outcome\n')
    total = 0
    for id in test['activity_id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()



In [13]:
create_submission(score, test, test_prediction)

Writing submission:  submission_0.9995241933_2016-09-11-10-17.csv


Importance array:  [('group_1', 74816), ('char_38', 37232), ('month_x', 36850), ('char_7_y', 23362), ('month_y', 21492), ('char_3_y', 18970), ('char_4_y', 15776), ('char_8_y', 13789), ('char_9_y', 12299), ('char_5_y', 12195), ('year_x', 10953), ('char_6_y', 8533), ('year_y', 8400), ('char_25', 3459), ('char_13', 2877), ('char_31', 2725), ('char_10_y', 2473), ('char_1_y', 2400), ('char_34', 2357), ('char_12', 2305), ('char_35', 2252), ('char_14', 2239), ('char_11', 2236), ('char_1_x', 2140), ('char_27', 2055), ('char_18', 2007), ('char_16', 1987), ('char_29', 1979), ('char_15', 1977), ('char_17', 1921), ('char_32', 1891), ('char_20', 1855), ('char_36', 1843), ('char_26', 1821), ('char_30', 1774), ('char_23', 1740), ('char_33', 1693), ('char_24', 1673), ('char_9_x', 1506), ('char_19', 1505), ('char_22', 1455), ('char_21', 1442), ('char_37', 1208), ('char_3_x', 1181), ('char_2_y', 1118), ('char_8_x', 1048), ('char_7_x', 987), ('char_28', 925), ('char_5_x', 724), ('char_4_x', 586)]