In [1]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import time
import datetime
import warnings
warnings.filterwarnings('ignore')



In [2]:
action_tr = pd.read_csv('../data/input/train/action_train.csv')  # 用户行为数据
order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')  # 待预测数据
order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv')  # 用户历史订单数据
user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv')  # 用户评论数据
user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv')  # 用户个人信息

action_te = pd.read_csv('../data/input/test/action_test.csv')
order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')
order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv')
user_comment_te = pd.read_csv('../data/input/test/userComment_test.csv')
user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv')

In [3]:
action = pd.concat([action_tr, action_te], axis=0).reset_index(drop=True)
order_history = pd.concat([order_history_tr, order_history_te], axis=0).reset_index(drop=True)
user_comment = pd.concat([user_comment_tr, user_comment_te], axis=0).reset_index(drop=True)
user_profile = pd.concat([user_profile_tr, user_profile_te], axis=0).reset_index(drop=True)

In [4]:
def translate_date(date):
    year = int(date[:4])
    month = int(date[-2:])
    return (year - 2010) * 12 + month

In [5]:
def cal_time_gap(t):
    s = '2018-01-01'
    return time.mktime(datetime.datetime.strptime(s, "%Y-%m-%d").timetuple()) - t

In [6]:
def get_interaction_feature(df, feature_A, feature_B):
    feature_A_list = sorted(df[feature_A].unique())
    feature_B_list = sorted(df[feature_B].unique())
    count = 0
    mydict = {}
    for i in feature_A_list:
        mydict[int(i)] = {}
        for j in feature_B_list:
            mydict[int(i)][int(j)] = count
            count += 1
    return df.apply(lambda x: mydict[int(x[feature_A])][int(x[feature_B])], axis=1)

In [7]:
def print_label_encoder_mapping(le):
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    for k, v in le_name_mapping.items():
        print str(k) + ': ' + str(v)

In [8]:
def get_user_profile_feature(df):
    df = df.copy()

    mydf = df[['userid']]
    le = preprocessing.LabelEncoder()
    mydf['gender'] = le.fit_transform(df['gender'])

    mydf['province'] = le.fit_transform(df['province'])

    mydf['age'] = le.fit_transform(df['age'])

    return mydf

In [9]:
def get_action_feature(df):
    df = df.copy()

    df = df.sort_values(by=['userid', 'actionTime'])
    
    mydf = df.groupby('userid').size().reset_index()
    mydf.columns = ['userid', 'action_num']

    act_type_last = mydf[['userid']]
    for i in [1, 2, 3, 4]:
        tmp = df.groupby('userid').nth(-i)['actionType'].reset_index()
        tmp.columns = ['userid', 'act_type_last' + str(i)]
        act_type_last = pd.merge(act_type_last, tmp, how='left', on='userid')

    act_type_first = mydf[['userid']]
    for i in [1, 2]:
        tmp = df.groupby('userid').nth(i)['actionType'].reset_index()
        tmp.columns = ['userid', 'act_type_first' + str(i)]
        act_type_first = pd.merge(act_type_first, tmp, how='left', on='userid')

    time_gap_stat = cal_time_gap(df['actionTime']).groupby(df['userid']).agg([np.mean, np.std, min, max, np.ptp]).reset_index()
    time_gap_stat.columns = ['userid', 'action_time_gap_mean', 'action_time_gap_std', 'action_time_gap_min', 'action_time_gap_max', 'action_time_gap_ptp']

    time_gap_last = mydf[['userid']]
    for i in [1, 2, 3, 4]:
        tmp = cal_time_gap(df['actionTime']).groupby(df['userid']).nth(-i).reset_index()
        tmp.columns = ['userid', 'time_gap_last' + str(i)]
        time_gap_last = pd.merge(time_gap_last, tmp, how='left', on='userid')

    time_gap_first = mydf[['userid']]
    for i in [1, 2]:
        tmp = cal_time_gap(df['actionTime']).groupby(df['userid']).nth(i).reset_index()
        tmp.columns = ['userid', 'time_gap_first' + str(i)]
        time_gap_first = pd.merge(time_gap_first, tmp, how='left', on='userid')
    
    time_gap_last123 = cal_time_gap(df['actionTime']).groupby(df['userid']).nth([-1, -2, -3]).reset_index()
    time_gap_last123_stat = time_gap_last123.groupby('userid')['actionTime'].agg([np.mean, np.std, min, max, np.ptp]).reset_index()
    time_gap_last123_stat.columns = ['userid', 'time_gap_last123_mean', 'time_gap_last123_std', 'time_gap_last123_min', 'time_gap_last123_max', 'time_gap_last123_ptp']

    df['dist'] = df.groupby('userid')['actionTime'].rank(ascending=False)
    dist = mydf[['userid']]
    for i in range(1, 10):
        tmp = df[df['actionType'] == i].groupby('userid')['dist'].nth(-1).reset_index()
        tmp.columns = ['userid', 'dist' + str(i)]
        dist = pd.merge(dist, tmp, how='left', on='userid')

    time_gap_all_type = mydf[['userid']]
    for i in range(1, 10):
        tmp = df[df['actionType'] == i].groupby('userid')['actionTime'].nth(-1).reset_index()
        tmp['actionTime'] = cal_time_gap(tmp['actionTime'])
        tmp.columns = ['userid', 'time_gap' + str(i)]
        time_gap_all_type = pd.merge(time_gap_all_type, tmp, how='left', on='userid')

    # 构建时间间隔序列 time_gap
    sub = df.loc[:(len(df) - 2), 'actionTime']  # 构建减数 Serise
    sub.index = sub.index + 1  # 索引加一
    df['time_gap'] = df.loc[1:, 'actionTime'] - sub  # 计算时间间隔
    tmp = df.groupby('userid').nth(0).reset_index()  # 找到用户的第一个 action
    tmp['time_gap'] = 0  # 把所有用户的第一个 action 的 time_gap 赋 0
    df = pd.merge(df, tmp, how='left', on=['userid', 'actionTime', 'actionType', 'dist']).fillna(1)  # merge 起来后，空值填充 1
    df['time_gap'] = df['time_gap_x'] * df['time_gap_y']  # 两列相乘，将所有用户的第一个 action 的 time_gap 赋 0
    df = df.drop(['time_gap_x', 'time_gap_y'], axis=1)  # 移除多余的列
    df = df.replace(0, np.nan)  # 0 -> NaN

    time_gap_seq_stat = mydf[['userid']]
    for i in range(1, 10):
        lo = df[df['actionType'] == i].reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
        hi = df.reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
        lo_hi = pd.merge(lo, hi, how='left', on='userid')
        lo_hi.columns = ['userid', 'lo_ind', 'hi_ind']
        index_list = []
        for index, row in lo_hi.iterrows():
            index_list += range(row['lo_ind'], row['hi_ind'] + 1)
        tmp = df.iloc[index_list,:]
        tmp = tmp.groupby('userid')['time_gap'].agg([np.mean, np.std, min, max, np.ptp]).reset_index()
        tmp.columns = ['userid', 'time_gap_seq' + str(i) + '_mean', 'time_gap_seq' + str(i) + '_std', 'time_gap_seq' + str(i) + '_min', 'time_gap_seq' + str(i) + '_max', 'time_gap_seq' + str(i) + '_ptp']
        time_gap_seq_stat = pd.merge(time_gap_seq_stat, tmp, how='left', on='userid')

    for i in range(1, 10):
        time_gap_seq_stat['time_gap_seq' + str(i) + '_mean*std'] = time_gap_seq_stat['time_gap_seq' + str(i) + '_mean'] * time_gap_seq_stat['time_gap_seq' + str(i) + '_std']

    type_num_rate = mydf.copy()
    for i in range(1, 10):
        tmp = df[df['actionType'] == i].groupby('userid').size().reset_index()
        tmp.columns = ['userid', 'type' + str(i) + '_num']
        type_num_rate = pd.merge(type_num_rate, tmp, how='left', on='userid')
        type_num_rate['type' + str(i) + '_rate'] = type_num_rate['type' + str(i) + '_num'] / type_num_rate['action_num']
    type_num_rate = type_num_rate.drop('action_num', axis=1)
    
    # 构建时间间隔的间隔序列 time_gap_gap
    sub = df.loc[:(len(df) - 2), 'time_gap']  # 构建减数 Serise
    sub.index = sub.index + 1  # 索引加一
    df['time_gap_gap'] = df.loc[1:, 'time_gap'] - sub  # 计算时间间隔的间隔
    tmp = df.groupby('userid').nth([0, 1]).reset_index()  # 找到用户的第一和二个 action
    tmp['time_gap_gap'] = 0  # 把所有用户的第一和二个 action 的 time_gap_gap 赋 0
    df = pd.merge(df, tmp, how='left', on=['userid', 'actionTime', 'actionType', 'dist', 'time_gap']).fillna(1)  # merge 起来后，空值填充 1
    df['time_gap_gap_y'] = df['time_gap_gap_y'].replace(0, np.nan)  # 0 替换成 NaN
    df['time_gap_gap'] = df['time_gap_gap_x'] * df['time_gap_gap_y']  # 两列相乘，将所有用户的第一和二个 action 的 time_gap 赋 NaN
    df = df.drop(['time_gap_gap_x', 'time_gap_gap_y'], axis=1)  # 移除多余的列

    time_gap_gap_seq_stat = mydf[['userid']]
    for i in range(1, 10):
        lo = df[df['actionType'] == i].reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
        hi = df.reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
        lo_hi = pd.merge(lo, hi, how='left', on='userid')
        lo_hi.columns = ['userid', 'lo_ind', 'hi_ind']
        index_list = []
        for index, row in lo_hi.iterrows():
            index_list += range(row['lo_ind'], row['hi_ind'] + 1)
        tmp = df.iloc[index_list,:]
        tmp = tmp.groupby('userid')['time_gap_gap'].agg([np.mean, np.std, min, max, np.ptp]).reset_index()
        tmp.columns = ['userid', 'time_gap_gap_seq' + str(i) + '_mean', 'time_gap_gap_seq' + str(i) + '_std', 'time_gap_gap_seq' + str(i) + '_min', 'time_gap_gap_seq' + str(i) + '_max', 'time_gap_gap_seq' + str(i) + '_ptp']
        time_gap_gap_seq_stat = pd.merge(time_gap_gap_seq_stat, tmp, how='left', on='userid')
    
    time_stat = df.groupby('userid')['actionTime'].agg([min, max, np.ptp, np.mean, np.std]).reset_index()
    time_stat.columns = ['userid', 'time_stat_min', 'time_stat_max', 'time_stat_ptp', 'time_stat_mean', 'time_stat_std']
    
    time_stat_gb_uid_type = df.groupby(['userid', 'actionType'])['actionTime'].agg([min, max, np.ptp, np.mean, np.std]).unstack().reset_index()
    columns_name = ['time_' + i + '_' + str(j) for i in ['min', 'max', 'ptp', 'mean', 'std'] for j in range(1, 10)]
    columns_name.insert(0, 'userid')
    time_stat_gb_uid_type.columns = columns_name

    mydf = pd.merge(mydf, act_type_last, how='left', on='userid')
    mydf = pd.merge(mydf, act_type_first, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_stat, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_last, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_first, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_last123_stat, how='left', on='userid')
    mydf = pd.merge(mydf, dist, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_all_type, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_seq_stat, how='left', on='userid')
    mydf = pd.merge(mydf, type_num_rate, how='left', on='userid')
    mydf = pd.merge(mydf, time_gap_gap_seq_stat, how='left', on='userid')
    mydf = pd.merge(mydf, time_stat, how='left', on='userid')
    mydf = pd.merge(mydf, time_stat_gb_uid_type, how='left', on='userid')

    return mydf

In [10]:
def get_order_history_feature(df):
    df = df.copy()
    
    df = df.sort_values(by=['userid', 'orderTime'])
    le = preprocessing.LabelEncoder()
    df['city'] = le.fit_transform(df['city'])
    df['country'] = le.fit_transform(df['country'])
    df['continent'] = le.fit_transform(df['continent'])

    mydf = df.groupby('userid')['orderType'].agg([len, sum, max]).reset_index()
    mydf.columns = ['userid', 'order_num', 'pos_num', 'if_pos']
    mydf['pos_rate'] = mydf['pos_num'] / mydf['order_num']

#     time_gap_last = mydf[['userid']]
#     for i in [1, 2, 3]:
#         tmp = cal_time_gap(df['orderTime']).groupby(df['userid']).nth(-i).reset_index()
#         tmp.columns = ['userid', 'ord_time_gap_last' + str(i)]
#         time_gap_last = pd.merge(time_gap_last, tmp, how='left', on='userid')

#     ord_type_last = mydf[['userid']]
#     for i in [1, 2, 3]:
#         tmp = df['orderType'].groupby(df['userid']).nth(-i).reset_index()
#         tmp.columns = ['userid', 'ord_type_last' + str(i)]
#         ord_type_last = pd.merge(ord_type_last, tmp, how='left', on='userid')

#     ord_time_gap_last123 = cal_time_gap(df['orderTime']).groupby(df['userid']).nth([-1, -2, -3]).reset_index()
#     ord_time_gap_last123_stat = ord_time_gap_last123.groupby('userid')['orderTime'].agg([np.mean, np.std, min, max, np.ptp]).reset_index()
#     ord_time_gap_last123_stat.columns = ['userid', 'ord_time_gap_last123_mean', 'ord_time_gap_last123_std', 'ord_time_gap_last123_min', 'ord_time_gap_last123_max', 'ord_time_gap_last123_ptp']
    
#     ord_type_last123 = df['orderType'].groupby(df['userid']).nth([-1, -2, -3]).reset_index()
#     ord_type_last123_stat = ord_type_last123.groupby('userid')['orderType'].agg([sum, min, max]).reset_index()
#     ord_type_last123_stat.columns = ['userid', 'ord_type_last123_sum', 'ord_type_last123_min', 'ord_type_last123_max']

#     # 构建时间间隔序列 time_gap
#     sub = df.loc[:(len(df) - 2), 'orderTime']  # 构建减数 Serise
#     sub.index = sub.index + 1  # 索引加一
#     df['time_gap'] = df.loc[1:, 'orderTime'] - sub  # 计算时间间隔
#     tmp = df.groupby('userid').nth(0).reset_index()  # 找到用户的第一个 action
#     tmp['time_gap'] = 0  # 把所有用户的第一个 action 的 time_gap 赋 0
#     df = pd.merge(df, tmp, how='left', on=['userid', 'orderid', 'orderType', 'city', 'country', 'continent']).fillna(1)  # merge 起来后，空值填充 1
#     df['time_gap'] = df['time_gap_x'] * df['time_gap_y']  # 两列相乘，将所有用户的第一个 action 的 time_gap 赋 0
#     df = df.drop(['time_gap_x', 'time_gap_y'], axis=1)  # 移除多余的列
#     df['time_gap'] = df['time_gap'].replace(0, np.nan)  # 0 -> NaN

#     ord_time_gap_seq_stat = mydf[['userid']]
#     lo = df[df['orderType'] == 1].reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
#     hi = df.reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
#     lo_hi = pd.merge(lo, hi, how='left', on='userid')
#     lo_hi.columns = ['userid', 'lo_ind', 'hi_ind']
#     index_list = []
#     for index, row in lo_hi.iterrows():
#         index_list += range(row['lo_ind'], row['hi_ind'] + 1)
#     tmp = df.iloc[index_list,:]
#     tmp = tmp.groupby('userid')['time_gap'].agg([np.mean, np.std, min, max, np.ptp]).reset_index()
#     tmp.columns = ['userid', 'ord_time_gap_seq1_mean', 'ord_time_gap_seq1_std', 'ord_time_gap_seq1_min', 'ord_time_gap_seq1_max', 'ord_time_gap_seq1_ptp']
#     ord_time_gap_seq_stat = pd.merge(ord_time_gap_seq_stat, tmp, how='left', on='userid')

    continent_num = df.groupby(['userid', 'continent']).size().unstack().reset_index()
    continent_num.columns = [i if i == 'userid' else 'continent' + str(i) for i in continent_num.columns]
    
    country_num = df.groupby(['userid', 'country']).size().unstack().reset_index()
    country_num.columns = [i if i == 'userid' else 'country' + str(i) for i in country_num.columns]
    
    city_num = df.groupby(['userid', 'city']).size().unstack().reset_index()
    city_num.columns = [i if i == 'userid' else 'city' + str(i) for i in city_num.columns]
    
#     mydf = pd.merge(mydf, time_gap_last, how='left', on='userid')
#     mydf = pd.merge(mydf, ord_type_last, how='left', on='userid')
#     mydf = pd.merge(mydf, ord_time_gap_last123_stat, how='left', on='userid')
#     mydf = pd.merge(mydf, ord_type_last123_stat, how='left', on='userid')
#     mydf = pd.merge(mydf, ord_time_gap_seq_stat, how='left', on='userid')
    mydf = pd.merge(mydf, continent_num, how='left', on='userid')
    mydf = pd.merge(mydf, country_num, how='left', on='userid')
    mydf = pd.merge(mydf, city_num, how='left', on='userid')

    return mydf

In [11]:
def get_user_comment_feature(df):
    df = df.copy()
    
    mydf = df.groupby('userid').size().reset_index()
    mydf.columns = ['userid', 'comment_num']
    
    rating = df[['userid', 'rating']]
    
    mydf = pd.merge(mydf, rating, how='left', on='userid')

    return mydf

In [None]:
user_profile_feat_tr = get_user_profile_feature(user_profile_tr)

In [None]:
action_feat_tr = get_action_feature(action_tr)

In [None]:
order_history_feat_tr = get_order_history_feature(order_history_tr)

In [None]:
user_comment_feat_tr = get_user_comment_feature(user_comment_tr)

In [None]:
dataset_tr = pd.merge(user_profile_feat_tr, action_feat_tr, on='userid', how='left')
dataset_tr = pd.merge(dataset_tr, order_history_feat_tr, on='userid', how='left')
dataset_tr = pd.merge(dataset_tr, user_comment_feat_tr, on='userid', how='left')

trainset = pd.merge(order_future_tr, dataset_tr, on='userid', how='left')

train_feature = trainset.drop(['orderType'], axis=1)
train_label = trainset.orderType.values

print train_feature.shape, train_label.shape

In [12]:
user_profile_feat = get_user_profile_feature(user_profile)
action_feat = get_action_feature(action)
order_history_feat = get_order_history_feature(order_history)
user_comment_feat = get_user_comment_feature(user_comment)

dataset = pd.merge(user_profile_feat, action_feat, on='userid', how='left')
dataset = pd.merge(dataset, order_history_feat, on='userid', how='left')
dataset = pd.merge(dataset, user_comment_feat, on='userid', how='left')

trainset = pd.merge(order_future_tr, dataset, on='userid', how='left')
testset = pd.merge(order_future_te, dataset, on='userid', how='left')

train_feature = trainset.drop(['orderType'], axis=1)
train_label = trainset.orderType.values
test_feature = testset
test_index = testset.userid.values

print train_feature.shape, train_label.shape
print test_feature.shape

(40307, 493) (40307L,)
(10076, 493)


In [13]:
config = {
    'rounds': 10000,
    'folds': 5
}

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'stratified': True,
    'scale_pos_weights ': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'lambda': 1,

    'eta': 0.02,
    'seed': 20,
    'silent': 1,
    'eval_metric': 'auc'
}

In [14]:
def xgb_cv(train_feature, train_label, params, folds, rounds):
    start = time.clock()
    print train_feature.columns
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    num_round = rounds
    print 'run cv: ' + 'round: ' + str(rounds)
    res = xgb.cv(params, dtrain, num_round, nfold=folds, verbose_eval=10, early_stopping_rounds=100)
    elapsed = (time.clock() - start)
    print 'Time used:', elapsed, 's'
    return len(res), res.loc[len(res) - 1, 'test-auc-mean']


def xgb_predict(train_feature, train_label, test_feature, rounds, params):
    params['scale_pos_weights '] = float(len(train_label[train_label == 0])) / len(train_label[train_label == 1])
    dtrain = xgb.DMatrix(train_feature, label=train_label)
    dtest = xgb.DMatrix(test_feature, label=np.zeros(test_feature.shape[0]))
    watchlist = [(dtrain, 'train')]
    num_round = rounds
    model = xgb.train(params, dtrain, num_round, watchlist, verbose_eval=30)
    predict = model.predict(dtest)
    return model, predict


def store_result(test_index, pred, name):
    result = pd.DataFrame({'userid': test_index, 'orderType': pred})
    result.to_csv('../data/output/sub/' + name + '.csv', index=0, columns=['userid', 'orderType'])
    return result

In [15]:
iterations, best_score = xgb_cv(train_feature, train_label, params, config['folds'], config['rounds'])

Index([u'userid', u'gender', u'province', u'age', u'action_num',
       u'act_type_last1', u'act_type_last2', u'act_type_last3',
       u'act_type_last4', u'act_type_first1',
       ...
       u'city210', u'city211', u'city212', u'city213', u'city214', u'city215',
       u'city216', u'city217', u'comment_num', u'rating'],
      dtype='object', length=493)
run cv: round: 10000


Will train until cv error hasn't decreased in 100 rounds.
[0]	cv-test-auc:0.84752+0.00943843921419	cv-train-auc:0.8542972+0.00644084619906
[10]	cv-test-auc:0.8974102+0.00398040040197	cv-train-auc:0.9045518+0.00256507554665
[20]	cv-test-auc:0.9022432+0.00535910068575	cv-train-auc:0.9101908+0.00191160115087
[30]	cv-test-auc:0.9058238+0.00611850747814	cv-train-auc:0.9139224+0.00268184963039
[40]	cv-test-auc:0.909079+0.00548902384764	cv-train-auc:0.9176832+0.00158159936773
[50]	cv-test-auc:0.9119432+0.00452251193033	cv-train-auc:0.9208756+0.00167270972975
[60]	cv-test-auc:0.9157662+0.00447105604528	cv-train-auc:0.925292+0.00126806719065
[70]	cv-test-auc:0.918672+0.00387675493164	cv-train-auc:0.928693+0.00129173557666
[80]	cv-test-auc:0.9225278+0.00316208636188	cv-train-auc:0.933196+0.000982082073963
[90]	cv-test-auc:0.9264412+0.00279111485969	cv-train-auc:0.9375554+0.000863600509495
[100]	cv-test-auc:0.9291182+0.00296929024516	cv-train-auc:0.9406486+0.000380363300017
[110]	cv-test-auc:0.93

[960]	cv-test-auc:0.9579118+0.00144739012018	cv-train-auc:0.9921462+0.000137421104638
[970]	cv-test-auc:0.9579522+0.00143768221802	cv-train-auc:0.992319+0.000127795148578
[980]	cv-test-auc:0.957967+0.00141948765405	cv-train-auc:0.9924556+0.000107373367275
[990]	cv-test-auc:0.9580234+0.00142161437809	cv-train-auc:0.9926152+0.000108567766856
[1000]	cv-test-auc:0.958048+0.00142028292956	cv-train-auc:0.9927556+0.000107964068097
[1010]	cv-test-auc:0.9580484+0.00141261411574	cv-train-auc:0.9928972+0.000110340201196
[1020]	cv-test-auc:0.9580764+0.00141485357546	cv-train-auc:0.9930348+0.000115829875248
[1030]	cv-test-auc:0.9581132+0.00141964381448	cv-train-auc:0.9931754+0.000118761273149
[1040]	cv-test-auc:0.9581652+0.00142925958454	cv-train-auc:0.9933166+0.000114967125736
[1050]	cv-test-auc:0.9581762+0.00141974144125	cv-train-auc:0.9934504+0.000117066818527
[1060]	cv-test-auc:0.9581888+0.00139504485949	cv-train-auc:0.993564+0.000115292671059
[1070]	cv-test-auc:0.958202+0.00139461722347	cv-tra

Time used: 756.944275614 s


Stopping. Best iteration:
[1621] cv-mean:0.9585918	cv-std:0.00139046559109


In [None]:
import winsound
winsound.Beep(600,1000)

In [16]:
model, pred = xgb_predict(train_feature, train_label, test_feature, iterations, params)

[0]	train-auc:0.850628
[30]	train-auc:0.911609
[60]	train-auc:0.924338
[90]	train-auc:0.936461
[120]	train-auc:0.943546
[150]	train-auc:0.950959
[180]	train-auc:0.956133
[210]	train-auc:0.960424
[240]	train-auc:0.963603
[270]	train-auc:0.966315
[300]	train-auc:0.968712
[330]	train-auc:0.970702
[360]	train-auc:0.972358
[390]	train-auc:0.974124
[420]	train-auc:0.975660
[450]	train-auc:0.977040
[480]	train-auc:0.978161
[510]	train-auc:0.979195
[540]	train-auc:0.980336
[570]	train-auc:0.981252
[600]	train-auc:0.982212
[630]	train-auc:0.983141
[660]	train-auc:0.983897
[690]	train-auc:0.984555
[720]	train-auc:0.985341
[750]	train-auc:0.985956
[780]	train-auc:0.986669
[810]	train-auc:0.987242
[840]	train-auc:0.987848
[870]	train-auc:0.988446
[900]	train-auc:0.988910
[930]	train-auc:0.989474
[960]	train-auc:0.989948
[990]	train-auc:0.990417
[1020]	train-auc:0.990861
[1050]	train-auc:0.991302
[1080]	train-auc:0.991672
[1110]	train-auc:0.992072
[1140]	train-auc:0.992454
[1170]	train-auc:0.992821

In [17]:
importance = pd.DataFrame(model.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)
importance.to_csv('../data/output/feat_imp/importance-20180114-%f(r%d).csv' % (best_score, iterations), index = False)

In [18]:
res = store_result(test_index, pred, '20180114-xgb-%f(r%d)' % (best_score, iterations))