In [None]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the five training tables and the five matching test tables from CSV.
action_tr = pd.read_csv('../data/input/train/action_train.csv')  # user action (behaviour) log
order_future_tr = pd.read_csv('../data/input/train/orderFuture_train.csv')  # prediction targets
order_history_tr = pd.read_csv('../data/input/train/orderHistory_train.csv')  # users' historical orders
user_comment_tr = pd.read_csv('../data/input/train/userComment_train.csv')  # user comments
user_profile_tr = pd.read_csv('../data/input/train/userProfile_train.csv')  # user profile information

action_te = pd.read_csv('../data/input/test/action_test.csv')
order_future_te = pd.read_csv('../data/input/test/orderFuture_test.csv')
order_history_te = pd.read_csv('../data/input/test/orderHistory_test.csv')
user_comment_te = pd.read_csv('../data/input/test/userComment_test.csv')
user_profile_te = pd.read_csv('../data/input/test/userProfile_test.csv')

In [None]:
def translate_date(date):
    """Convert a date string into a month counter relative to 2010.

    The first four characters of ``date`` are read as the year and the last
    two characters as the month; the result is
    ``(year - 2010) * 12 + month``.
    """
    year_part, month_part = date[:4], date[-2:]
    months_from_years = 12 * (int(year_part) - 2010)
    return months_from_years + int(month_part)

In [None]:
def cal_time_gap(t):
    """Return how far ``t`` lies before the reference instant 2018-01-01 00:00.

    The reference is converted to a timestamp with ``time.mktime`` (i.e. it is
    interpreted in the machine's local time zone) and ``t`` is subtracted from
    it, so ``t`` may be a scalar or anything supporting ``number - t``.
    """
    reference_day = datetime.datetime.strptime('2018-01-01', "%Y-%m-%d")
    reference_ts = time.mktime(reference_day.timetuple())
    return reference_ts - t

In [None]:
def get_interaction_feature(df, feature_A, feature_B):
    """Encode every (feature_A, feature_B) value pair as a single integer code.

    Codes are assigned by walking the sorted unique values of ``feature_A``
    and, for each, the sorted unique values of ``feature_B``, numbering the
    pairs 0, 1, 2, ... in that order. Values are truncated with ``int`` before
    being used as lookup keys. Returns the per-row codes produced by
    ``df.apply(..., axis=1)``.
    """
    a_values = sorted(df[feature_A].unique())
    b_values = sorted(df[feature_B].unique())

    # Flat (a, b) -> code table; later pairs overwrite earlier ones that
    # truncate to the same integer key.
    pair_to_code = {}
    for code, pair in enumerate([(int(a), int(b)) for a in a_values for b in b_values]):
        pair_to_code[pair] = code

    def _encode(row):
        return pair_to_code[(int(row[feature_A]), int(row[feature_B]))]

    return df.apply(_encode, axis=1)

In [None]:
def print_label_encoder_mapping(le):
    """Print one ``class: code`` line for every class known to a fitted encoder.

    ``le`` must expose ``classes_`` and ``transform`` (as a fitted
    ``LabelEncoder`` does).
    """
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    for k, v in le_name_mapping.items():
        # Single-argument call form: prints the same text on Python 2 and
        # Python 3, whereas the print statement is a SyntaxError on Python 3.
        print(str(k) + ': ' + str(v))

In [None]:
def get_user_profile_feature(df):
    """Build per-user profile features: label-encoded ``province`` and ``age``.

    Returns a new frame with columns ``userid``, ``province`` and ``age``;
    the input frame is not modified. Encoding of ``gender`` is currently
    disabled.

    NOTE(review): the encoder is re-fitted on whatever frame is passed in, so
    the integer codes are only comparable between two frames (e.g. train and
    test) when both contain the same set of category values -- confirm.
    """
    df = df.copy()

    # Explicit copy: assigning new columns onto the bare `df[['userid']]`
    # slice is chained assignment (SettingWithCopyWarning).
    mydf = df[['userid']].copy()
    le = preprocessing.LabelEncoder()

    for column in ['province', 'age']:
        mydf[column] = le.fit_transform(df[column])

    return mydf

In [None]:
def _time_gap_seq_stat(df, last_row, action_type, stat):
    """Aggregate ``time_gap`` over each user's trailing window of actions.

    For every user that has at least one action of ``action_type``, the window
    runs from the row of that user's last such action up to the user's last
    row (both inclusive, by row position). ``stat`` is the name of the
    aggregation applied to ``time_gap`` inside the window ('std', 'mean',
    'min' or 'max'). ``last_row`` holds each user's last row position in
    columns ``userid`` / ``index``.

    Returns a frame with columns ``userid`` and
    ``time_gap_seq<action_type>_<stat>``.
    """
    lo = df[df['actionType'] == action_type].reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]
    lo_hi = pd.merge(lo, last_row, how='left', on='userid')
    lo_hi.columns = ['userid', 'lo_ind', 'hi_ind']
    index_list = []
    for _, row in lo_hi.iterrows():
        index_list += range(row['lo_ind'], row['hi_ind'] + 1)
    window = df.iloc[index_list, :]
    out = window.groupby('userid')['time_gap'].agg(stat).reset_index()
    out.columns = ['userid', 'time_gap_seq' + str(action_type) + '_' + stat]
    return out


def get_action_feature(df):
    """Build one row of behaviour features per user from the action log.

    ``df`` must provide the columns ``userid``, ``actionType`` and
    ``actionTime``. The input frame is not modified.

    Feature groups (all keyed by ``userid``):
      * action type of the last three actions and of the ``nth(1)`` action;
      * mean / std / min of the gap between each action and the reference
        instant used by ``cal_time_gap``, plus that gap for the last four
        actions and the ``nth(1)`` action;
      * mean / std of that gap over the last three actions;
      * descending time rank (``dist``) of the last action of types 9, 3, 5, 8;
      * gap to the reference instant of the last action of every type 1..9;
      * std / mean / min / max of the between-action ``time_gap`` inside the
        window from the last action of a given type to the user's last action;
      * share of each action type 1..9 among the user's actions.

    NOTE(review): the positional logic (row-to-row differences, ``iloc``
    windows) presumes the frame has a default 0..n-1 index and that each
    user's rows are contiguous and in time order -- confirm against the data.
    NOTE(review): the ``groupby(...).nth(...).reset_index()`` pattern relies on
    ``nth`` returning the group key as index, which is the pandas < 2.0
    behaviour -- confirm the pinned pandas version.
    """
    df = df.copy()

    # action_num is only a helper (denominator of the type rates); it is
    # dropped before returning and is not a feature itself.
    mydf = df.groupby('userid').size().reset_index()
    mydf.columns = ['userid', 'action_num']

    # Action type of each user's last, second-to-last and third-to-last action.
    act_type_last = mydf[['userid']]
    for i in [1, 2, 3]:
        tmp = df.groupby('userid').nth(-i)['actionType'].reset_index()
        tmp.columns = ['userid', 'act_type_last' + str(i)]
        act_type_last = pd.merge(act_type_last, tmp, how='left', on='userid')

    # NOTE(review): nth() is 0-based, so nth(1) selects each user's SECOND
    # action although the feature is called "first1" -- confirm the intent.
    act_type_first1 = df.groupby('userid').nth(1)['actionType'].reset_index()
    act_type_first1.columns = ['userid', 'act_type_first1']

    # Gap between every action and the fixed reference instant; computed once
    # and reused (the Series keeps the name 'actionTime').
    gap_to_ref = cal_time_gap(df['actionTime'])

    time_gap_stat = gap_to_ref.groupby(df['userid']).agg(['mean', 'std', 'min']).reset_index()
    time_gap_stat.columns = ['userid', 'action_time_gap_mean', 'action_time_gap_std', 'action_time_gap_min']

    time_gap_last = mydf[['userid']]
    for i in [1, 2, 3, 4]:
        tmp = gap_to_ref.groupby(df['userid']).nth(-i).reset_index()
        tmp.columns = ['userid', 'time_gap_last' + str(i)]
        time_gap_last = pd.merge(time_gap_last, tmp, how='left', on='userid')

    # Same nth(1) caveat as for act_type_first1 above.
    time_gap_first1 = gap_to_ref.groupby(df['userid']).nth(1).reset_index()
    time_gap_first1.columns = ['userid', 'time_gap_first1']

    # Mean / std of the reference gap over each user's last three actions.
    time_gap_last123 = gap_to_ref.groupby(df['userid']).nth([-1, -2, -3]).reset_index()
    time_gap_last123_stat = time_gap_last123.groupby('userid')['actionTime'].agg(['mean', 'std']).reset_index()
    time_gap_last123_stat.columns = ['userid', 'time_gap_last123_mean', 'time_gap_last123_std']

    # dist: rank of each action within its user by descending actionTime.
    df['dist'] = df.groupby('userid')['actionTime'].rank(ascending=False)
    dist = mydf[['userid']]
    for i in [9, 3, 5, 8]:
        tmp = df[df['actionType'] == i].groupby('userid')['dist'].nth(-1).reset_index()
        tmp.columns = ['userid', 'dist' + str(i)]
        dist = pd.merge(dist, tmp, how='left', on='userid')

    # Reference gap of the last action of each type.
    time_gap_all_type = mydf[['userid']]
    for i in range(1, 10):
        tmp = df[df['actionType'] == i].groupby('userid')['actionTime'].nth(-1).reset_index()
        tmp['actionTime'] = cal_time_gap(tmp['actionTime'])
        tmp.columns = ['userid', 'time_gap' + str(i)]
        time_gap_all_type = pd.merge(time_gap_all_type, tmp, how='left', on='userid')

    # Build the between-action interval series time_gap.
    sub = df.loc[:(len(df) - 2), 'actionTime']  # the subtrahend Series (all rows but the last)
    sub.index = sub.index + 1  # shift the index by one so row k lines up with row k-1
    df['time_gap'] = df.loc[1:, 'actionTime'] - sub  # interval to the previous row
    tmp = df.groupby('userid').nth(0).reset_index()  # each user's first action
    tmp['time_gap'] = 0  # mark the first action of every user with time_gap 0
    df = pd.merge(df, tmp, how='left', on=['userid', 'actionTime', 'actionType', 'dist']).fillna(1)  # after the merge, fill missing values with 1
    df['time_gap'] = df['time_gap_x'] * df['time_gap_y']  # multiply so every user's first action ends up with time_gap 0
    df = df.drop(['time_gap_x', 'time_gap_y'], axis=1)  # drop the helper columns
    df = df.replace(0, np.nan)  # 0 -> NaN (applies to every column of df)

    # Row position of every user's last action; identical for all windows
    # below, so it is computed once.
    last_row = df.reset_index().groupby('userid').nth(-1).reset_index()[['userid', 'index']]

    # (statistic, action types) pairs, in the original column order.
    seq_stat_plan = [
        ('std', range(5, 10)),
        ('mean', [2, 3, 5, 6, 7, 8, 9]),
        ('min', [2, 3, 4, 5, 6, 7, 8]),
        ('max', [5, 6, 7]),
    ]
    time_gap_seq_stat = mydf[['userid']]
    for stat, action_types in seq_stat_plan:
        for i in action_types:
            tmp = _time_gap_seq_stat(df, last_row, i, stat)
            time_gap_seq_stat = pd.merge(time_gap_seq_stat, tmp, how='left', on='userid')

    time_gap_seq_stat['time_gap_seq9_mean*std'] = time_gap_seq_stat['time_gap_seq9_mean'] * time_gap_seq_stat['time_gap_seq9_std']

    # Share of each action type among the user's actions.
    type_rate = mydf.copy()
    for i in range(1, 10):
        tmp = df[df['actionType'] == i].groupby('userid').size().reset_index()
        tmp.columns = ['userid', 'type' + str(i) + '_rate']
        type_rate = pd.merge(type_rate, tmp, how='left', on='userid')
        type_rate['type' + str(i) + '_rate'] = type_rate['type' + str(i) + '_rate'] / type_rate['action_num']
    type_rate = type_rate.drop('action_num', axis=1)

    # Assemble. The last-three statistics frame (one row per user) is merged
    # here; merging the raw time_gap_last123 frame instead would add up to
    # three rows per user and leave the computed statistics unused.
    feature_parts = [
        act_type_last,
        act_type_first1,
        time_gap_stat,
        time_gap_last,
        time_gap_first1,
        time_gap_last123_stat,
        dist,
        time_gap_all_type,
        time_gap_seq_stat,
        type_rate,
    ]
    for part in feature_parts:
        mydf = pd.merge(mydf, part, how='left', on='userid')

    mydf = mydf.drop('action_num', axis=1)

    return mydf

In [None]:
def get_user_comment_feature(df):
    """Placeholder for user-comment features; no features are implemented yet.

    Raises:
        NotImplementedError: always. The previous body returned an undefined
        name and therefore failed with a NameError on every call; failing
        explicitly makes the unfinished state obvious to callers.
    """
    raise NotImplementedError('user comment features are not implemented yet')

In [None]:
def get_order_history_feature(df):
    """Summarise the order history as one row per user.

    Returns a frame with columns ``userid`` and ``if_order``, where
    ``if_order`` is the maximum ``orderType`` found among the user's
    historical orders. The input frame is left untouched.
    """
    history = df.copy()
    max_order_type = history.groupby('userid')['orderType'].max()
    return max_order_type.rename('if_order').reset_index()

In [None]:
# Build the per-user feature tables for the training split.
user_profile_feat_tr = get_user_profile_feature(user_profile_tr)
action_feat_tr = get_action_feature(action_tr)
order_history_feat_tr = get_order_history_feature(order_history_tr)

# Left-join everything onto the profile table, keyed by userid.
dataset_tr = pd.merge(user_profile_feat_tr, action_feat_tr, on='userid', how='left')
dataset_tr = pd.merge(dataset_tr, order_history_feat_tr, on='userid', how='left')

# Attach the features to the users that have a label.
trainset = pd.merge(order_future_tr, dataset_tr, on='userid', how='left')

# Separate the model inputs from the target column.
train_feature = trainset.drop(['userid', 'orderType'], axis=1)
train_label = trainset.orderType.values

# Single-argument print: same output on Python 2 and valid on Python 3.
print('%s %s' % (train_feature.shape, train_label.shape))

In [None]:
# Build the per-user feature tables for the test split.
user_profile_feat_te = get_user_profile_feature(user_profile_te)
action_feat_te = get_action_feature(action_te)
order_history_feat_te = get_order_history_feature(order_history_te)

# Left-join everything onto the profile table, keyed by userid.
dataset_te = pd.merge(user_profile_feat_te, action_feat_te, on='userid', how='left')
dataset_te = pd.merge(dataset_te, order_history_feat_te, on='userid', how='left')

# Attach the features to the users that need a prediction.
testset = pd.merge(order_future_te, dataset_te, on='userid', how='left')

# Model inputs, plus the user ids kept aside for the submission file.
test_feature = testset.drop(['userid'], axis=1)
test_index = testset.userid.values

# Parenthesised single-argument print: valid on both Python 2 and Python 3.
print(test_feature.shape)

In [None]:
# Cross-validation settings: upper bound on boosting rounds and number of folds.
config = {
    'rounds': 10000,
    'folds': 5
}

# XGBoost training parameters.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    # Not an XGBoost booster parameter: fold stratification is the
    # `stratified` argument of xgb.cv. Kept so existing readers of this key
    # still find it.
    'stratified': True,
    # XGBoost's key is `scale_pos_weight`; 1 is its neutral default. The
    # previous key 'scale_pos_weights ' (extra "s" and trailing space) is not
    # a parameter XGBoost recognises.
    'scale_pos_weight': 1,
    'max_depth': 6,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'lambda': 1,

    'eta': 0.02,
    'seed': 20,
    'silent': 1,
    'eval_metric': 'auc'
}

In [None]:
def xgb_cv(train_feature, train_label, params, folds, rounds):
    """Run XGBoost cross-validation with early stopping.

    Args:
        train_feature: feature frame (its ``columns`` are printed).
        train_label: array of 0/1 labels.
        params: XGBoost parameter dict; it is copied, not modified. A
            ``'stratified'`` entry, if present, is passed to ``xgb.cv`` as its
            ``stratified`` argument rather than forwarded as a booster
            parameter.
        folds: number of CV folds.
        rounds: maximum number of boosting rounds.

    Returns:
        ``(n_rounds, auc)``: the number of rows in the CV result and the
        ``test-auc-mean`` of its last row.

    Raises:
        ZeroDivisionError: if ``train_label`` contains no positive (1) label.
    """
    # time.time() exists on Python 2 and 3; time.clock() was removed in 3.8.
    start = time.time()
    print(train_feature.columns)

    cv_params = dict(params)  # work on a copy so the caller's dict is untouched
    cv_params.pop('scale_pos_weights ', None)  # misspelled key XGBoost does not recognise
    stratified = bool(cv_params.pop('stratified', False))
    # Class-imbalance weight under the key XGBoost actually reads.
    n_negative = len(train_label[train_label == 0])
    n_positive = len(train_label[train_label == 1])
    cv_params['scale_pos_weight'] = float(n_negative) / n_positive

    dtrain = xgb.DMatrix(train_feature, label=train_label)
    num_round = rounds
    print('run cv: ' + 'round: ' + str(rounds))
    res = xgb.cv(cv_params, dtrain, num_round, nfold=folds, stratified=stratified,
                 verbose_eval=10, early_stopping_rounds=100)
    elapsed = (time.time() - start)
    print('Time used: %s s' % elapsed)
    return len(res), res.loc[len(res) - 1, 'test-auc-mean']


def xgb_predict(train_feature, train_label, test_feature, rounds, params):
    """Train an XGBoost model on the full training set and score the test set.

    Args:
        train_feature: training feature frame.
        train_label: array of 0/1 labels.
        test_feature: test feature frame.
        rounds: number of boosting rounds.
        params: XGBoost parameter dict; it is copied, not modified.

    Returns:
        ``(model, predict)``: the trained booster and its predictions for
        ``test_feature``.

    Raises:
        ZeroDivisionError: if ``train_label`` contains no positive (1) label.
    """
    train_params = dict(params)  # work on a copy so the caller's dict is untouched
    train_params.pop('scale_pos_weights ', None)  # misspelled key XGBoost does not recognise
    train_params.pop('stratified', None)  # cross-validation option, not a booster parameter
    # Class-imbalance weight under the key XGBoost actually reads.
    n_negative = len(train_label[train_label == 0])
    n_positive = len(train_label[train_label == 1])
    train_params['scale_pos_weight'] = float(n_negative) / n_positive

    dtrain = xgb.DMatrix(train_feature, label=train_label)
    dtest = xgb.DMatrix(test_feature, label=np.zeros(test_feature.shape[0]))
    watchlist = [(dtrain, 'train')]
    num_round = rounds
    model = xgb.train(train_params, dtrain, num_round, watchlist, verbose_eval=30)
    predict = model.predict(dtest)
    return model, predict


def store_result(test_index, pred, name):
    """Write the predictions as a submission CSV and return them as a frame.

    The file goes to ``../data/output/sub/<name>.csv`` with the columns
    ``userid`` and ``orderType`` (in that order) and without the row index.
    """
    submission = pd.DataFrame({'userid': test_index, 'orderType': pred})
    target_path = '../data/output/sub/' + name + '.csv'
    submission.to_csv(target_path, index=False, columns=['userid', 'orderType'])
    return submission

In [None]:
# Cross-validate: xgb_cv returns the number of rounds in the CV result and the last mean test AUC.
iterations, best_score = xgb_cv(train_feature, train_label, params, config['folds'], config['rounds'])

In [None]:
# Audible "finished" signal. winsound only exists on Windows, so the import is
# guarded to keep Restart & Run All working on other platforms.
try:
    import winsound
except ImportError:
    winsound = None

if winsound is not None:
    winsound.Beep(600, 1000)

In [None]:
# Manual override: replaces the round count returned by xgb_cv with a fixed value.
iterations = 1200

In [None]:
# Train on the full training set for `iterations` rounds and predict the test set.
model, pred = xgb_predict(train_feature, train_label, test_feature, iterations, params)

In [None]:
# Feature importance (get_fscore), highest first. list(...) materialises the
# items so the DataFrame constructor receives a plain list of pairs on
# Python 3 as well, where dict.items() is a view.
importance = pd.DataFrame(list(model.get_fscore().items()), columns=['feature', 'importance']).sort_values('importance', ascending=False)
importance.to_csv('../data/output/feat_imp/importance-20180112-%f(r%d).csv' % (best_score, iterations), index=False)

In [None]:
# Write the submission file; the CV score and the round count are embedded in its name.
res = store_result(test_index, pred, '20180112-xgb-%f(r%d)' % (best_score, iterations))