In [2]:
import pandas as pd
import math
import numpy as np
import copy
import warnings
import gc
from sklearn import preprocessing
warnings.filterwarnings('ignore')
feature_path ='./feature/'
res_path = './res/'
data_path = './data/'

In [3]:
transaction_df = pd.read_csv(data_path+'transaction_train_new.csv')
operation_df =  pd.read_csv(data_path+'operation_train_new.csv')
label= pd.read_csv(data_path+'tag_train_new.csv')

transaction_test = pd.read_csv(data_path+'transaction_round1_new.csv')
operation_test = pd.read_csv(data_path+'operation_round1_new.csv')
sample = pd.read_csv(data_path+'sample.csv')

In [4]:
def merge_count(df1,df2,columns,value,cname):
    add = df1.groupby(columns)[value].count().reset_index().rename(columns = {value:cname})
    df2=df2.merge(add,on=columns,how="left")
    del add
    gc.collect()
    return df2

def merge_nunique(df1,df2,columns,value,cname):
    add = df1.groupby(columns)[value].nunique().reset_index().rename(columns = {value:cname})
    df2=df2.merge(add,on=columns,how="left")
    del add
    gc.collect()
    return df2

def merge_value_count(df1,df2,col,value):
    tmp = df1.groupby(col)[value].count().reset_index().rename(columns = {value:'cnt'})
    df = tmp.pivot(index=col[0],columns=col[1],values='cnt').reset_index()
    cname = [col[0]]
    for index in range(1,len(df.columns)):
        cname.append(str(col[1])+'_'+str(df.columns[index]))
    df.columns=cname
    df = df.fillna(0)
    df2 = df2.merge(df,on=str(col[0]),how='left')
    del df,df1
    gc.collect()
    return df2

In [5]:
def get_op_fea(operation_df):
    #op_day
    op_fea = operation_df[['UID']].drop_duplicates()
    tmp = operation_df.groupby('UID')['day'].agg([max,min,np.mean]).reset_index()
    tmp.columns=['UID','op_day_max','op_day_min','op_day_mean']
    op_fea = pd.merge(op_fea,tmp,on='UID',how='left')
    #op_mode count
    op_fea = merge_count(operation_df,op_fea,'UID','mode','op_cnt')
    op_fea = merge_nunique(operation_df,op_fea,'UID','mode','op_mode_nunique')
    #success count
    op_fea = merge_count(operation_df[operation_df.success==0],op_fea,'UID','mode','op_fail_cnt')
    op_fea = merge_count(operation_df[operation_df.success==1],op_fea,'UID','mode','op_success_cnt')
    op_fea['op_fail_cnt'] = op_fea['op_fail_cnt'].fillna(0)
    op_fea['op_success_cnt'] = op_fea['op_success_cnt'].fillna(0)
    #op_time
    operation_df['op_hour'] = operation_df['time'].apply(lambda x:int(x.split(':')[0]))
    tmp = operation_df.groupby('UID')['op_hour'].agg([max,min,np.mean]).reset_index()
    tmp.columns=['UID','op_hour_max','op_hour_min','op_hour_mean']
    op_fea = pd.merge(op_fea,tmp,on='UID',how='left')
    #op_os
    for col in ['os','version','device1','device2','device_code1','device_code2','mac1','ip1','ip2','device_code3','mac2','wifi','geo_code','ip1_sub','ip2_sub']:
        op_fea = merge_nunique(operation_df,op_fea,'UID',col,'op_'+col+'_nunique')
    return op_fea

def get_trans_fea(transaction_df):
    trans_fea = transaction_df[['UID']].drop_duplicates()
    #trans_channel
    trans_fea = merge_value_count(transaction_df,trans_fea,['UID','channel'],'day')
    trans_fea = merge_count(transaction_df,trans_fea,'UID','channel','trans_cnt')
    trans_fea = merge_nunique(transaction_df,trans_fea,'UID','channel','trans_channel_nunique')

    for col in ['trans_type2','market_type']:
        trans_fea = merge_value_count(transaction_df,trans_fea,['UID',col],'day')
        trans_fea = merge_nunique(transaction_df,trans_fea,'UID',col,'trans_'+col+'_nunique')
    for col in ['trans_type1','merchant','code1','code2','acc_id1','device_code1','device_code2','device_code3','device1','device2','mac1','ip1','acc_id2','acc_id3','geo_code','market_code','ip1_sub']:
        trans_fea = merge_nunique(transaction_df,trans_fea,'UID',col,'trans_'+col+'_nunique')
    return trans_fea


In [6]:
import xgboost as xgb
#train xgb
config = {
    'rounds': 10000,
    'folds': 5
}

params = {
    'booster':'gbtree',
    'objective':'binary:logistic',
    'stratified':True,
    'max_depth':5,
    'min_child_weight':1,
    'gamma':3,
    'subsample':0.8,#0.7
    'colsample_bytree':0.6, 
    'lambda':3, 
    'eta':0.05,
    'seed':20,
    'silent':1,
    'eval_metric':'auc'
}

def customedscore1(preds, dtrain):
    label = dtrain.get_label()
    d = pd.DataFrame()
    d['prob'] = list(preds)
    d['y'] = list(label)
    d = d.sort_values(['prob'], ascending=[0])
    y = d.y
    PosAll = pd.Series(y).value_counts()[1]
    NegAll = pd.Series(y).value_counts()[0]
    
    pCumsum = d['y'].cumsum()
    nCumsum = np.arange(len(y)) - pCumsum + 1
    pCumsumPer = pCumsum / PosAll
    nCumsumPer = nCumsum / NegAll
    
    TR1 = pCumsumPer[abs(nCumsumPer-0.001).idxmin()]
    TR2 = pCumsumPer[abs(nCumsumPer-0.005).idxmin()]
    TR3 = pCumsumPer[abs(nCumsumPer-0.01).idxmin()]
    score = 0.4 * TR1 + 0.3 * TR2 + 0.3 * TR3
    return 'SCORE',float(score)


def xgbCV(trainFeature, trainLabel, params, rounds):

    dtrain = xgb.DMatrix(trainFeature, label = trainLabel)
    params['scale_pos_weights '] = (float)(len(trainLabel[trainLabel == 0]))/(float)(len(trainLabel[trainLabel == 1]))
    num_round =rounds
    print ('run cv: ' + 'round: ' + str(rounds))
    res = xgb.cv(params, dtrain, num_round, verbose_eval = 100,early_stopping_rounds=200,nfold=3,feval = customedscore1)
    return res

def xgbPredict(trainFeature,trainLabel,testFeature,rounds,params):
    params['scale_pos_weights '] = (float)(len(trainLabel[trainLabel == 0]))/len(trainLabel[trainLabel == 1])
    
    dtrain = xgb.DMatrix(trainFeature.values, label = trainLabel)
    dtest = xgb.DMatrix(testFeature.values)

    watchlist  = [(dtrain,'train')]
    num_round = rounds
    
    model = xgb.train(params, dtrain, num_round, watchlist, verbose_eval = 50,feval = customedscore1)
    predict = model.predict(dtest)
    return model,predict

In [7]:
op_fea = get_op_fea(operation_df)
trans_fea = get_trans_fea(transaction_df)

op_fea_test = get_op_fea(operation_test)
trans_fea_test = get_trans_fea(transaction_test)

In [9]:
all_fea = trans_fea.merge(op_fea,on='UID',how='outer')
trainData = all_fea.merge(label,on='UID',how='left')
trainFeature = trainData.drop(['Tag','UID'],axis=1)
trainLabel = trainData.Tag

In [10]:
cv_res = xgbCV(trainFeature,trainLabel,params,10000)
cv_res['test-SCORE-mean'][-1:]

run cv: round: 10000
[0]	train-SCORE:0.401762+0.0729671	train-auc:0.938601+0.00894842	test-SCORE:0.364175+0.0470441	test-auc:0.933516+0.00897566
[100]	train-SCORE:0.869608+0.0151333	train-auc:0.995377+0.000640134	test-SCORE:0.752132+0.0125843	test-auc:0.991154+0.00117751
[200]	train-SCORE:0.928019+0.00902273	train-auc:0.998277+0.000143885	test-SCORE:0.787762+0.0279574	test-auc:0.993611+0.00139236
[300]	train-SCORE:0.946256+0.00403146	train-auc:0.998892+4.80578e-05	test-SCORE:0.801134+0.0304579	test-auc:0.993818+0.00152266
[400]	train-SCORE:0.952818+0.0034428	train-auc:0.999106+4.64351e-05	test-SCORE:0.808311+0.0248582	test-auc:0.993856+0.00151623
[500]	train-SCORE:0.957715+0.00388583	train-auc:0.999244+4.20264e-05	test-SCORE:0.802253+0.0211957	test-auc:0.9939+0.00160269
[600]	train-SCORE:0.960326+0.00353013	train-auc:0.999321+4.24604e-05	test-SCORE:0.803408+0.0233752	test-auc:0.993972+0.00157673
[700]	train-SCORE:0.963036+0.00396377	train-auc:0.99937+3.96344e-05	test-SCORE:0.801981+0.0

649    0.801264
Name: test-SCORE-mean, dtype: float64

In [11]:
all_fea_test = trans_fea_test.merge(op_fea_test,on='UID',how='outer')
testFeature = sample[['UID']].merge(all_fea_test,on='UID',how='left')
sub_id = testFeature['UID']
testFeature =testFeature.drop('UID',axis=1)

In [12]:
model,predict = xgbPredict(trainFeature,trainLabel,testFeature,700,params)

[0]	train-auc:0.929966	train-SCORE:0.492539
[50]	train-auc:0.988323	train-SCORE:0.77873
[100]	train-auc:0.995187	train-SCORE:0.862369
[150]	train-auc:0.997445	train-SCORE:0.912762
[200]	train-auc:0.998315	train-SCORE:0.930694
[250]	train-auc:0.998697	train-SCORE:0.945092
[300]	train-auc:0.998933	train-SCORE:0.953469
[350]	train-auc:0.999085	train-SCORE:0.958966
[400]	train-auc:0.999177	train-SCORE:0.961191
[450]	train-auc:0.99926	train-SCORE:0.964202
[500]	train-auc:0.999321	train-SCORE:0.965969
[550]	train-auc:0.999362	train-SCORE:0.967081
[600]	train-auc:0.999392	train-SCORE:0.968521
[650]	train-auc:0.999416	train-SCORE:0.969503
[699]	train-auc:0.999435	train-SCORE:0.969961


In [13]:
sub = pd.DataFrame()
sub['UID'] = sub_id
sub['Tag'] = predict
sub.head()

Unnamed: 0,UID,Tag
0,30000,0.035195
1,30001,0.030558
2,30002,0.00058
3,30003,0.024343
4,30004,0.000124


In [14]:
sub.to_csv(res_path+'baseline.csv',index=0)