In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [9]:
train.head()

Unnamed: 0,uid,label,voice_opp_num_unique_count,voice_opp_num_count,voice_opp_head_unique_count,voice_opp_len_3,voice_opp_len_5,voice_opp_len_6,voice_opp_len_7,voice_opp_len_8,...,visit_dura_x_y.10,visit_dura_y_y.10,visit_dura_x_y.11,visit_dura_y_y.11,visit_dura_x_y.12,visit_dura_y_y.12,visit_dura_x_y.13,visit_dura_y_y.13,visit_dura_x_y.14,visit_dura_y_y.14
0,u0001,0,22.0,79.0,17.0,0.0,3.0,0.0,0.0,0.0,...,47692.0,13190.0,17314.0,14337.0,34536.0,17084.0,4806.0,9402.0,3378084.0,8907.0
1,u0002,0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,u0003,0,15.0,21.0,10.0,0.0,1.0,0.0,0.0,2.0,...,25104.0,17164.0,18219.0,38717.0,1039108.0,440294.0,175357.0,2184.0,452927.0,2281.0
3,u0004,0,77.0,254.0,31.0,0.0,1.0,0.0,0.0,12.0,...,145443.0,22237.0,10694.0,513625.0,110245.0,440247.0,607104.0,15360.0,1971461.0,5034.0
4,u0005,0,55.0,401.0,28.0,0.0,4.0,0.0,0.0,0.0,...,2343340.0,0.0,0.0,4844.0,82539.0,9950.0,0.0,1958.0,174052.0,160872.0


In [10]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [11]:
lgb_params =  {
    'seed':20,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_training_metric': False,
    'learning_rate': 0.02,
    'verbosity':-1,
    'colsample_bytree':0.7,
}    

In [12]:
def evalMetric(preds,dtrain):
    label = dtrain.get_label()
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.38 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    res = 0.6*auc +0.4*f1
    return 'res',res,True

### 本地CV

In [13]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=2,num_boost_round=500,nfold=3,metrics=['evalMetric'])

[2]	cv_agg's res: 0.663585 + 0.0103579
[4]	cv_agg's res: 0.680356 + 0.00774793
[6]	cv_agg's res: 0.680689 + 0.00846072
[8]	cv_agg's res: 0.684709 + 0.00552496
[10]	cv_agg's res: 0.685678 + 0.00554762
[12]	cv_agg's res: 0.68611 + 0.00509372
[14]	cv_agg's res: 0.686972 + 0.00521933
[16]	cv_agg's res: 0.797144 + 0.0104404
[18]	cv_agg's res: 0.82359 + 0.0101907
[20]	cv_agg's res: 0.836238 + 0.0100632
[22]	cv_agg's res: 0.848341 + 0.00977314
[24]	cv_agg's res: 0.853307 + 0.00971357
[26]	cv_agg's res: 0.855552 + 0.0110753
[28]	cv_agg's res: 0.857632 + 0.0127426
[30]	cv_agg's res: 0.859293 + 0.0134749
[32]	cv_agg's res: 0.860283 + 0.0124292
[34]	cv_agg's res: 0.860734 + 0.0120151
[36]	cv_agg's res: 0.862182 + 0.0119389
[38]	cv_agg's res: 0.863586 + 0.0115863
[40]	cv_agg's res: 0.86446 + 0.0108569
[42]	cv_agg's res: 0.863546 + 0.00953623
[44]	cv_agg's res: 0.864273 + 0.00953257
[46]	cv_agg's res: 0.864322 + 0.0103776
[48]	cv_agg's res: 0.86408 + 0.0100104
[50]	cv_agg's res: 0.863725 + 0.010403

[406]	cv_agg's res: 0.876969 + 0.0118634
[408]	cv_agg's res: 0.876845 + 0.0125006
[410]	cv_agg's res: 0.877345 + 0.0125152
[412]	cv_agg's res: 0.877362 + 0.0134325
[414]	cv_agg's res: 0.876786 + 0.0123772
[416]	cv_agg's res: 0.87708 + 0.0122188
[418]	cv_agg's res: 0.877089 + 0.011548
[420]	cv_agg's res: 0.87771 + 0.0117793
[422]	cv_agg's res: 0.87785 + 0.0122647
[424]	cv_agg's res: 0.877205 + 0.0119006
[426]	cv_agg's res: 0.876891 + 0.0115657
[428]	cv_agg's res: 0.876614 + 0.0120708
[430]	cv_agg's res: 0.877163 + 0.0122369
[432]	cv_agg's res: 0.876855 + 0.0118801
[434]	cv_agg's res: 0.876062 + 0.0118874
[436]	cv_agg's res: 0.876504 + 0.0116362
[438]	cv_agg's res: 0.876354 + 0.0113301
[440]	cv_agg's res: 0.876529 + 0.0112141
[442]	cv_agg's res: 0.877247 + 0.0115977
[444]	cv_agg's res: 0.877311 + 0.012475
[446]	cv_agg's res: 0.87717 + 0.0120634
[448]	cv_agg's res: 0.877282 + 0.0121278
[450]	cv_agg's res: 0.877335 + 0.0124513
[452]	cv_agg's res: 0.877259 + 0.0125196
[454]	cv_agg's res: 0.

{'res-mean': [0.6602158275389156,
  0.6635846000377135,
  0.6705805311090552,
  0.680356136316227,
  0.6790886395124264,
  0.6806891388728294,
  0.6814282743834698,
  0.6847092689485922,
  0.6848818555012418,
  0.6856780650819497,
  0.6862912216059271,
  0.6861099913021375,
  0.6862710042393813,
  0.6869722207934013,
  0.772529043333852,
  0.7971438981468011,
  0.8139341050639785,
  0.8235903470657555,
  0.8334209407037124,
  0.8362381337428157,
  0.8437285465913114,
  0.8483409378902902,
  0.8507580987174842,
  0.8533074362946186,
  0.8535866494666394,
  0.8555517262350273,
  0.8569504520760711,
  0.8576316977895063,
  0.8588446851761026,
  0.8592930406004001,
  0.8596526268180865,
  0.8602828615930419,
  0.8605943642373047,
  0.8607339143027529,
  0.8630228591715193,
  0.8621816764163576,
  0.8631247493187203,
  0.8635864244940672,
  0.8641155928254766,
  0.8644601350794937,
  0.8651992996678644,
  0.8635455963586621,
  0.8639563453424591,
  0.8642727392788311,
  0.8633632403993996,


## 训练

In [16]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=100,num_boost_round=422,valid_sets=[dtrain])

[100]	training's res: 0.94436
[200]	training's res: 0.97928
[300]	training's res: 0.996895
[400]	training's res: 0.999778


### 预测

In [17]:
pred=model.predict(test.drop(['uid'],axis=1))

In [18]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [19]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.38 else 0)
#res.label = res.label.map(lambda x: int(x))

In [20]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])