In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [3]:
train.head()

Unnamed: 0,uid,label,voice_opp_num_unique_count,voice_opp_num_count,voice_opp_head_unique_count,voice_opp_len_3,voice_opp_len_5,voice_opp_len_6,voice_opp_len_7,voice_opp_len_8,...,wa_everyday_visit_dura40,wa_everyday_visit_dura41,wa_everyday_visit_dura42,wa_everyday_visit_dura43,wa_everyday_visit_dura44,wa_everyday_visit_dura45,wa_most_up_name,wa_most_down_name,wa_most_cnt_name_x,wa_most_cnt_name_y
0,u0001,0,22.0,79.0,17.0,0.0,3.0,0.0,0.0,0.0,...,11866283.0,16167257.0,11671006.0,5768643.0,7333922.0,5916483.0,2954,8486,1147,2954
1,u0002,0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2953,9987,9987,9987
2,u0003,0,15.0,21.0,10.0,0.0,1.0,0.0,0.0,2.0,...,1269605.0,2005389.0,2048605.0,516007.0,1272488.0,8182261.0,8646,8646,1147,13224
3,u0004,0,77.0,254.0,31.0,0.0,1.0,0.0,0.0,12.0,...,3691438.0,2458712.0,2131038.0,1194165.0,1978577.0,1598939.0,8486,2953,8486,8486
4,u0005,0,55.0,401.0,28.0,0.0,4.0,0.0,0.0,0.0,...,765327.0,186448.0,75072.0,367893.0,1096541.0,46948.0,8486,3484,8486,8486


In [4]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [5]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
   # 'metric_freq': 100,
    'is_training_metric': False,
   # 'min_data_in_leaf': 10,
   # 'num_leaves': 110,
    'learning_rate': 0.04,
   # 'feature_fraction': 0.8,
   # 'bagging_fraction': 0.8,
    'verbosity':-1,
    #'max_depth':10,
    'colsample_bytree':0.7,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [6]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.38 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True
    

    

### 本地CV

In [7]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=2,num_boost_round=300,nfold=3,metrics=['evalMetric'])

[2]	cv_agg's res: 0.627864 + 0.00635839
[4]	cv_agg's res: 0.647349 + 0.00683341
[6]	cv_agg's res: 0.655395 + 0.00575598
[8]	cv_agg's res: 0.72857 + 0.00638308
[10]	cv_agg's res: 0.764771 + 0.0101047
[12]	cv_agg's res: 0.788543 + 0.00976711
[14]	cv_agg's res: 0.792541 + 0.0114646
[16]	cv_agg's res: 0.795711 + 0.0131166
[18]	cv_agg's res: 0.798488 + 0.0116641
[20]	cv_agg's res: 0.80005 + 0.0117604
[22]	cv_agg's res: 0.80323 + 0.0112637
[24]	cv_agg's res: 0.803143 + 0.0118278
[26]	cv_agg's res: 0.802665 + 0.0122246
[28]	cv_agg's res: 0.804579 + 0.00873222
[30]	cv_agg's res: 0.800205 + 0.00889997
[32]	cv_agg's res: 0.801513 + 0.00908604
[34]	cv_agg's res: 0.799301 + 0.00784993
[36]	cv_agg's res: 0.799155 + 0.0082346
[38]	cv_agg's res: 0.79969 + 0.00667894
[40]	cv_agg's res: 0.79759 + 0.00610608
[42]	cv_agg's res: 0.79859 + 0.00874608
[44]	cv_agg's res: 0.800086 + 0.0069073
[46]	cv_agg's res: 0.799215 + 0.00706033
[48]	cv_agg's res: 0.800989 + 0.00624849
[50]	cv_agg's res: 0.800865 + 0.0075

{'res-mean': [0.6003994737802532,
  0.6278643255840662,
  0.6312790581975808,
  0.6473492346320683,
  0.6565706829593164,
  0.6553947640051492,
  0.6817048727467805,
  0.7285704779811516,
  0.7509907015682705,
  0.7647710299462179,
  0.7764891389781008,
  0.7885432294696454,
  0.7905375141490806,
  0.7925413258651673,
  0.7945454524011005,
  0.7957109689215299,
  0.7989208612620168,
  0.7984880595241814,
  0.7991338220559318,
  0.8000498812558989,
  0.8004598581926011,
  0.8032297887670362,
  0.8027657991208527,
  0.8031429809821781,
  0.8031892967244549,
  0.8026652592583035,
  0.8017836962927033,
  0.8045793960591517,
  0.8042352635062563,
  0.8002051694793552,
  0.8018000344772959,
  0.8015134132095412,
  0.8019066483674736,
  0.7993007473811118,
  0.8001323508410637,
  0.7991551445184154,
  0.7990381226120543,
  0.799690399534971,
  0.7998340801735347,
  0.7975900742720808,
  0.7975343321468586,
  0.7985903440043539,
  0.7985659182866556,
  0.8000862742913201,
  0.7988911663782146,

## 训练

In [8]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=160,valid_sets=[dtrain])

[5]	training's res: 0.691704
[10]	training's res: 0.818274
[15]	training's res: 0.867842
[20]	training's res: 0.883755
[25]	training's res: 0.897508
[30]	training's res: 0.902822
[35]	training's res: 0.908178
[40]	training's res: 0.913437
[45]	training's res: 0.91808
[50]	training's res: 0.925106
[55]	training's res: 0.929457
[60]	training's res: 0.93508
[65]	training's res: 0.939591
[70]	training's res: 0.945587
[75]	training's res: 0.948055
[80]	training's res: 0.950928
[85]	training's res: 0.954627
[90]	training's res: 0.959378
[95]	training's res: 0.962898
[100]	training's res: 0.966548
[105]	training's res: 0.967999
[110]	training's res: 0.971207
[115]	training's res: 0.976273
[120]	training's res: 0.978454
[125]	training's res: 0.980764
[130]	training's res: 0.982321
[135]	training's res: 0.983878
[140]	training's res: 0.986704
[145]	training's res: 0.987904
[150]	training's res: 0.989787
[155]	training's res: 0.991429
[160]	training's res: 0.992804


### 预测

In [10]:
pred=model.predict(test.drop(['uid'],axis=1))

In [11]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [12]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.38 else 0)
#res.label = res.label.map(lambda x: int(x))

In [13]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])