In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_feature_V1.csv')
#test_a = pd.read_csv('../data/test_feature_a_V1.csv')
test = pd.read_csv('../data/test_feature_b_V1.csv')

In [3]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
#dtest_a = lgb.Dataset(test_a.drop(['uid'],axis=1))
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

# 设置参数

In [4]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq':1,
    'verbosity':-1,
}    

In [5]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre = pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds = pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    res = 0.6*auc + 0.4*f1
    
    return 'res',res,True

# 交叉验证

In [6]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.980784 + 0.000490742
[10]	cv_agg's res: 0.98573 + 0.000318488
[15]	cv_agg's res: 0.988381 + 0.000292508
[20]	cv_agg's res: 0.991135 + 0.000263388
[25]	cv_agg's res: 0.992865 + 0.000452885
[30]	cv_agg's res: 0.994366 + 0.000296659
[35]	cv_agg's res: 0.99551 + 0.000230299
[40]	cv_agg's res: 0.996425 + 0.000278859
[45]	cv_agg's res: 0.997364 + 0.000239461
[50]	cv_agg's res: 0.998004 + 5.23855e-05
[55]	cv_agg's res: 0.998431 + 0.000225212
[60]	cv_agg's res: 0.998723 + 9.20818e-05
[65]	cv_agg's res: 0.998915 + 0.000107255
[70]	cv_agg's res: 0.999076 + 9.88949e-05
[75]	cv_agg's res: 0.999257 + 1.60154e-05
[80]	cv_agg's res: 0.999376 + 1.84797e-05
[85]	cv_agg's res: 0.999478 + 1.14183e-05
[90]	cv_agg's res: 0.999545 + 1.11426e-05
[95]	cv_agg's res: 0.999637 + 1.11018e-05
[100]	cv_agg's res: 0.999658 + 1.9436e-05
[105]	cv_agg's res: 0.99974 + 3.24012e-05
[110]	cv_agg's res: 0.999766 + 2.64521e-05
[115]	cv_agg's res: 0.999798 + 2.95588e-05
[120]	cv_agg's res: 0.999826 + 3.15

{'res-mean': [0.9709924657498931,
  0.9778538566732355,
  0.9786718238469723,
  0.9801940197373463,
  0.9807835061219334,
  0.9822101704457703,
  0.9837441214926076,
  0.9848460951137978,
  0.9850573974931995,
  0.9857297360538232,
  0.9863650127831293,
  0.9868405928413854,
  0.9867972938597368,
  0.9878048253420165,
  0.9883809055359842,
  0.9894012019451877,
  0.9895815540690261,
  0.989902774784762,
  0.9903767579417617,
  0.9911347107952203,
  0.9913151731568796,
  0.9916611073272737,
  0.9919434665529329,
  0.9924142672338795,
  0.9928646632963681,
  0.9932901139860357,
  0.9935099623584298,
  0.9936926454481504,
  0.993906680557286,
  0.9943661929245082,
  0.9945836569237793,
  0.9947925873227526,
  0.995092074740457,
  0.9953569895123798,
  0.9955101763109679,
  0.9957030223032883,
  0.9958931182100318,
  0.9959737119937241,
  0.9962503541914148,
  0.9964252191154271,
  0.9965546706965532,
  0.996799467119355,
  0.9970921863158777,
  0.9971603561693412,
  0.9973638053177876,
  

# 训练

In [7]:
model = lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.981662
[10]	training's res: 0.98566
[15]	training's res: 0.987957
[20]	training's res: 0.99062
[25]	training's res: 0.992859
[30]	training's res: 0.99371
[35]	training's res: 0.99529
[40]	training's res: 0.996716
[45]	training's res: 0.997457
[50]	training's res: 0.998002
[55]	training's res: 0.998439
[60]	training's res: 0.998687
[65]	training's res: 0.998988
[70]	training's res: 0.999128
[75]	training's res: 0.999248
[80]	training's res: 0.999348
[85]	training's res: 0.999506
[90]	training's res: 0.999667
[95]	training's res: 0.999716
[100]	training's res: 0.999761
[105]	training's res: 0.999782
[110]	training's res: 0.999794
[115]	training's res: 0.999839
[120]	training's res: 0.999879
[125]	training's res: 0.999918
[130]	training's res: 0.999934
[135]	training's res: 0.999966
[140]	training's res: 0.999971
[145]	training's res: 0.99997
[150]	training's res: 0.999976
[155]	training's res: 0.99998
[160]	training's res: 0.999982
[165]	training's res: 0.999986
[17

# 预测

In [8]:
pred = model.predict(test.drop(['uid'],axis=1))

In [9]:
res = pd.DataFrame({'uid':test.uid,'label':pred})

In [10]:
res = res.sort_values(by='label',ascending=False)
res.label = res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [11]:
res.to_csv('../result/result.csv',index=False,header=False,sep=',',columns=['uid','label'])