In [1]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split #training and testing data split

In [5]:
train = pd.read_csv('train_featurev2.csv')
test = pd.read_csv('test_featurev2.csv')

In [6]:
train_x = train.drop(['uid','label'],axis=1)
train_y = train.label

xgb_test = xgb.DMatrix(test.drop(['uid'],axis=1))

X, val_X, y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.01,
    random_state=1,
    stratify=train_y
)

xgb_val = xgb.DMatrix(val_X, label=val_y)
xgb_train = xgb.DMatrix(X, label=y)

In [4]:
params = {
    'booster': 'gbtree',
    # 'objective': 'multi:softmax',  # 多分类的问题、
    # 'objective': 'multi:softprob',   # 多分类概率
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # 'num_class': 9,  # 类别数，与 multisoftmax 并用
    'gamma': 0.1,  # 用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子。
    'max_depth': 8,  # 构建树的深度，越大越容易过拟合
    'alpha': 0,   # L1正则化系数
    'lambda': 10,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    'subsample': 0.7,  # 随机采样训练样本
    'colsample_bytree': 0.5,  # 生成树时进行的列采样
    'min_child_weight': 3,
    # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
    # ，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
    # 这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
    'silent': 0,  # 设置成1则没有运行信息输出，最好是设置为0.
    'eta': 0.03,  # 如同学习率
    'seed': 1000,
    'nthread': 4,  # cpu 线程数
    'missing': -999,#删掉含-999值的特征
    'scale_pos_weight': (np.sum(y==0)/np.sum(y==1))  # 用来处理正负样本不均衡的问题,通常取：sum(negative cases) / sum(positive cases)
    # 'eval_metric': 'auc'
}

plst = list(params.items())

In [10]:
xgb.cv(plst, xgb_train, num_boost_round=100000, nfold=5,early_stopping_rounds=200,
       verbose_eval=True,folds=StratifiedKFold(n_splits=4).split(X, y))

[0]	train-auc:0.884121+0.00352921	test-auc:0.833683+0.0124563
[1]	train-auc:0.903667+0.00150154	test-auc:0.854145+0.0140947
[2]	train-auc:0.918819+0.00760368	test-auc:0.862682+0.0158631
[3]	train-auc:0.924646+0.00727596	test-auc:0.866684+0.0133764
[4]	train-auc:0.930176+0.00676429	test-auc:0.87075+0.0135406
[5]	train-auc:0.936726+0.00378167	test-auc:0.877961+0.00966246
[6]	train-auc:0.940729+0.00336684	test-auc:0.883403+0.00765014
[7]	train-auc:0.942778+0.00312163	test-auc:0.887185+0.00613436
[8]	train-auc:0.944148+0.00266823	test-auc:0.888014+0.00637841
[9]	train-auc:0.946184+0.00188074	test-auc:0.890319+0.00665201
[10]	train-auc:0.947589+0.00240545	test-auc:0.891607+0.0055692
[11]	train-auc:0.948175+0.00237549	test-auc:0.8906+0.00687401
[12]	train-auc:0.949454+0.00167707	test-auc:0.891468+0.00663534
[13]	train-auc:0.950314+0.00202296	test-auc:0.891709+0.00791747
[14]	train-auc:0.950676+0.00164138	test-auc:0.891907+0.00700174
[15]	train-auc:0.951417+0.00156678	test-auc:0.892419+0.0066

[128]	train-auc:0.985142+0.00066666	test-auc:0.909927+0.00541546
[129]	train-auc:0.985324+0.000625005	test-auc:0.910072+0.00543649
[130]	train-auc:0.985495+0.000623101	test-auc:0.910182+0.00536851
[131]	train-auc:0.985641+0.00063626	test-auc:0.910276+0.00537866
[132]	train-auc:0.985761+0.000654609	test-auc:0.910253+0.00538834
[133]	train-auc:0.985944+0.000653927	test-auc:0.910393+0.00541671
[134]	train-auc:0.986099+0.000631399	test-auc:0.910381+0.00549026
[135]	train-auc:0.986229+0.000624503	test-auc:0.910552+0.00541955
[136]	train-auc:0.986376+0.000645229	test-auc:0.910755+0.00540175
[137]	train-auc:0.986511+0.000624054	test-auc:0.910794+0.005432
[138]	train-auc:0.986675+0.000618617	test-auc:0.91084+0.00543995
[139]	train-auc:0.986852+0.000597135	test-auc:0.9109+0.00534995
[140]	train-auc:0.986971+0.000608087	test-auc:0.910921+0.00552372
[141]	train-auc:0.987113+0.000636736	test-auc:0.910995+0.00544472
[142]	train-auc:0.987235+0.000661992	test-auc:0.910976+0.00544534
[143]	train-auc:0

[253]	train-auc:0.997003+0.000218255	test-auc:0.912502+0.00555308
[254]	train-auc:0.99705+0.000201664	test-auc:0.912484+0.00556371
[255]	train-auc:0.997101+0.000196345	test-auc:0.912485+0.00556025
[256]	train-auc:0.997138+0.000197126	test-auc:0.912452+0.00553066
[257]	train-auc:0.997175+0.000197166	test-auc:0.91245+0.00552876
[258]	train-auc:0.997225+0.000188384	test-auc:0.912477+0.00557725
[259]	train-auc:0.997254+0.000176992	test-auc:0.912529+0.00559577
[260]	train-auc:0.997294+0.000163551	test-auc:0.912495+0.00559657
[261]	train-auc:0.997327+0.000166797	test-auc:0.912445+0.00558123
[262]	train-auc:0.997364+0.000165827	test-auc:0.912418+0.00554424
[263]	train-auc:0.997401+0.000163786	test-auc:0.912462+0.00550041
[264]	train-auc:0.997432+0.000162563	test-auc:0.912425+0.00543714
[265]	train-auc:0.997463+0.000164938	test-auc:0.912452+0.00546014
[266]	train-auc:0.997508+0.00015858	test-auc:0.912379+0.00550186
[267]	train-auc:0.997538+0.00015299	test-auc:0.912384+0.00548543
[268]	train-au

[378]	train-auc:0.999434+0.00010378	test-auc:0.912099+0.00467874
[379]	train-auc:0.999437+0.000104362	test-auc:0.912049+0.00469869
[380]	train-auc:0.999443+0.000104906	test-auc:0.912068+0.0047406
[381]	train-auc:0.999452+9.84358e-05	test-auc:0.912006+0.00468818
[382]	train-auc:0.999463+9.78848e-05	test-auc:0.912032+0.00464838
[383]	train-auc:0.999471+0.000103363	test-auc:0.912031+0.0046424
[384]	train-auc:0.999477+0.000100364	test-auc:0.911999+0.00462651
[385]	train-auc:0.999486+9.75738e-05	test-auc:0.911998+0.00469245
[386]	train-auc:0.999492+9.56778e-05	test-auc:0.911998+0.00464199
[387]	train-auc:0.999499+9.70925e-05	test-auc:0.912002+0.00459604
[388]	train-auc:0.999505+9.56648e-05	test-auc:0.912001+0.00458424
[389]	train-auc:0.999511+9.72819e-05	test-auc:0.911949+0.00458042
[390]	train-auc:0.999517+9.51912e-05	test-auc:0.911955+0.00454128
[391]	train-auc:0.999524+9.27504e-05	test-auc:0.911963+0.00455694
[392]	train-auc:0.999531+9.28409e-05	test-auc:0.911939+0.00451736
[393]	train-a

Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.833683,0.012456,0.884121,0.003529
1,0.854145,0.014095,0.903667,0.001502
2,0.862682,0.015863,0.918819,0.007604
3,0.866684,0.013376,0.924646,0.007276
4,0.870750,0.013541,0.930176,0.006764
5,0.877961,0.009662,0.936726,0.003782
6,0.883403,0.007650,0.940729,0.003367
7,0.887185,0.006134,0.942778,0.003122
8,0.888014,0.006378,0.944148,0.002668
9,0.890319,0.006652,0.946184,0.001881


In [11]:
num_rounds=2000
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
xgb_model = xgb.train(plst, xgb_train, num_rounds, watchlist,early_stopping_rounds=200)
xgb_model.save_model('xgb1.model')

[0]	train-auc:0.867303	val-auc:0.814363
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 200 rounds.
[1]	train-auc:0.919482	val-auc:0.795393
[2]	train-auc:0.931091	val-auc:0.846883
[3]	train-auc:0.936924	val-auc:0.815718
[4]	train-auc:0.938752	val-auc:0.848238
[5]	train-auc:0.940132	val-auc:0.852304
[6]	train-auc:0.941648	val-auc:0.855014
[7]	train-auc:0.942811	val-auc:0.863144
[8]	train-auc:0.943899	val-auc:0.865854
[9]	train-auc:0.946148	val-auc:0.863144
[10]	train-auc:0.946525	val-auc:0.872629
[11]	train-auc:0.947828	val-auc:0.883469
[12]	train-auc:0.948857	val-auc:0.880759
[13]	train-auc:0.950949	val-auc:0.878049
[14]	train-auc:0.951957	val-auc:0.869919
[15]	train-auc:0.952627	val-auc:0.875339
[16]	train-auc:0.952868	val-auc:0.875339
[17]	train-auc:0.953111	val-auc:0.875339
[18]	train-auc:0.953812	val-auc:0.878049
[19]	train-auc:0.95431	val-auc:0.869919
[20]	train-auc:0.954614	val-auc:0.867209
[21]	train

[195]	train-auc:0.991559	val-auc:0.872629
[196]	train-auc:0.991665	val-auc:0.872629
[197]	train-auc:0.991785	val-auc:0.872629
[198]	train-auc:0.991869	val-auc:0.869919
[199]	train-auc:0.991949	val-auc:0.869919
[200]	train-auc:0.992056	val-auc:0.872629
[201]	train-auc:0.992195	val-auc:0.872629
[202]	train-auc:0.99229	val-auc:0.875339
[203]	train-auc:0.992417	val-auc:0.869919
[204]	train-auc:0.992515	val-auc:0.872629
[205]	train-auc:0.992606	val-auc:0.872629
[206]	train-auc:0.992671	val-auc:0.872629
[207]	train-auc:0.992749	val-auc:0.872629
[208]	train-auc:0.9928	val-auc:0.869919
[209]	train-auc:0.992876	val-auc:0.869919
[210]	train-auc:0.992926	val-auc:0.869919
[211]	train-auc:0.993028	val-auc:0.869919
[212]	train-auc:0.993088	val-auc:0.869919
[213]	train-auc:0.993121	val-auc:0.872629
[214]	train-auc:0.993231	val-auc:0.875339
[215]	train-auc:0.993278	val-auc:0.872629
[216]	train-auc:0.993355	val-auc:0.872629
[217]	train-auc:0.993455	val-auc:0.872629
[218]	train-auc:0.99351	val-auc:0.872

In [12]:
pred=xgb_model.predict(xgb_test)
res =pd.DataFrame({'uid':test.uid,'label':pred})
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

res.to_csv("XGB1.csv", index=False,header=False,sep=',',columns=['uid','label'])