In [8]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.model_selection import train_test_split  

In [56]:
from sklearn.datasets import make_hastie_10_2  # 生成12000行数据
data, target = make_hastie_10_2()
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=123)
X_train.shape, X_test.shape
y_train # y中有负值不能应用在xgboost中
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(y_train)  # 解决y的取值问题

In [21]:
# 对比模型
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import time

In [22]:
clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = AdaBoostClassifier()
clf4 = GradientBoostingClassifier()
clf5 = XGBClassifier()


for clf, label in zip([clf1, clf2, clf3, clf4, clf5], [
        'Logistic Regression', 'Random Forest', 'AdaBoost', 'GBDT', 'XGBoost'
]):
    start = time.time()
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    end = time.time()
    running_time = end - start
    print("Accuracy: %0.8f (+/- %0.2f),耗时%0.2f秒。模型名称[%s]" %
          (scores.mean(), scores.std(), running_time, label))

Accuracy: 0.48977778 (+/- 0.00),耗时0.06秒。模型名称[Logistic Regression]
Accuracy: 0.88555556 (+/- 0.00),耗时9.17秒。模型名称[Random Forest]
Accuracy: 0.87655556 (+/- 0.00),耗时1.75秒。模型名称[AdaBoost]
Accuracy: 0.91477778 (+/- 0.00),耗时7.77秒。模型名称[GBDT]
Accuracy: 0.92866667 (+/- 0.00),耗时5.85秒。模型名称[XGBoost]


In [23]:
# 使用xgboost
import xgboost as xgb
import time

start_time = time.time()
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)
params = {
    'booster': 'gbtree',
    'eta': 0.007,    # lr
    'min_child_weight': 3,
    'max_depth': 6,  
    'gamma': 0.1,  
    'subsample': 0.7,  
    'colsample_bytree': 0.7,  
    'lambda': 2, 
    'seed': 1000,  
}
plst = list(params.items())
num_rounds = 500  
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]

In [24]:
# 进行模型训练
model = xgb.train(
    plst,
    xgb_train,
    num_rounds,
    watchlist,
    early_stopping_rounds=100,
)
print("best best_ntree_limit", model.best_ntree_limit)
y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
print('error=%f' %
      (sum(1
           for i in range(len(y_pred)) if int(y_pred[i] > 0.5) != y_test[i]) /
       float(len(y_pred))))
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)......")

[0]	train-rmse:0.49936	val-rmse:1.12260
[1]	train-rmse:0.49871	val-rmse:1.12210
[2]	train-rmse:0.49810	val-rmse:1.12162
[3]	train-rmse:0.49747	val-rmse:1.12109
[4]	train-rmse:0.49681	val-rmse:1.12055
[5]	train-rmse:0.49616	val-rmse:1.12006
[6]	train-rmse:0.49551	val-rmse:1.11958
[7]	train-rmse:0.49488	val-rmse:1.11910
[8]	train-rmse:0.49426	val-rmse:1.11861
[9]	train-rmse:0.49361	val-rmse:1.11808
[10]	train-rmse:0.49299	val-rmse:1.11759
[11]	train-rmse:0.49237	val-rmse:1.11710
[12]	train-rmse:0.49173	val-rmse:1.11662
[13]	train-rmse:0.49111	val-rmse:1.11610
[14]	train-rmse:0.49048	val-rmse:1.11564
[15]	train-rmse:0.48987	val-rmse:1.11514
[16]	train-rmse:0.48924	val-rmse:1.11465
[17]	train-rmse:0.48860	val-rmse:1.11413
[18]	train-rmse:0.48801	val-rmse:1.11366
[19]	train-rmse:0.48742	val-rmse:1.11317
[20]	train-rmse:0.48682	val-rmse:1.11269
[21]	train-rmse:0.48626	val-rmse:1.11223
[22]	train-rmse:0.48567	val-rmse:1.11178
[23]	train-rmse:0.48508	val-rmse:1.11132
[24]	train-rmse:0.48449	va

[198]	train-rmse:0.40051	val-rmse:1.04360
[199]	train-rmse:0.40011	val-rmse:1.04327
[200]	train-rmse:0.39971	val-rmse:1.04292
[201]	train-rmse:0.39930	val-rmse:1.04263
[202]	train-rmse:0.39892	val-rmse:1.04231
[203]	train-rmse:0.39854	val-rmse:1.04198
[204]	train-rmse:0.39819	val-rmse:1.04169
[205]	train-rmse:0.39780	val-rmse:1.04135
[206]	train-rmse:0.39741	val-rmse:1.04103
[207]	train-rmse:0.39702	val-rmse:1.04069
[208]	train-rmse:0.39664	val-rmse:1.04036
[209]	train-rmse:0.39624	val-rmse:1.04005
[210]	train-rmse:0.39583	val-rmse:1.03972
[211]	train-rmse:0.39541	val-rmse:1.03939
[212]	train-rmse:0.39505	val-rmse:1.03909
[213]	train-rmse:0.39467	val-rmse:1.03878
[214]	train-rmse:0.39432	val-rmse:1.03850
[215]	train-rmse:0.39395	val-rmse:1.03818
[216]	train-rmse:0.39358	val-rmse:1.03786
[217]	train-rmse:0.39317	val-rmse:1.03754
[218]	train-rmse:0.39281	val-rmse:1.03725
[219]	train-rmse:0.39241	val-rmse:1.03693
[220]	train-rmse:0.39205	val-rmse:1.03662
[221]	train-rmse:0.39169	val-rmse:

[394]	train-rmse:0.33809	val-rmse:0.99135
[395]	train-rmse:0.33783	val-rmse:0.99112
[396]	train-rmse:0.33757	val-rmse:0.99088
[397]	train-rmse:0.33731	val-rmse:0.99068
[398]	train-rmse:0.33707	val-rmse:0.99046
[399]	train-rmse:0.33684	val-rmse:0.99025
[400]	train-rmse:0.33658	val-rmse:0.99003
[401]	train-rmse:0.33629	val-rmse:0.98981
[402]	train-rmse:0.33606	val-rmse:0.98961
[403]	train-rmse:0.33582	val-rmse:0.98940
[404]	train-rmse:0.33556	val-rmse:0.98919
[405]	train-rmse:0.33529	val-rmse:0.98898
[406]	train-rmse:0.33502	val-rmse:0.98877
[407]	train-rmse:0.33476	val-rmse:0.98855
[408]	train-rmse:0.33452	val-rmse:0.98834
[409]	train-rmse:0.33427	val-rmse:0.98813
[410]	train-rmse:0.33400	val-rmse:0.98791
[411]	train-rmse:0.33374	val-rmse:0.98769
[412]	train-rmse:0.33349	val-rmse:0.98750
[413]	train-rmse:0.33324	val-rmse:0.98728
[414]	train-rmse:0.33300	val-rmse:0.98708
[415]	train-rmse:0.33276	val-rmse:0.98687
[416]	train-rmse:0.33251	val-rmse:0.98665
[417]	train-rmse:0.33226	val-rmse:

In [66]:
# 使用sklearn实现xgboost
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier

clf = XGBClassifier(
    learning_rate=0.7,
    min_child_weight=1,
    max_depth=6,
    gamma=0,
    subsample=1,
    max_delta_step=0,
    colsample_bytree=1,
    reg_lambda=1,
    n_estimators=100,
    seed=1000)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test) 
print('Accuracy : %.4g' % metrics.accuracy_score(y_true, y_pred))  # 为什么准确率这么低

Accuracy : 0.478
