导入相关包及原生XGBoost

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score

加载鸢尾花数据

In [10]:
iris = datasets.load_iris()
data = iris.data
label = iris.target
print(pd.DataFrame(data).head())
print(pd.DataFrame(label).head())
data1 = pd.DataFrame(data)
label1 = pd.DataFrame(label)
# 特征重命名
data1.columns = ['sepal_l','sepal_w','petal_l','petal_w']
label1.columns = ['label']
print(data1.head())
print(label1.head())

     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2
   0
0  0
1  0
2  0
3  0
4  0
   sepal_l  sepal_w  petal_l  petal_w
0      5.1      3.5      1.4      0.2
1      4.9      3.0      1.4      0.2
2      4.7      3.2      1.3      0.2
3      4.6      3.1      1.5      0.2
4      5.0      3.6      1.4      0.2
   label
0      0
1      0
2      0
3      0
4      0


划分数据集

In [11]:
train_x, test_x, train_y, test_y = train_test_split(data1.values, label1.values, 
                                                    test_size=0.3, random_state = 42)
print('training length: ',len(train_x))
print('testing length:',len(test_x))

training length:  105
testing length: 45


建模

In [15]:
# XGBoost要求数据符合特定矩阵形态
train_data = xgb.DMatrix(train_x,label=train_y)
test_data = xgb.DMatrix(test_x,label=test_y)

# 设置参数
# multi:softmax是使用softmax后产生的分类结果, multi:softprob是输出的概率矩阵
xgb_params = {
    'eta': 0.3, 'silent': True, 'objective': 'multi:softprob',
    'num_class': 3, 'max_depth': 3
}

num_round = 20

# 训练模型
# 训练数据可以用函数传入
xgb_model_1 = xgb.train(xgb_params,xgb.DMatrix(train_x, label=train_y),num_round)

# 模型预测
test_pre = xgb_model_1.predict(test_data)

# 查看概率矩阵
print(test_pre[:5])

# 去概率最大的那一列作为预测值（0,1,2）
test_pre_1 = np.asanyarray([np.argmax(row) for row in test_pre])
print('\n')
print('test的预测结果',test_pre_1)

# 模型评估
print('精确率',precision_score(test_y,test_pre_1,average='macro'))
print('召回率',recall_score(test_y,test_pre_1,average='macro'))

[[0.00650657 0.96226174 0.03123166]
 [0.970643   0.02533227 0.00402478]
 [0.0033913  0.00692109 0.9896876 ]
 [0.00654362 0.9677424  0.02571394]
 [0.00615641 0.9104776  0.08336602]]


test的预测结果 [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
精确率 1.0
召回率 1.0


Sklearn的方法使用XGBoost

In [6]:
from xgboost import XGBClassifier

xgb_model_2 = XGBClassifier(
              learning_rate=0.01,
              n_estimators=3000,
              max_depth=4,
              objective='binary:logistic',
              seed=27)

xgb_model_2.fit(train_x,train_y)

# 预测
test_pre_2 = xgb_model_2.predict(test_x)
print(test_pre_2)

# 模型评估
print('精确率',precision_score(test_y,test_pre_2,average='macro'))
print('召回率',recall_score(test_y,test_pre_2,average='macro'))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
精确率 1.0
召回率 1.0


原生LightGBM

In [17]:
import lightgbm as lgb

注意这里不需要values!

In [21]:
train_x, test_x, train_y, test_y = train_test_split(data1, label1, 
                                                    test_size=0.3, random_state = 42)
print('training length: ',len(train_x))
print('testing length:',len(test_x))

training length:  105
testing length: 45


In [22]:
# 数据格式
train_data = lgb.Dataset(train_x,train_y)
test_data = lgb.Dataset(test_x,test_y)

# 模型参数
lgb_params = {
    'boosting_type': 'gbdt', 'objective': 'multiclass',
    'metric': 'multi_error', 'verbose': 1, 'num_class': 3
}

# 模型训练
lgb_model_1 = lgb.train(lgb_params,train_data,num_boost_round=10,
                       valid_sets=[train_data,test_data],verbose_eval=10)

# 模型预测
test_pre = lgb_model_1.predict(test_x, num_iteration=lgb_model_1.best_iteration)
print(test_pre[:5])

# 最高概率预测
test_pre_1 = np.asanyarray([np.argmax(row) for row in test_pre])
print('\n')
print('预测结果',test_pre_1)

# 模型评估
print('精确率',precision_score(test_y,test_pre_1,average='macro'))
print('召回率',recall_score(test_y,test_pre_2,average='macro'))

[10]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[[0.13683286 0.63500393 0.22816321]
 [0.69436834 0.15467706 0.15095461]
 [0.12934308 0.16125127 0.70940565]
 [0.14172417 0.62195656 0.23631927]
 [0.13683286 0.63500393 0.22816321]]


预测结果 [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
精确率 1.0
召回率 1.0


Sklearn接口形式

In [23]:
lgb_params = {
    'learning_rate':0.1,
    'max_bin':150,
    'num_leaves':32,
    'max_depth':11,
    'objective':'multiclass',
    'n_estimators':300
}

lgb_model_2 = lgb.LGBMClassifier(**lgb_params)

lgb_model_2.fit(train_x,train_y)

# 模型预测
test_pre_2 = lgb_model_2.predict(test_x)
print(test_pre_2)

# 模型评估
print('精确率',precision_score(test_y,test_pre_2,average='macro'))
print('召回率',recall_score(test_y,test_pre_2,average='macro'))

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
精确率 1.0
召回率 1.0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [24]:
?lgb.train

总结：
1. lgb.train中正则化参数为"lambda_l1", "lambda_l1"，sklearn中则为'reg_alpha', 'reg_lambda'。
2. 多分类时lgb.train除了'objective':'multiclass',还要指定"num_class":5，而sklearn接口只需要指定'objective':'multiclass'。
3. 迭代次数在sklearn中是'n_estimators':20，在初始化模型时指定