In [1]:
from skopt import BayesSearchCV
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb

In [3]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import numpy as np
# 产生随机分类数据集，10个特征， 2个类别
x, y = make_classification(n_samples=1000,n_features=10,n_classes=2)

In [4]:
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

### 方法1

In [5]:
model_lgb = lgb.LGBMClassifier(
            learning_rate=0.008,   # 学习率
            n_estimators=10000,    # 树的个数
            max_depth=3,         # 树的最大深度
            num_leaves=31,        # 叶子节点个数 'leaf-wise'
            min_split_gain=0.0,     # 节点分裂所需的最小损失函数下降值
            objective='cross_entropy', # 多分类
            metric='auc',  # 评价函数
            #num_class=2,          # 多分类问题类别数
            subsample=0.6,        # 样本随机采样作为训练集的比例
            colsample_bytree=0.6, # 使用特征比例
            seed=1)

In [8]:
# lightgbm scikit-optimize
def lgb_auto_para_tuning_bayesian(model_lgb,X,Y):
    train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.75, random_state=0)
    # cv：交叉验证 n_points：并行采样的超参组数
    opt = BayesSearchCV(model_lgb,cv=3,n_points=2,n_jobs=4,verbose=1,
        search_spaces={
            'learning_rate': Real(0.008, 0.01),
            'max_depth': Integer(3, 11),
            'num_leaves': Integer(31, 127),
            'min_split_gain':Real(0.0,0.4),
            'min_child_weight':Real(0.001,0.002),
            'min_child_samples':Integer(18,22),
            'subsample':Real(0.6,1.0),
            'subsample_freq':Integer(3,5),
            'colsample_bytree':Real(0.6,1.0),
            'reg_alpha':Real(0,0.5),
            'reg_lambda':Real(0,0.5)
        },
         fit_params={
                 'eval_set':[(test_x, test_y)],
                 'eval_metric': 'auc',
                 'early_stopping_rounds': 50
                 })
    opt.fit(train_x,train_y)
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(test_x, test_y))
    print("Best parameters: ", opt.best_params_)

### 产出训练集与测试级结果

In [9]:
lgb_auto_para_tuning_bayesian(model_lgb,x,y
                            )

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Fitting 3 folds for each of 2 candidates, totalling 6 fi



Fitting 3 folds for each of 2 candidates, totalling 6 fits
val. score: 0.8333333333333334
test score: 0.896
Best parameters:  OrderedDict([('colsample_bytree', 0.7799680537921099), ('learning_rate', 0.009661432675686504), ('max_depth', 11), ('min_child_samples', 19), ('min_child_weight', 0.001891216953247175), ('min_split_gain', 0.4), ('num_leaves', 127), ('reg_alpha', 0.03155422222046529), ('reg_lambda', 0.0), ('subsample', 1.0), ('subsample_freq', 5)])


### 测试集得分 0.896

### 最佳参数
Best parameters:  OrderedDict([('colsample_bytree', 0.7799680537921099), ('learning_rate', 0.009661432675686504), ('max_depth', 11), ('min_child_samples', 19), ('min_child_weight', 0.001891216953247175), ('min_split_gain', 0.4), ('num_leaves', 127), ('reg_alpha', 0.03155422222046529), ('reg_lambda', 0.0), ('subsample', 1.0), ('subsample_freq', 5)])

### 方法2

In [11]:
# 产生随机分类数据集，10个特征， 2个类别
#x, y = make_classification(n_samples=1000,n_features=10,n_classes=2)
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.75, random_state=0)

In [12]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import numpy as np

rf = RandomForestClassifier()
print(np.mean(cross_val_score(rf, train_x, train_y, cv=20, scoring='roc_auc')))




0.928281779008926


## 运用贝叶斯寻找4个参数的最优值

In [20]:
def rf_cv(n_estimators, min_samples_split, max_features, max_depth):
    val = cross_val_score(
        RandomForestClassifier(n_estimators=int(n_estimators),
            min_samples_split=int(min_samples_split),
            max_features=min(max_features, 0.999), # float
            max_depth=int(max_depth),
            random_state=2
        ),
        train_x, train_y, scoring='roc_auc', cv=5
    ).mean()
    return val

rf_bo = BayesianOptimization(
        rf_cv,
        {'n_estimators': (10, 250),
        'min_samples_split': (2, 25),
        'max_features': (0.1, 0.999),
        'max_depth': (5, 15)}
    )

rf_bo.maximize()

|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9192  [0m | [0m 10.76   [0m | [0m 0.8269  [0m | [0m 16.34   [0m | [0m 104.5   [0m |
| [0m 2       [0m | [0m 0.9175  [0m | [0m 7.593   [0m | [0m 0.9844  [0m | [0m 23.98   [0m | [0m 202.6   [0m |
| [0m 3       [0m | [0m 0.9175  [0m | [0m 7.106   [0m | [0m 0.9792  [0m | [0m 14.86   [0m | [0m 129.6   [0m |
| [0m 4       [0m | [0m 0.9032  [0m | [0m 10.19   [0m | [0m 0.1173  [0m | [0m 15.75   [0m | [0m 153.5   [0m |
| [0m 5       [0m | [0m 0.9171  [0m | [0m 14.05   [0m | [0m 0.6315  [0m | [0m 11.11   [0m | [0m 36.45   [0m |
| [95m 6       [0m | [95m 0.921   [0m | [95m 10.05   [0m | [95m 0.6445  [0m | [95m 15.06   [0m | [95m 104.8   [0m |
| [0m 7       [0m | [0m 0.9007  [0m | [0m 7.82    [0m | [0m 0.1188  [0m | [0m 7.908   [0m | [0m 107.7  

### 最优结果对应的参数

In [21]:
rf_bo.max

{'target': 0.9210210589536905,
 'params': {'max_depth': 10.04601563059487,
  'max_features': 0.6445073144692921,
  'min_samples_split': 15.06326947843053,
  'n_estimators': 104.80329098764881}}

### 重新将参数带入模型

In [24]:
clf_new=RandomForestClassifier(n_estimators=int(104.80329098764881),
            min_samples_split=int(15.06326947843053),
            max_features=min(0.6445073144692921, 0.999), # float
            max_depth=int(10.04601563059487),
            random_state=2)

### 训练数据

In [28]:
clf_new.fit(train_x, train_y)

RandomForestClassifier(max_depth=10, max_features=0.6445073144692921,
                       min_samples_split=15, n_estimators=104, random_state=2)

### 用测试集计算auc

In [30]:
pred=clf_new.predict(test_x)

In [31]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(test_y, pred)
metrics.auc(fpr, tpr)

0.8897435897435898

### 最终方法1效果更好