## 机器学习参数调优
```
机器学习网络超参数自动化搜索，主要有以下三种方式：
1. 随机参数调优：在参数中随机组合，缺点是不一定能找到最优参数
2. 网格搜索：将所有参数组合的模型训练一遍，缺点：训练时间长
3. 贝叶斯优化：通过猜测目标优化函数来求解最大值，和网格搜索相比，优点是迭代次数少，粒度小，缺点是不容易找到全局最优解
```
### 贝叶斯优化
```
贝叶斯优化思想概述：在不知道函数分布的情况下，进行多轮迭代，每轮迭代操作：在样本空间中随机采样，计算函数值，并以此粗略估计函数分布，下一次迭代会在上一次迭代的基础上选择合适的采样点，然后计算函数值，并更新函数分布。经过多轮迭代，最终可以确定大致的模型分布，随后可以在采样空间中确定达到目标的最优解。

贝叶斯优化的过程中还会做勘探采样，以防把局部最优当成全局最优。
```

## 数据导入

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
%matplotlib inline

# Read in data into dataframes 
train_features = pd.read_csv('data/training_features.csv')
test_features = pd.read_csv('data/testing_features.csv')
train_labels = pd.read_csv('data/training_labels.csv')
test_labels = pd.read_csv('data/testing_labels.csv')

# Display sizes of data
print('Training Feature Size: ', train_features.shape)
print('Testing Feature Size:  ', test_features.shape)
print('Training Labels Size:  ', train_labels.shape)
print('Testing Labels Size:   ', test_labels.shape)

Training Feature Size:  (6622, 64)
Testing Feature Size:   (2839, 64)
Training Labels Size:   (6622, 1)
Testing Labels Size:    (2839, 1)


## 缺失值填充

In [2]:
# Create an imputer object with a median filling strategy
from sklearn.preprocessing import Imputer, MinMaxScaler
imputer = Imputer(strategy='median')

# Train on the training features
imputer.fit(train_features)

# Transform both training data and testing data
X = imputer.transform(train_features)
X_test = imputer.transform(test_features)

print('Missing values in training features: ', np.sum(np.isnan(X)))
print('Missing values in testing features:  ', np.sum(np.isnan(X_test)))

# Make sure all values are finite
print(np.where(~np.isfinite(X)))
print(np.where(~np.isfinite(X_test)))

Missing values in training features:  0
Missing values in testing features:   0
(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))




## 数据标准化-归一化

In [3]:
# Create the scaler object with a range of 0-1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data
scaler.fit(X)

# Transform both the training and testing data
X = scaler.transform(X)
X_test = scaler.transform(X_test)

# Convert y to one-dimensional array (vector)
y = np.array(train_labels).reshape((-1, ))
y_test = np.array(test_labels).reshape((-1, ))

## 定义参数空间
使用GBDT模型

In [4]:
# Loss function to be optimized
loss = ['ls', 'lad', 'huber']

# Number of trees used in the boosting process
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each tree
max_depth = [2, 3, 5, 10, 15]

# Minimum number of samples per leaf
min_samples_leaf = [1, 2, 4, 6, 8]

# Minimum number of samples to split a node
min_samples_split = [2, 4, 6, 10]

# Maximum number of features to consider for making splits
max_features = ['auto', 'sqrt', 'log2', None]

# Define the grid of hyperparameters to search
hyperparameter_grid = {'loss': loss,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split,
                       'max_features': max_features}

## 创建GBDT模型

In [5]:
# Create the model to use for hyperparameter tuning
model = GradientBoostingRegressor(random_state = 42)

## 定义评估模型

In [6]:
# Function to calculate mean absolute error
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

# Takes in a model, trains the model, and evaluates the model on the test set
def fit_and_evaluate(model):
    
    # Train the model
    model.fit(X, y)
    
    # Make predictions and evalute
    model_pred = model.predict(X_test)
    model_mae = mae(y_test, model_pred)
    
    # Return the performance metric
    return model_mae

## 使用随机参数选择

RandomizedSearchCV

In [7]:
# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=25, 
                               scoring = 'neg_mean_absolute_error',
                               n_jobs = -1, verbose = 1, 
                               return_train_score = True,
                               random_state=42)

In [18]:
# 模型训练
random_cv.fit(X, y)

Fitting 4 folds for each of 25 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished


RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=GradientBoostingRegressor(alpha=0.9,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                            

In [19]:
# 获取最优参数组合的模型
random_cv.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='lad', max_depth=5,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=6, min_samples_split=6,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

## 使用网格搜索
GridSearchCV

In [20]:
# Grid Search Object using the trees range and the random forest model
grid_search = GridSearchCV(estimator = model, param_grid=hyperparameter_grid, cv = 4, 
                           scoring = 'neg_mean_absolute_error', verbose = 1,
                           n_jobs = -1, return_train_score = True)

In [None]:
# Fit the grid search
grid_search.fit(X, y)

Fitting 4 folds for each of 6000 candidates, totalling 24000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed: 45.1min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed: 60.8min
[Parallel(n_jobs=-1)]: Done 4026 tasks      | elapsed: 91.4min
[Parallel(n_jobs=-1)]: Done 4976 tasks      | elapsed: 136.1min


In [None]:
# 获取最优参数组合的模型
random_cv.best_estimator_

## 使用贝叶斯优化

### 贝叶斯优化需要的四个主要部分
-  1. Objective 目标函数
-  2. Domain space: 指定参数空间
-  3. Hyperparameter optimization function: 可以选择的采样算法，随机或者贝叶斯优化
-  4. Trials: 记录结果的保存

tpe_best = fmin(fn=objective, space=space, algo=tpe_algo, trials=tpe_trials, 
                max_evals=2000, rstate= np.random.RandomState(50))

#### bayes-01.定义目标函数

In [16]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

def objective(params):
    """Objective function for Gradient Boosting Machine Hyperparameter Optimization"""
    
    # Keep track of evals
    global ITERATION
    
    ITERATION += 1
    
    # Perform n_folds cross validation
    model = GradientBoostingRegressor(**params,random_state = 42)
    
    start = timer()
    
    model_mae = fit_and_evaluate(model)
   
    run_time = timer() - start
    
    # Loss must be minimized
    loss = model_mae
    

    # Write to the csv file ('a' means append)
    of_connection = open(out_file, 'a')
    writer = csv.writer(of_connection)
    writer.writerow([loss, params, ITERATION, n_estimators, run_time])
    
    # Dictionary with information for evaluation
    return {'loss': loss, 'params': params, 'iteration': ITERATION, 
            'train_time': run_time, 'status': STATUS_OK}

#### bayes-02.参数空间定义

In [20]:
# Define the search space

from hyperopt import hp
space = {
    'loss': hp.choice('loss', ['ls', 'lad', 'huber']),
    'n_estimators': hp.choice('n_estimators', [100, 500, 900, 1100, 1500]),
    'max_depth': hp.quniform('max_depth', 2, 15, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'min_samples_split': hp.choice('min_samples_split', [2, 4, 6, 10]),
    'max_features': hp.choice('max_features',['auto', 'sqrt', 'log2', None])
}

#### bayes-03.选择贝叶斯优化方式

In [8]:
from hyperopt import tpe

# optimization algorithm
tpe_algorithm = tpe.suggest

#### bayes-04.选择记录保存

In [9]:
from hyperopt import Trials
# Keep track of results
bayes_trials = Trials()

#### bayes-05.训练&&最优参数选择

In [21]:
from hyperopt import fmin
MAX_EVALS = 500

global  ITERATION
ITERATION = 0
# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest, 
            max_evals = MAX_EVALS, trials = bayes_trials, rstate = np.random.RandomState(50))

  0%|                                                                            | 0/500 [00:00<?, ?it/s, best loss: ?]


NameError: name 'start' is not defined

#### bayes-06.最优结果统计

In [None]:
# 可以将字典值保存于csv文件中
bayes_trials_results = sorted(bayes_trials.results, key = lambda x: x['loss'])
bayes_trials_results[:2]

# 创建csv文件保存最优数据
out_file = 'results/gbm_trials.csv'
of_connection = open(out_file, 'w')
writer = csv.writer(of_connection)

writer.writerow(['loss', 'params', 'iteration', 'train_time'])
of_connection.close()



results = pd.read_csv('results/gbm_trials.csv')
results.sort_values('loss', ascending = True, inplace = True)
results.reset_index(inplace = True, drop = True)
results.head()