In [None]:
from control import lqr
import numpy as np
from matplotlib import pyplot as plt
import os, re
import pandas as pd
from matplotlib import font_manager  #matplotlib中 中文设置模块

In [None]:
## 输出高清图像
% config InlineBackend.figure_format = 'retina'
% matplotlib inline
import platform
# 图像显示中文的问题，需要判断系统是windows还是苹果的
import matplotlib
import platform

sys_platform = platform.platform().lower()
if "windows" in sys_platform:
    font = {
        "family": "Times New Roman"
    }
    matplotlib.rc("font", **font)
else:
    font = {
        "family": "Arial Unicode MS"
    }
    matplotlib.rc("font", **font)
rc = {"mathtext.fontset": "stix", }

plt.rcParams.update(rc)

# 提取数据

## Libs

In [None]:
import numpy as np
import pandas as pd
import os

In [None]:
# Recursively print the path of every file found under ./data.
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the design results table.
df = pd.read_csv('./data/opensees_design_res.csv')
df.info()
# A row is flagged "ok" unless it exceeds either limit:
#   - drift_max above 1/200
#   - a_max above 0.35 * 9.8 (presumably 0.35 g in m/s^2 -- TODO confirm units)
# NaN values compare as False against the limits and therefore stay "ok".
df['ok'] = ~((df["drift_max"] > 1 / 200) | (df["a_max"] > 0.35 * 9.8))
# Keep the sample as the last expression so the notebook actually displays it.
df.sample(3)

In [None]:
# Number of rows for each value of the "ok" flag.
df["ok"].value_counts()

In [None]:
# Summary statistics of the loaded table.
df.describe()

In [None]:
# Feature ranking
def plot_features_importances(feature_importances, feature_names, minor_share=0.05):
    """
    Rank features by importance, draw a horizontal bar chart and return the
    names of the main features.

    Features are sorted by ascending importance and accumulated starting from
    the least important one. The features accumulated *before* the running
    total reaches ``minor_share`` are treated as minor; all remaining features
    (including the one that makes the total reach the cut-off) are the main
    features. If the total never reaches ``minor_share``, every feature is
    treated as a main feature.

    :param feature_importances: importance value of each feature (array-like)
    :param feature_names: feature names, in the same order as the importances
    :param minor_share: cumulative importance at which the minor group ends;
        the default 0.05 is the cut-off that was previously hard-coded
    :return: names of the main features, most important first
    """
    # Fancy indexing below needs an ndarray; accept lists/tuples as well.
    feature_importances = np.asarray(feature_importances)

    plt.figure(figsize=(5, 20))
    indices = np.argsort(feature_importances)  # ascending importance
    indices_flip = indices[::-1]  # descending importance
    names = []
    for f in range(len(feature_names)):
        # `names` follows the ascending order used for the bar positions,
        # while the printed table lists the features from most to least important.
        names.append(feature_names[indices[f]])
        print("%2d) %-*s %f" % (f + 1, 30, feature_names[indices_flip[f]], feature_importances[indices_flip[f]]))

    # Position (in ascending order) of the first feature whose cumulative
    # importance reaches the cut-off.
    sum_importances = 0
    threshold = 0
    for i in range(len(feature_importances)):
        sum_importances += feature_importances[indices[i]]
        if sum_importances >= minor_share:
            threshold = i
            break

    pos = np.arange(indices.shape[0])
    # Minor features use the default colour, main features are drawn in red.
    plt.barh(pos[0:threshold], feature_importances[indices[0:threshold]], align="center")
    plt.barh(pos[threshold:], feature_importances[indices[threshold:]], align="center", color="red")
    plt.ylim([pos[0] - 1, pos[-1] + 1])
    plt.ylabel("排序", fontname='SimSun', fontsize=10)
    plt.xlabel("相对重要性", fontname='SimSun', fontsize=10)
    plt.yticks(pos, names, fontsize=5)
    plt.xticks(fontsize=5)
    # Write the importance value (3 decimals) next to the end of each bar.
    xlabel = feature_importances[indices]
    ylabel = pos
    for x1, y1 in zip(xlabel, ylabel):
        x1 = np.around(x1, decimals=3)
        plt.text(x1 + 0.0005, y1 - 0.25, '%.3f' % x1, fontsize=5)
    plt.show()
    return names[threshold:][::-1]

# 分类问题

## Libs

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
# 利用GridSearchCV选择最优参数
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.metrics import plot_roc_curve
from sklearn.linear_model import LogisticRegression

from mlxtend.classifier import StackingClassifier

from collections import Counter

## 特征工程

In [None]:
# Model inputs: all columns except id, site_name, miu, zeta, kappa, gamma, alpha.
# The numbered groups are generated instead of typed out; the resulting list
# is m1..m9, k1..k9, c1..c9, sa1..sa30, sv1..sv30, sd1..sd30 followed by the
# summary columns.
x_label = (
    [f"m{i}" for i in range(1, 10)]
    + [f"k{i}" for i in range(1, 10)]
    + [f"c{i}" for i in range(1, 10)]
    + [f"sa{i}" for i in range(1, 31)]
    + [f"sv{i}" for i in range(1, 31)]
    + [f"sd{i}" for i in range(1, 31)]
    + ["sa_max", "sa_avg", "sv_max", "sv_avg", "sd_max", "sd_avg", "pga", "pgv", "pgd", "epa", "epv", "epd", "pa",
       "pv", "pd", "ic"]
)
y_label = ["ok"]
X = df.loc[:, x_label].values
# Flatten the single target column to shape (n_samples,): scikit-learn
# classifiers expect a 1-D target and warn when given an (n, 1) column vector.
y = df.loc[:, y_label].values.ravel()

In [None]:
# Hold out 10% of the rows as the test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)
# y_test = y_test.reshape([-1,])

In [None]:
# shuffle_index = np.random.permutation(934)
# X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Sanity check: shapes of the train/test arrays.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
new_X_train = X_train
new_X_test = X_test

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
# normalizer = Normalizer(copy=True, norm='l2').fit(new_X_train)
ss = StandardScaler().fit(new_X_train)
new_X_train = ss.transform(new_X_train)
new_X_test = ss.transform(new_X_test)

In [None]:
# Display the standardised test features.
new_X_test

## 随机森林

In [None]:
# Bayesian hyper-parameter search for the random-forest classifier.
from skopt import BayesSearchCV

rfc = RandomForestClassifier()

# Candidate values per hyper-parameter. random_state has a single candidate,
# so it is effectively fixed to 3 for every fitted forest.
param_grid = {'max_depth': np.linspace(start=10, stop=500, num=50, dtype=int),
              'n_estimators': np.linspace(start=50, stop=3000, num=60, dtype=int),
              'criterion': ['gini', 'entropy'],
              'min_samples_leaf': np.array([2, 5, 10, 15], dtype=int),
              'min_samples_split': np.array([2, 5, 10, 15], dtype=int),
              'random_state': [3]}

# 50 search iterations, 3-fold cross-validation, 3 parallel jobs.
# NOTE(review): no random_state is passed to BayesSearchCV itself, so the
# search is presumably not repeatable between runs -- confirm if that matters.
opt = BayesSearchCV(rfc, param_grid, n_iter=50, n_jobs=3, cv=3)

opt.fit(new_X_train, y_train)

In [None]:
# Keep the best hyper-parameters found by the search for the refit cells below.
print(opt.best_params_)
best_params = opt.best_params_

In [None]:
# Refit a random forest on all features with the hyper-parameters selected by the search.
rfc = RandomForestClassifier(n_estimators=best_params["n_estimators"],
                             criterion=best_params['criterion'],
                             max_depth=best_params["max_depth"], min_samples_split=best_params["min_samples_split"],
                             min_samples_leaf=best_params["min_samples_leaf"], random_state=best_params["random_state"])
rfc.fit(new_X_train, y_train)
y_pre_rfc = rfc.predict(new_X_test)

print('正确标签：', y_test)
print('预测结果：', y_pre_rfc)

# Accuracy on the training and test splits.
print('训练集分数：', rfc.score(new_X_train, y_train))
print('测试集分数：', rfc.score(new_X_test, y_test))
# Confusion matrix drawn as a heat map with the count written in every cell.
conf_mat = confusion_matrix(y_test, y_pre_rfc)
plt.matshow(conf_mat, cmap=plt.cm.Blues)
for (i, j), z in np.ndenumerate(conf_mat):
    plt.text(j, i, '{0:d}'.format(z), ha='center', va='center')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Raw confusion-matrix counts.
print('混淆矩阵：')
print(conf_mat)

# Text report with precision, recall, F1, etc.
print('分类指标报告：')
print(classification_report(y_test, y_pre_rfc))

# Feature importances of the fitted forest.
print(rfc.feature_importances_)

# Scatter of predicted vs. true labels per test sample.
fig = plt.figure()
ax = fig.add_subplot(111)
f2 = ax.scatter(list(range(len(X_test))), y_pre_rfc, marker='o')
f1 = ax.scatter(list(range(len(X_test))), y_test, marker='*')
plt.legend(handles=[f1, f2], labels=['True', 'Prediction'])
plt.show()

In [None]:
# Plot the importance ranking of the forest and keep the names of the main features.
main_features = plot_features_importances(rfc.feature_importances_, x_label)

In [None]:
# Display the selected main feature names.
main_features

In [None]:
# Rebuild the data set using only the main features.
X_main = df.loc[:, main_features].values
# Same test_size and random_state as the earlier split; y_train / y_test are reassigned here.
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)
new_X_main_test

In [None]:
# Refit the random forest on the main features only, reusing the tuned hyper-parameters.
rfc = RandomForestClassifier(n_estimators=best_params["n_estimators"],
                             criterion=best_params['criterion'],
                             max_depth=best_params["max_depth"], min_samples_split=best_params["min_samples_split"],
                             min_samples_leaf=best_params["min_samples_leaf"], random_state=best_params["random_state"])
rfc.fit(new_X_main_train, y_train)
y_pre_rfc = rfc.predict(new_X_main_test)

print('正确标签：', y_test)
print('预测结果：', y_pre_rfc)

# Accuracy on the training and test splits.
print('训练集分数：', rfc.score(new_X_main_train, y_train))
print('测试集分数：', rfc.score(new_X_main_test, y_test))
# Confusion matrix drawn as a heat map with the count written in every cell.
conf_mat = confusion_matrix(y_test, y_pre_rfc)
plt.matshow(conf_mat, cmap=plt.cm.Blues)
for (i, j), z in np.ndenumerate(conf_mat):
    plt.text(j, i, '{0:d}'.format(z), ha='center', va='center')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Raw confusion-matrix counts.
print('混淆矩阵：')
print(conf_mat)

# Text report with precision, recall, F1, etc.
print('分类指标报告：')
print(classification_report(y_test, y_pre_rfc))

# Feature importances of the fitted forest.
print(rfc.feature_importances_)

# Scatter of predicted vs. true labels per test sample.
fig = plt.figure()
ax = fig.add_subplot(111)
f2 = ax.scatter(list(range(len(X_test))), y_pre_rfc, marker='o')
f1 = ax.scatter(list(range(len(X_test))), y_test, marker='*')
plt.legend(handles=[f1, f2], labels=['True', 'Prediction'])
plt.show()

## XGBoost

In [None]:
# Bayesian hyper-parameter search for the XGBoost classifier.
from skopt import BayesSearchCV

xgBoostC = XGBClassifier()
random_state = 3

# Candidate values per hyper-parameter. random_state has a single candidate,
# so it is effectively fixed for every fitted model.
param_grid = {'learning_rate': np.array([0.01, 0.015, 0.025, 0.05, 0.1], dtype=float),
              'gamma': np.array([0, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1], dtype=float),
              "reg_alpha": np.array([0, 0.01, 0.1, 1], dtype=float),
              "reg_lambda": np.array([0, 0.1, 0.5, 1], dtype=float),
              "min_child_weight": np.array([1, 3, 5, 7], dtype=int),
              "colsample_bytree": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1], dtype=float),
              'subsample': np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1], dtype=float),
              'max_depth': np.array([3, 5, 8, 15, 25, 30], dtype=int),
              'n_estimators': np.linspace(start=50, stop=3000, num=60, dtype=int),
              'random_state': [random_state]}

# 50 search iterations, 3-fold cross-validation, 3 parallel jobs.
# NOTE: this rebinds `opt`, replacing the random-forest search object.
opt = BayesSearchCV(xgBoostC, param_grid, n_iter=50, n_jobs=3, cv=3)

opt.fit(new_X_train, y_train)

In [None]:
# Keep the best XGBoost hyper-parameters; this overwrites the random-forest `best_params`.
print(opt.best_params_)
best_params = opt.best_params_

In [None]:
# Refit XGBoost on all features with the hyper-parameters selected by the search.
xgBoostC = XGBClassifier(n_estimators=best_params["n_estimators"],
                         colsample_bytree=best_params['colsample_bytree'],
                         max_depth=best_params["max_depth"],
                         learning_rate=best_params["learning_rate"],
                         gamma=best_params["gamma"],
                         min_child_weight=best_params["min_child_weight"],
                         reg_alpha=best_params["reg_alpha"],
                         reg_lambda=best_params["reg_lambda"],
                         subsample=best_params["subsample"],
                         random_state=best_params["random_state"])
xgBoostC.fit(new_X_train, y_train)
y_pre_xgBoostC = xgBoostC.predict(new_X_test)

print('正确标签：', y_test)
print('预测结果：', y_pre_xgBoostC)

# Accuracy on the training and test splits.
print('训练集分数：', xgBoostC.score(new_X_train, y_train))
print('测试集分数：', xgBoostC.score(new_X_test, y_test))
# Confusion matrix drawn as a heat map with the count written in every cell.
conf_mat = confusion_matrix(y_test, y_pre_xgBoostC)
plt.matshow(conf_mat, cmap=plt.cm.Blues)
for (i, j), z in np.ndenumerate(conf_mat):
    plt.text(j, i, '{0:d}'.format(z), ha='center', va='center')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Raw confusion-matrix counts.
print('混淆矩阵：')
print(conf_mat)

# Text report with precision, recall, F1, etc.
print('分类指标报告：')
print(classification_report(y_test, y_pre_xgBoostC))

# Feature importances of the fitted model.
print(xgBoostC.feature_importances_)

# Scatter of predicted vs. true labels per test sample.
fig = plt.figure()
ax = fig.add_subplot(111)
f2 = ax.scatter(list(range(len(X_test))), y_pre_xgBoostC, marker='o')
f1 = ax.scatter(list(range(len(X_test))), y_test, marker='*')
plt.legend(handles=[f1, f2], labels=['True', 'Prediction'])
plt.show()

In [None]:
# Plot the XGBoost importance ranking and keep the names of the main features.
main_features = plot_features_importances(xgBoostC.feature_importances_, x_label)

In [None]:
# Display the selected main feature names.
main_features

In [None]:
# Rebuild the data set using only the main features.
X_main = df.loc[:, main_features].values
# Same test_size and random_state as the earlier split; y_train / y_test are reassigned here.
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# Train on the training split and evaluate on the held-out split. Using the
# full matrix as "train" would not match y_train in length, and using the
# training split as "test" would leak training rows into the evaluation.
new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)
new_X_main_test

In [None]:
# Refit XGBoost on the main features only, reusing the tuned hyper-parameters.
xgBoostC = XGBClassifier(n_estimators=best_params["n_estimators"],
                         colsample_bytree=best_params['colsample_bytree'],
                         max_depth=best_params["max_depth"],
                         learning_rate=best_params["learning_rate"],
                         gamma=best_params["gamma"],
                         min_child_weight=best_params["min_child_weight"],
                         reg_alpha=best_params["reg_alpha"],
                         reg_lambda=best_params["reg_lambda"],
                         subsample=best_params["subsample"],
                         random_state=best_params["random_state"])
xgBoostC.fit(new_X_main_train, y_train)
y_pre_xgBoostC = xgBoostC.predict(new_X_main_test)

print('正确标签：', y_test)
print('预测结果：', y_pre_xgBoostC)

# Accuracy on the training and test splits.
print('训练集分数：', xgBoostC.score(new_X_main_train, y_train))
print('测试集分数：', xgBoostC.score(new_X_main_test, y_test))
# Confusion matrix drawn as a heat map with the count written in every cell.
conf_mat = confusion_matrix(y_test, y_pre_xgBoostC)
plt.matshow(conf_mat, cmap=plt.cm.Blues)
for (i, j), z in np.ndenumerate(conf_mat):
    plt.text(j, i, '{0:d}'.format(z), ha='center', va='center')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
# Raw confusion-matrix counts.
print('混淆矩阵：')
print(conf_mat)

# Text report with precision, recall, F1, etc.
print('分类指标报告：')
print(classification_report(y_test, y_pre_xgBoostC))

# Feature importances of the fitted model.
print(xgBoostC.feature_importances_)

# Scatter of predicted vs. true labels per test sample.
fig = plt.figure()
ax = fig.add_subplot(111)
f2 = ax.scatter(list(range(len(X_test))), y_pre_xgBoostC, marker='o')
f1 = ax.scatter(list(range(len(X_test))), y_test, marker='*')
plt.legend(handles=[f1, f2], labels=['True', 'Prediction'])
plt.show()

## 绘制ROC曲线

In [None]:
## 输出高清图像
% config InlineBackend.figure_format = 'retina'
% matplotlib inline
import platform
## 图像显示中文的问题，需要判断系统是windows还是苹果的
import matplotlib
import platform

sys_platform = platform.platform().lower()
if "windows" in sys_platform:
    font = {
        "family": "Microsoft YaHei"
    }
    matplotlib.rc("font", **font)
else:
    font = {
        "family": "Arial Unicode MS"
    }
    matplotlib.rc("font", **font)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Draw the ROC curves of both classifiers on one shared axis.
# NOTE(review): when the notebook is run top to bottom, `rfc` and `xgBoostC`
# were last fitted on the reduced main-feature matrices, while `new_X_test`
# holds all features -- confirm the feature counts match, otherwise this fails.
# NOTE(review): `plot_roc_curve` appears to have been removed from newer
# scikit-learn releases in favour of `RocCurveDisplay.from_estimator` --
# confirm against the installed version.
plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_roc_curve(rfc, new_X_test, y_test, ax=ax)
plot_roc_curve(xgBoostC, new_X_test, y_test, ax=ax)
# plot_roc_curve(knc, new_X_test, y_test, ax=ax)
# Stacking model (disabled)
# fpr, tpr, thresholds = roc_curve(y_test, stac.decision_function(new_X_test))
# stack_auc = roc_auc_score(y_test, stac.decision_function(new_X_test))
# plt.plot(fpr, tpr, label="Stacking(AUC={:.2f})".format(stack_auc))
# These rcParams are changed after the curves above have been drawn.
plt.rcParams['font.sans-serif'] = ['Times New Roman']
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.rcParams['axes.unicode_minus'] = False
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
plt.xlabel("假正率", fontname='SimSun', fontsize=10)
plt.ylabel("真正率", fontname='SimSun', fontsize=10)
plt.xlim(0, 1)
plt.ylim(0, 1)
# plt.title("ROC曲线", fontname='SimSun', fontsize=10)
plt.legend()
plt.show()

## 保存训练好的模型

# 回归问题

## Libs

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from mlxtend.regressor import StackingRegressor
from xgboost import XGBRegressor

## 特征工程

In [None]:
# Regression inputs: m1..m9, k1..k9, c1..c9, sa1..sa30, sv1..sv30, sd1..sd30,
# followed by the summary columns. The numbered groups are generated.
x_label = (
    [f"m{i}" for i in range(1, 10)]
    + [f"k{i}" for i in range(1, 10)]
    + [f"c{i}" for i in range(1, 10)]
    + [f"sa{i}" for i in range(1, 31)]
    + [f"sv{i}" for i in range(1, 31)]
    + [f"sd{i}" for i in range(1, 31)]
    + ["sa_max", "sa_avg", "sv_max", "sv_avg", "sd_max", "sd_avg", "pga", "pgv", "pgd", "epa", "epv", "epd", "pa",
       "pv", "pd", "ic"]
)

# Regression targets, in this column order.
y_label = ["drift_max", "drift_avg", "a_max", "a_avg", "T_1"]

# Only rows flagged "ok" are used for regression; work on an independent copy.
df_ok_origin = df.loc[df["ok"] == True]
df_ok = df_ok_origin.copy()
df_ok.describe()
X = df_ok[x_label].to_numpy()
y = df_ok[y_label].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the "ok" rows as the test set; fixed seed, rows shuffled before splitting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2, shuffle=True)

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# One 1-D target vector per column of y, in y_label order:
# 0 -> drift_max (theta_max), 1 -> drift_avg (theta_avg),
# 2 -> a_max, 3 -> a_avg, 4 -> T_1.
(y_train_theta_max, y_train_theta_avg, y_train_a_max,
 y_train_a_avg, y_train_T1) = (y_train[:, col] for col in range(5))
(y_test_theta_max, y_test_theta_avg, y_test_a_max,
 y_test_a_avg, y_test_T1) = (y_test[:, col] for col in range(5))

In [None]:
# Feature scaling. NOTE: a MinMaxScaler (range 0..1) is constructed below but
# never applied in this cell; the features are standardised with StandardScaler.
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler(copy=True, feature_range=(0, 1))
new_X_train = X_train
new_X_test = X_test
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

# normalizer = Normalizer(copy=True, norm='l2').fit(new_X_train)
# new_X_train = normalizer.transform(new_X_train)
# new_X_test = normalizer.transform(new_X_test)
# Fit the scaler on the training split only, then apply it to both splits.
ss = StandardScaler().fit(new_X_train)
new_X_train = ss.transform(new_X_train)
new_X_test = ss.transform(new_X_test)

# Inputs for the T1 target: only the first 27 columns, i.e. m1..m9, k1..k9 and
# c1..c9 in x_label order.
new_X_T1_train = new_X_train[:, 0: 27]
new_X_T1_test = new_X_test[:, 0: 27]

In [None]:
import matplotlib.pyplot as plt


def plot_regression_results(ax, y_true, y_pred, title, scores):
    """Scatter predicted against true target values on ``ax``.

    A dashed red diagonal marks perfect predictions, both axes are limited to
    the range of ``y_true``, and ``scores`` is shown as text in a legend box
    in the upper-left corner.

    :param ax: matplotlib axes to draw on
    :param y_true: true target values (must provide ``min()`` / ``max()``)
    :param y_pred: predicted target values
    :param title: axes title
    :param scores: text displayed in the legend box
    """
    lower, upper = y_true.min(), y_true.max()
    value_range = [lower, upper]

    ax.plot(value_range, value_range, '--r', linewidth=2)
    ax.scatter(y_true, y_pred, alpha=0.2)

    # Hide the top/right spines and keep ticks on the bottom/left only.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    # Offset the remaining spines outward by 10 points.
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))

    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')

    # Zero-size, borderless patch: a legend handle whose only job is to
    # carry the score text.
    placeholder = plt.Rectangle((0, 0), 0, 0, fc="w", fill=False,
                                edgecolor='none', linewidth=0)
    ax.legend([placeholder], [scores], loc='upper left')
    ax.set_title(title)


def rmsle(y, y_pred):
    """Return the root-mean-square error between ``y`` and ``y_pred``.

    Note: despite the name, no logarithm is applied; this is the square root
    of ``mean_squared_error``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

## 随机森林

In [None]:
# One untuned random-forest regressor per target; each gets its own search.
rfr_theta_max = RandomForestRegressor()
rfr_theta_avg = RandomForestRegressor()
rfr_a_max = RandomForestRegressor()
rfr_a_avg = RandomForestRegressor()

# Bayesian hyper-parameter optimisation
from skopt import BayesSearchCV

# Candidate values per hyper-parameter, shared by all four searches.
# random_state has a single candidate, so it is effectively fixed to 3.
param_grid = {'max_depth': np.linspace(start=10, stop=500, num=50, dtype=int),
              'n_estimators': np.arange(start=50, stop=1000, step=50, dtype=int),
              'min_samples_leaf': np.array([2, 5, 10, 15], dtype=int),
              'min_samples_split': np.array([2, 5, 10, 15], dtype=int),
              'random_state': [3]}

# 30 search iterations, 3-fold cross-validation, 3 parallel jobs per target.
opt_theta_max = BayesSearchCV(rfr_theta_max, param_grid, n_iter=30, n_jobs=3, cv=3)
opt_theta_avg = BayesSearchCV(rfr_theta_avg, param_grid, n_iter=30, n_jobs=3, cv=3)
opt_a_max = BayesSearchCV(rfr_a_max, param_grid, n_iter=30, n_jobs=3, cv=3)
opt_a_avg = BayesSearchCV(rfr_a_avg, param_grid, n_iter=30, n_jobs=3, cv=3)

# All searches use the same standardised features with their own target vector.
opt_theta_max.fit(new_X_train, y_train_theta_max)
opt_theta_avg.fit(new_X_train, y_train_theta_avg)
opt_a_max.fit(new_X_train, y_train_a_max)
opt_a_avg.fit(new_X_train, y_train_a_avg)

In [None]:
# Best random-forest hyper-parameters for the theta_max (drift_max) target.
best_params_theta_max = opt_theta_max.best_params_
best_params_theta_max

In [None]:
# Best random-forest hyper-parameters for the theta_avg (drift_avg) target.
best_params_theta_avg = opt_theta_avg.best_params_
best_params_theta_avg

In [None]:
# Best random-forest hyper-parameters for the a_max target.
best_params_a_max = opt_a_max.best_params_
best_params_a_max

In [None]:
# Best random-forest hyper-parameters for the a_avg target.
best_params_a_avg = opt_a_avg.best_params_
best_params_a_avg

### theta_max

In [None]:
# --- theta_max (drift_max): random forest on all features ---
rfr_theta_max = RandomForestRegressor(max_depth=best_params_theta_max["max_depth"],
                                      n_estimators=best_params_theta_max["n_estimators"],
                                      min_samples_leaf=best_params_theta_max["min_samples_leaf"],
                                      min_samples_split=best_params_theta_max["min_samples_split"],
                                      random_state=best_params_theta_max["random_state"]
                                      )
rfr_theta_max.fit(new_X_train, y_train_theta_max)

train_pred_theta_max = rfr_theta_max.predict(new_X_train)
pred_theta_max = rfr_theta_max.predict(new_X_test)
# Training-set RMSE (rmsle() computes a plain RMSE despite its name).
print(rmsle(y_train_theta_max, train_pred_theta_max))

from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Test-set metrics.
r2_theta_max = r2_score(y_test_theta_max, pred_theta_max)
mse_theta_max = mean_squared_error(y_test_theta_max, pred_theta_max)
mape_theta_max = mean_absolute_percentage_error(y_test_theta_max, pred_theta_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# The format arguments follow the label order of the template: R^2, MAPE, MSE.
plot_regression_results(
    ax, y_test_theta_max, pred_theta_max,
    "Random Forest Regressor",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_theta_max, mape_theta_max, mse_theta_max))

main_features = plot_features_importances(rfr_theta_max.feature_importances_, x_label)

# --- theta_max: random forest on the main features only ---
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# y_train_theta_max / y_test_theta_max from the earlier split are reused:
# this split uses the same test_size and random_state.

new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)

rfr_theta_max = RandomForestRegressor(max_depth=best_params_theta_max["max_depth"],
                                      n_estimators=best_params_theta_max["n_estimators"],
                                      min_samples_leaf=best_params_theta_max["min_samples_leaf"],
                                      min_samples_split=best_params_theta_max["min_samples_split"],
                                      random_state=best_params_theta_max["random_state"]
                                      )
rfr_theta_max.fit(new_X_main_train, y_train_theta_max)

train_pred_theta_max = rfr_theta_max.predict(new_X_main_train)
pred_theta_max = rfr_theta_max.predict(new_X_main_test)
print(rmsle(y_train_theta_max, train_pred_theta_max))

r2_theta_max = r2_score(y_test_theta_max, pred_theta_max)
mse_theta_max = mean_squared_error(y_test_theta_max, pred_theta_max)
mape_theta_max = mean_absolute_percentage_error(y_test_theta_max, pred_theta_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_theta_max, pred_theta_max,
    "Random Forest Regressor (main features)",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_theta_max, mape_theta_max, mse_theta_max))

### theta_avg

In [None]:
# --- theta_avg (drift_avg): random forest on all features ---
rfr_theta_avg = RandomForestRegressor(max_depth=best_params_theta_avg["max_depth"],
                                      n_estimators=best_params_theta_avg["n_estimators"],
                                      min_samples_leaf=best_params_theta_avg["min_samples_leaf"],
                                      min_samples_split=best_params_theta_avg["min_samples_split"],
                                      random_state=best_params_theta_avg["random_state"]
                                      )
rfr_theta_avg.fit(new_X_train, y_train_theta_avg)

train_pred_theta_avg = rfr_theta_avg.predict(new_X_train)
pred_theta_avg = rfr_theta_avg.predict(new_X_test)
# Training-set RMSE (rmsle() computes a plain RMSE despite its name).
print(rmsle(y_train_theta_avg, train_pred_theta_avg))

from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Test-set metrics.
r2_theta_avg = r2_score(y_test_theta_avg, pred_theta_avg)
mse_theta_avg = mean_squared_error(y_test_theta_avg, pred_theta_avg)
mape_theta_avg = mean_absolute_percentage_error(y_test_theta_avg, pred_theta_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# The format arguments follow the label order of the template: R^2, MAPE, MSE.
plot_regression_results(
    ax, y_test_theta_avg, pred_theta_avg,
    "Random Forest Regressor",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_theta_avg, mape_theta_avg, mse_theta_avg))

main_features = plot_features_importances(rfr_theta_avg.feature_importances_, x_label)

# --- theta_avg: random forest on the main features only ---
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# y_train_theta_avg / y_test_theta_avg from the earlier split are reused:
# this split uses the same test_size and random_state.

new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)

rfr_theta_avg = RandomForestRegressor(max_depth=best_params_theta_avg["max_depth"],
                                      n_estimators=best_params_theta_avg["n_estimators"],
                                      min_samples_leaf=best_params_theta_avg["min_samples_leaf"],
                                      min_samples_split=best_params_theta_avg["min_samples_split"],
                                      random_state=best_params_theta_avg["random_state"]
                                      )
rfr_theta_avg.fit(new_X_main_train, y_train_theta_avg)

train_pred_theta_avg = rfr_theta_avg.predict(new_X_main_train)
pred_theta_avg = rfr_theta_avg.predict(new_X_main_test)
print(rmsle(y_train_theta_avg, train_pred_theta_avg))

r2_theta_avg = r2_score(y_test_theta_avg, pred_theta_avg)
mse_theta_avg = mean_squared_error(y_test_theta_avg, pred_theta_avg)
mape_theta_avg = mean_absolute_percentage_error(y_test_theta_avg, pred_theta_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_theta_avg, pred_theta_avg,
    "Random Forest Regressor (main features)",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_theta_avg, mape_theta_avg, mse_theta_avg))

### a_max

In [None]:
# --- a_max: random forest on all features ---
rfr_a_max = RandomForestRegressor(max_depth=best_params_a_max["max_depth"],
                                  n_estimators=best_params_a_max["n_estimators"],
                                  min_samples_leaf=best_params_a_max["min_samples_leaf"],
                                  min_samples_split=best_params_a_max["min_samples_split"],
                                  random_state=best_params_a_max["random_state"]
                                  )
rfr_a_max.fit(new_X_train, y_train_a_max)

train_pred_a_max = rfr_a_max.predict(new_X_train)
pred_a_max = rfr_a_max.predict(new_X_test)
# Training-set RMSE (rmsle() computes a plain RMSE despite its name).
print(rmsle(y_train_a_max, train_pred_a_max))

from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Test-set metrics.
r2_a_max = r2_score(y_test_a_max, pred_a_max)
mse_a_max = mean_squared_error(y_test_a_max, pred_a_max)
mape_a_max = mean_absolute_percentage_error(y_test_a_max, pred_a_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# The format arguments follow the label order of the template: R^2, MAPE, MSE.
plot_regression_results(
    ax, y_test_a_max, pred_a_max,
    "Random Forest Regressor",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_a_max, mape_a_max, mse_a_max))

main_features = plot_features_importances(rfr_a_max.feature_importances_, x_label)

# --- a_max: random forest on the main features only ---
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# y_train_a_max / y_test_a_max from the earlier split are reused:
# this split uses the same test_size and random_state.

new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)

rfr_a_max = RandomForestRegressor(max_depth=best_params_a_max["max_depth"],
                                  n_estimators=best_params_a_max["n_estimators"],
                                  min_samples_leaf=best_params_a_max["min_samples_leaf"],
                                  min_samples_split=best_params_a_max["min_samples_split"],
                                  random_state=best_params_a_max["random_state"]
                                  )
rfr_a_max.fit(new_X_main_train, y_train_a_max)

train_pred_a_max = rfr_a_max.predict(new_X_main_train)
pred_a_max = rfr_a_max.predict(new_X_main_test)
print(rmsle(y_train_a_max, train_pred_a_max))

r2_a_max = r2_score(y_test_a_max, pred_a_max)
mse_a_max = mean_squared_error(y_test_a_max, pred_a_max)
mape_a_max = mean_absolute_percentage_error(y_test_a_max, pred_a_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_a_max, pred_a_max,
    "Random Forest Regressor (main features)",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_a_max, mape_a_max, mse_a_max))

### a_avg

In [None]:
# --- a_avg: random forest on all features ---
rfr_a_avg = RandomForestRegressor(max_depth=best_params_a_avg["max_depth"],
                                  n_estimators=best_params_a_avg["n_estimators"],
                                  min_samples_leaf=best_params_a_avg["min_samples_leaf"],
                                  min_samples_split=best_params_a_avg["min_samples_split"],
                                  random_state=best_params_a_avg["random_state"]
                                  )
rfr_a_avg.fit(new_X_train, y_train_a_avg)

train_pred_a_avg = rfr_a_avg.predict(new_X_train)
pred_a_avg = rfr_a_avg.predict(new_X_test)
# Training-set RMSE (rmsle() computes a plain RMSE despite its name).
print(rmsle(y_train_a_avg, train_pred_a_avg))

from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Test-set metrics.
r2_a_avg = r2_score(y_test_a_avg, pred_a_avg)
mse_a_avg = mean_squared_error(y_test_a_avg, pred_a_avg)
mape_a_avg = mean_absolute_percentage_error(y_test_a_avg, pred_a_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# The format arguments follow the label order of the template: R^2, MAPE, MSE.
plot_regression_results(
    ax, y_test_a_avg, pred_a_avg,
    "Random Forest Regressor",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_a_avg, mape_a_avg, mse_a_avg))

main_features = plot_features_importances(rfr_a_avg.feature_importances_, x_label)

# --- a_avg: random forest on the main features only ---
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# y_train_a_avg / y_test_a_avg from the earlier split are reused:
# this split uses the same test_size and random_state.

new_X_main_train = X_main_train
new_X_main_test = X_main_test

# Standardise with statistics fitted on the training split only.
ss_main = StandardScaler().fit(new_X_main_train)
new_X_main_train = ss_main.transform(new_X_main_train)
new_X_main_test = ss_main.transform(new_X_main_test)

rfr_a_avg = RandomForestRegressor(max_depth=best_params_a_avg["max_depth"],
                                  n_estimators=best_params_a_avg["n_estimators"],
                                  min_samples_leaf=best_params_a_avg["min_samples_leaf"],
                                  min_samples_split=best_params_a_avg["min_samples_split"],
                                  random_state=best_params_a_avg["random_state"]
                                  )
rfr_a_avg.fit(new_X_main_train, y_train_a_avg)

train_pred_a_avg = rfr_a_avg.predict(new_X_main_train)
pred_a_avg = rfr_a_avg.predict(new_X_main_test)
print(rmsle(y_train_a_avg, train_pred_a_avg))

r2_a_avg = r2_score(y_test_a_avg, pred_a_avg)
mse_a_avg = mean_squared_error(y_test_a_avg, pred_a_avg)
mape_a_avg = mean_absolute_percentage_error(y_test_a_avg, pred_a_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_a_avg, pred_a_avg,
    "Random Forest Regressor (main features)",
    (r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$')
    .format(r2_a_avg, mape_a_avg, mse_a_avg))

## XGBoost

In [None]:
from skopt import BayesSearchCV

# One untuned XGBoost regressor per target; each is handed to its own search.
xgbr_theta_max = XGBRegressor()
xgbr_theta_avg = XGBRegressor()
xgbr_a_max = XGBRegressor()
xgbr_a_avg = XGBRegressor()

# Bayesian optimisation search space, shared by all four targets.
param_grid = {'learning_rate': np.array([0.01, 0.015, 0.025, 0.05, 0.1], dtype=float),
              'gamma': np.array([0, 0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 1], dtype=float),
              "reg_alpha": np.array([0, 0.01, 0.1, 1], dtype=float),
              "reg_lambda": np.array([0, 0.1, 0.5, 1], dtype=float),
              "min_child_weight": np.array([1, 3, 5, 7], dtype=int),
              "colsample_bytree": np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1], dtype=float),
              'subsample': np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1], dtype=float),
              'max_depth': np.array([3, 5, 8, 15, 25, 30], dtype=int),
              'n_estimators': np.linspace(start=50, stop=3000, num=60, dtype=int),
              'random_state': [3]}

# Settings common to every search. random_state seeds the optimiser itself:
# without it the sampled candidates -- and therefore best_params_ and all
# downstream results -- change on every kernel restart. The seed matches the
# model seed used in param_grid.
search_kwargs = dict(n_iter=30, n_jobs=3, cv=3, random_state=3)

opt_theta_max = BayesSearchCV(xgbr_theta_max, param_grid, **search_kwargs)
opt_theta_avg = BayesSearchCV(xgbr_theta_avg, param_grid, **search_kwargs)
opt_a_max = BayesSearchCV(xgbr_a_max, param_grid, **search_kwargs)
opt_a_avg = BayesSearchCV(xgbr_a_avg, param_grid, **search_kwargs)

# Each search is fitted on the same scaled training features against its own target.
opt_theta_max.fit(new_X_train, y_train_theta_max)
opt_theta_avg.fit(new_X_train, y_train_theta_avg)
opt_a_max.fit(new_X_train, y_train_a_max)
opt_a_avg.fit(new_X_train, y_train_a_avg)

In [None]:
# Best hyper-parameter combination found by the search fitted on the theta_max
# target; the bare expression on the last line makes the notebook display it.
best_params_theta_max = opt_theta_max.best_params_
best_params_theta_max

In [None]:
# Best hyper-parameter combination found by the search fitted on the theta_avg
# target; the bare expression on the last line makes the notebook display it.
best_params_theta_avg = opt_theta_avg.best_params_
best_params_theta_avg

In [None]:
# Best hyper-parameter combination found by the search fitted on the a_max
# target; the bare expression on the last line makes the notebook display it.
best_params_a_max = opt_a_max.best_params_
best_params_a_max

In [None]:
# Best hyper-parameter combination found by the search fitted on the a_avg
# target; the bare expression on the last line makes the notebook display it.
# NOTE(review): the random-forest section reads a variable of the same name with
# different keys, so this cell must run before the XGBoost a_avg cell -- confirm
# the notebook is always executed top to bottom.
best_params_a_avg = opt_a_avg.best_params_
best_params_a_avg

### theta_max

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Text shown in the score box of each plot; the placeholders are filled in
# the order (R^2, MAPE, MSE), so the format() arguments must follow that order.
score_template = r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$'

# Tuned hyper-parameters for theta_max, restricted to the keys handed to
# XGBRegressor. Built once so both fits below use exactly the same settings.
xgb_params_theta_max = {
    key: best_params_theta_max[key]
    for key in ("n_estimators", "colsample_bytree", "max_depth", "learning_rate",
                "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
                "subsample", "random_state")
}

# ---- XGBoost for theta_max, trained on the full feature set ----
xgbr_theta_max = XGBRegressor(**xgb_params_theta_max)
xgbr_theta_max.fit(new_X_train, y_train_theta_max)

train_pred_theta_max = xgbr_theta_max.predict(new_X_train)
pred_theta_max = xgbr_theta_max.predict(new_X_test)
# Training-set error from the notebook's rmsle helper (defined in another cell).
print(rmsle(y_train_theta_max, train_pred_theta_max))

# Hold-out metrics.
r2_theta_max = r2_score(y_test_theta_max, pred_theta_max)
mse_theta_max = mean_squared_error(y_test_theta_max, pred_theta_max)
mape_theta_max = mean_absolute_percentage_error(y_test_theta_max, pred_theta_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# Fix: the arguments were previously passed as (r2, mse, mape), which printed
# the MSE next to the "MAPE" label and vice versa. The label string
# (presumably the plot title -- confirm against plot_regression_results) now
# names the model that is actually plotted.
plot_regression_results(
    ax, y_test_theta_max, pred_theta_max,
    "XGBoost (theta_max, all features)",
    score_template.format(r2_theta_max, mape_theta_max, mse_theta_max))

# plot_features_importances returns the feature names that are used below as
# column labels of df_ok.
main_features = plot_features_importances(xgbr_theta_max.feature_importances_, x_label)

# ---- Refit using only the selected main features ----
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# NOTE(review): y_train_theta_max / y_test_theta_max are NOT re-derived from
# the split above; they come from an earlier cell. This is only correct if that
# earlier split used the same rows, test_size and random_state -- confirm.

# Standardise with statistics estimated on the training rows only.
ss_main = StandardScaler().fit(X_main_train)
new_X_main_train = ss_main.transform(X_main_train)
new_X_main_test = ss_main.transform(X_main_test)

xgbr_theta_max = XGBRegressor(**xgb_params_theta_max)
xgbr_theta_max.fit(new_X_main_train, y_train_theta_max)

train_pred_theta_max = xgbr_theta_max.predict(new_X_main_train)
pred_theta_max = xgbr_theta_max.predict(new_X_main_test)
print(rmsle(y_train_theta_max, train_pred_theta_max))

r2_theta_max = r2_score(y_test_theta_max, pred_theta_max)
mse_theta_max = mean_squared_error(y_test_theta_max, pred_theta_max)
mape_theta_max = mean_absolute_percentage_error(y_test_theta_max, pred_theta_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_theta_max, pred_theta_max,
    "XGBoost (theta_max, main features)",
    score_template.format(r2_theta_max, mape_theta_max, mse_theta_max))

### theta_avg

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Text shown in the score box of each plot; the placeholders are filled in
# the order (R^2, MAPE, MSE), so the format() arguments must follow that order.
score_template = r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$'

# Tuned hyper-parameters for theta_avg, restricted to the keys handed to
# XGBRegressor. Built once so both fits below use exactly the same settings.
xgb_params_theta_avg = {
    key: best_params_theta_avg[key]
    for key in ("n_estimators", "colsample_bytree", "max_depth", "learning_rate",
                "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
                "subsample", "random_state")
}

# ---- XGBoost for theta_avg, trained on the full feature set ----
xgbr_theta_avg = XGBRegressor(**xgb_params_theta_avg)
xgbr_theta_avg.fit(new_X_train, y_train_theta_avg)

train_pred_theta_avg = xgbr_theta_avg.predict(new_X_train)
pred_theta_avg = xgbr_theta_avg.predict(new_X_test)
# Training-set error from the notebook's rmsle helper (defined in another cell).
print(rmsle(y_train_theta_avg, train_pred_theta_avg))

# Hold-out metrics.
r2_theta_avg = r2_score(y_test_theta_avg, pred_theta_avg)
mse_theta_avg = mean_squared_error(y_test_theta_avg, pred_theta_avg)
mape_theta_avg = mean_absolute_percentage_error(y_test_theta_avg, pred_theta_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# Fix: the arguments were previously passed as (r2, mse, mape), which printed
# the MSE next to the "MAPE" label and vice versa. The label string
# (presumably the plot title -- confirm against plot_regression_results) now
# names the model that is actually plotted.
plot_regression_results(
    ax, y_test_theta_avg, pred_theta_avg,
    "XGBoost (theta_avg, all features)",
    score_template.format(r2_theta_avg, mape_theta_avg, mse_theta_avg))

# plot_features_importances returns the feature names that are used below as
# column labels of df_ok.
main_features = plot_features_importances(xgbr_theta_avg.feature_importances_, x_label)

# ---- Refit using only the selected main features ----
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# NOTE(review): y_train_theta_avg / y_test_theta_avg are NOT re-derived from
# the split above; they come from an earlier cell. This is only correct if that
# earlier split used the same rows, test_size and random_state -- confirm.

# Standardise with statistics estimated on the training rows only.
ss_main = StandardScaler().fit(X_main_train)
new_X_main_train = ss_main.transform(X_main_train)
new_X_main_test = ss_main.transform(X_main_test)

xgbr_theta_avg = XGBRegressor(**xgb_params_theta_avg)
xgbr_theta_avg.fit(new_X_main_train, y_train_theta_avg)

train_pred_theta_avg = xgbr_theta_avg.predict(new_X_main_train)
pred_theta_avg = xgbr_theta_avg.predict(new_X_main_test)
print(rmsle(y_train_theta_avg, train_pred_theta_avg))

r2_theta_avg = r2_score(y_test_theta_avg, pred_theta_avg)
mse_theta_avg = mean_squared_error(y_test_theta_avg, pred_theta_avg)
mape_theta_avg = mean_absolute_percentage_error(y_test_theta_avg, pred_theta_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_theta_avg, pred_theta_avg,
    "XGBoost (theta_avg, main features)",
    score_template.format(r2_theta_avg, mape_theta_avg, mse_theta_avg))

### a_max

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Text shown in the score box of each plot; the placeholders are filled in
# the order (R^2, MAPE, MSE), so the format() arguments must follow that order.
score_template = r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$'

# Tuned hyper-parameters for a_max, restricted to the keys handed to
# XGBRegressor. Built once so both fits below use exactly the same settings.
xgb_params_a_max = {
    key: best_params_a_max[key]
    for key in ("n_estimators", "colsample_bytree", "max_depth", "learning_rate",
                "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
                "subsample", "random_state")
}

# ---- XGBoost for a_max, trained on the full feature set ----
xgbr_a_max = XGBRegressor(**xgb_params_a_max)
xgbr_a_max.fit(new_X_train, y_train_a_max)

train_pred_a_max = xgbr_a_max.predict(new_X_train)
pred_a_max = xgbr_a_max.predict(new_X_test)
# Training-set error from the notebook's rmsle helper (defined in another cell).
print(rmsle(y_train_a_max, train_pred_a_max))

# Hold-out metrics.
r2_a_max = r2_score(y_test_a_max, pred_a_max)
mse_a_max = mean_squared_error(y_test_a_max, pred_a_max)
mape_a_max = mean_absolute_percentage_error(y_test_a_max, pred_a_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# Fix: the arguments were previously passed as (r2, mse, mape), which printed
# the MSE next to the "MAPE" label and vice versa. The label string
# (presumably the plot title -- confirm against plot_regression_results) now
# names the model that is actually plotted.
plot_regression_results(
    ax, y_test_a_max, pred_a_max,
    "XGBoost (a_max, all features)",
    score_template.format(r2_a_max, mape_a_max, mse_a_max))

# plot_features_importances returns the feature names that are used below as
# column labels of df_ok.
main_features = plot_features_importances(xgbr_a_max.feature_importances_, x_label)

# ---- Refit using only the selected main features ----
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# NOTE(review): y_train_a_max / y_test_a_max are NOT re-derived from the split
# above; they come from an earlier cell. This is only correct if that earlier
# split used the same rows, test_size and random_state -- confirm.

# Standardise with statistics estimated on the training rows only.
ss_main = StandardScaler().fit(X_main_train)
new_X_main_train = ss_main.transform(X_main_train)
new_X_main_test = ss_main.transform(X_main_test)

xgbr_a_max = XGBRegressor(**xgb_params_a_max)
xgbr_a_max.fit(new_X_main_train, y_train_a_max)

train_pred_a_max = xgbr_a_max.predict(new_X_main_train)
pred_a_max = xgbr_a_max.predict(new_X_main_test)
print(rmsle(y_train_a_max, train_pred_a_max))

r2_a_max = r2_score(y_test_a_max, pred_a_max)
mse_a_max = mean_squared_error(y_test_a_max, pred_a_max)
mape_a_max = mean_absolute_percentage_error(y_test_a_max, pred_a_max)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_a_max, pred_a_max,
    "XGBoost (a_max, main features)",
    score_template.format(r2_a_max, mape_a_max, mse_a_max))

### a_avg

In [None]:
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_squared_error

# Text shown in the score box of each plot; the placeholders are filled in
# the order (R^2, MAPE, MSE), so the format() arguments must follow that order.
score_template = r'$R^2={:.2f}$' + '\n' + r'$MAPE={:.2f}$' + '\n' + r'$MSE={:.2f}$'

# Tuned hyper-parameters for a_avg, restricted to the keys handed to
# XGBRegressor. Built once so both fits below use exactly the same settings.
xgb_params_a_avg = {
    key: best_params_a_avg[key]
    for key in ("n_estimators", "colsample_bytree", "max_depth", "learning_rate",
                "gamma", "min_child_weight", "reg_alpha", "reg_lambda",
                "subsample", "random_state")
}

# ---- XGBoost for a_avg, trained on the full feature set ----
xgbr_a_avg = XGBRegressor(**xgb_params_a_avg)
xgbr_a_avg.fit(new_X_train, y_train_a_avg)

train_pred_a_avg = xgbr_a_avg.predict(new_X_train)
pred_a_avg = xgbr_a_avg.predict(new_X_test)
# Training-set error from the notebook's rmsle helper (defined in another cell).
print(rmsle(y_train_a_avg, train_pred_a_avg))

# Hold-out metrics.
r2_a_avg = r2_score(y_test_a_avg, pred_a_avg)
mse_a_avg = mean_squared_error(y_test_a_avg, pred_a_avg)
mape_a_avg = mean_absolute_percentage_error(y_test_a_avg, pred_a_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
# Fix: the arguments were previously passed as (r2, mse, mape), which printed
# the MSE next to the "MAPE" label and vice versa. The label string
# (presumably the plot title -- confirm against plot_regression_results) now
# names the model that is actually plotted.
plot_regression_results(
    ax, y_test_a_avg, pred_a_avg,
    "XGBoost (a_avg, all features)",
    score_template.format(r2_a_avg, mape_a_avg, mse_a_avg))

# plot_features_importances returns the feature names that are used below as
# column labels of df_ok.
main_features = plot_features_importances(xgbr_a_avg.feature_importances_, x_label)

# ---- Refit using only the selected main features ----
X_main = df_ok.loc[:, main_features].values
X_main_train, X_main_test, y_train, y_test = train_test_split(X_main, y, test_size=0.1, random_state=2)
print(X_main_train.shape, X_main_test.shape, y_train.shape, y_test.shape)

# NOTE(review): y_train_a_avg / y_test_a_avg are NOT re-derived from the split
# above; they come from an earlier cell. This is only correct if that earlier
# split used the same rows, test_size and random_state -- confirm.

# Standardise with statistics estimated on the training rows only.
ss_main = StandardScaler().fit(X_main_train)
new_X_main_train = ss_main.transform(X_main_train)
new_X_main_test = ss_main.transform(X_main_test)

xgbr_a_avg = XGBRegressor(**xgb_params_a_avg)
xgbr_a_avg.fit(new_X_main_train, y_train_a_avg)

train_pred_a_avg = xgbr_a_avg.predict(new_X_main_train)
pred_a_avg = xgbr_a_avg.predict(new_X_main_test)
print(rmsle(y_train_a_avg, train_pred_a_avg))

r2_a_avg = r2_score(y_test_a_avg, pred_a_avg)
mse_a_avg = mean_squared_error(y_test_a_avg, pred_a_avg)
mape_a_avg = mean_absolute_percentage_error(y_test_a_avg, pred_a_avg)

plt.figure(figsize=(10, 8))
ax = plt.gca()
plot_regression_results(
    ax, y_test_a_avg, pred_a_avg,
    "XGBoost (a_avg, main features)",
    score_template.format(r2_a_avg, mape_a_avg, mse_a_avg))