# ***模型融合与参数寻优***

In [1]:
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, VotingRegressor

In [2]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

## 模型融合

### **分类模型融合——基于Voting**

In [3]:
# Load the iris dataset from the Excel workbook. The path uses a forward
# slash so it works on Windows, Linux and macOS alike (a backslash is only a
# path separator on Windows).
iris = pd.read_excel("data/iris.xlsx")
# Features are every column except "class"; the target is the "class" column.
X = iris.drop("class", axis=1)
y = iris['class']

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Base classifiers. SVC is created with probability=True so that it exposes
# class probabilities, which soft voting needs. The randomised estimators
# (SVC's probability calibration and the decision tree) get a fixed seed so
# that a fresh Restart & Run All reproduces the same scores.
svc = SVC(kernel='linear', probability=True, random_state=42)
dtc = DecisionTreeClassifier(random_state=42)
gnb = GaussianNB()
knn = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression()

# Soft-voting ensemble: the predicted class is the one with the highest
# average predicted probability across the base classifiers.
eclf = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('dtc', dtc),
        ('gnb', gnb),
        ('knn', knn),
        ('lr', lr),
    ],
    voting='soft',
)

# Fit each base model and then the ensemble, keep every model's test-set
# predictions, and print its accuracy (one line per model, in list order).
predictions = []
for model in [svc, dtc, gnb, knn, lr, eclf]:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    predictions.append(y_pred)
    print(accuracy_score(y_test, y_pred))

1.0
1.0
0.9777777777777777
1.0
1.0
1.0


### **回归模型融合——基于Voting**

In [5]:
# Load the Boston house-price dataset from the Excel workbook. The path uses a
# forward slash so it works on Windows, Linux and macOS alike (a backslash is
# only a path separator on Windows).
boston = pd.read_excel("data/boston_house_prices.xlsx")
# Features are every column except "MEDV"; the regression target is "MEDV".
X = boston.drop("MEDV", axis=1)
y = boston['MEDV']

# Hold out 30% of the rows as the test set; the fixed seed keeps the split reproducible.
# NOTE: this rebinds X / y / X_train / ... that previously held the iris data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# Base regressors (not classifiers). Note that `dtc` holds a
# DecisionTreeRegressor here despite its classifier-style name. The decision
# tree gets a fixed seed so that re-running the cell reproduces the same
# errors.
svm = SVR()
dtc = DecisionTreeRegressor(random_state=42)
knn = KNeighborsRegressor(n_neighbors=3)
lr = LinearRegression()

# VotingRegressor averages the predictions of the base regressors; unlike
# VotingClassifier it has no hard/soft `voting` option.
ereg = VotingRegressor(
    estimators=[
        ('svm', svm),
        ('dtc', dtc),
        ('knn', knn),
        ('lr', lr),
    ]
)

# Fit each base model and then the ensemble, keep every model's test-set
# predictions, and print its mean squared error (one line per model, in list
# order; lower is better).
predictions = []
for model in [svm, dtc, knn, lr, ereg]:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    predictions.append(y_pred)
    print(mean_squared_error(y_test, y_pred))

53.50370874750568
11.81013157894737
28.149334795321632
21.51744423117709
15.322712769780267


## 参数寻优：基于GridSearchCV


In [7]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Keep the learning rate fairly high so that each fit takes less time.
# A regressor is tuned here; a classifier would be tuned the same way.
xgb_reg = xgb.XGBRegressor(learning_rate=0.1)

# Candidate values per hyper-parameter: 3 values for each of 6 parameters,
# i.e. 3**6 = 729 combinations, each evaluated with 5-fold cross-validation.
param1 = dict(
    max_depth=[4, 7, 10],
    min_child_weight=[1, 3, 5],
    n_estimators=[10, 50, 100],
    gamma=[0.1, 0.3, 0.5],
    reg_alpha=[0.05, 0.1, 1],
    reg_lambda=[0.05, 0.1, 1],
)

# Exhaustive search over the grid, using every available CPU core.
grid_search1 = GridSearchCV(estimator=xgb_reg, param_grid=param1, cv=5, n_jobs=-1)
grid_search1.fit(X_train, y_train)

# Display the best model found together with its mean cross-validated score.
(grid_search1.best_estimator_, grid_search1.best_score_)

(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.5, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 0.8480527423497846)

In [9]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full X_train / y_train, so it can predict on the test set directly
# without being fitted a second time.
xgb_reg_PLUS = grid_search1.best_estimator_
y_pred = xgb_reg_PLUS.predict(X_test)

In [None]:
import lightgbm as lgb

# Tune a LightGBM regressor with an exhaustive 5-fold grid search.
lgb_reg = lgb.LGBMRegressor(learning_rate=0.1)

# Candidate values per hyper-parameter: 3 * 3 * 3 * 3 * 2 = 162 combinations.
param2 = {
        "max_depth":[4, 7, 10],
        "num_leaves":[300, 600, 900],
        "n_estimators":[10, 70, 130],
        'min_child_samples': [18, 20, 22],
        'min_child_weight':[0.001, 0.002]
        }

grid_search2 = GridSearchCV(lgb_reg, n_jobs=-1, param_grid=param2, cv=5)
grid_search2.fit(X_train, y_train)

# Display the best LightGBM model together with its own mean cross-validated
# score, taken from grid_search2 (not from the XGBoost search, grid_search1).
grid_search2.best_estimator_, grid_search2.best_score_

In [None]:
# GridSearchCV (refit=True by default) has already refit the best estimator on
# the full X_train / y_train, so it can predict on the test set directly.
# Uses the X_test split that grid_search2 was tuned against; no X1_* / y1_*
# variables are defined in this notebook.
lgb_reg_PLUS = grid_search2.best_estimator_
y1_pred = lgb_reg_PLUS.predict(X_test)