In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the iris dataset and hold out 30% of the samples as a test split.
# The fixed random_state keeps the split identical across runs.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris["data"],
    iris["target"],
    test_size=0.3,
    random_state=42,
)

In [4]:
# Core purpose of Pipeline: wrap the whole workflow in one object, avoid data
# leakage, and expose a single interface for fitting and tuning
# (similar to a rendering pipeline?).


def _scaled(estimator):
    """Return a Pipeline that standardizes the features and then applies `estimator`."""
    return Pipeline(steps=[('scaler', StandardScaler()), ('classifier', estimator)])


# One Pipeline per model, keyed by model name.
pipelines = {
    'KNN': _scaled(KNeighborsClassifier()),
    'LogisticRegression': _scaled(LogisticRegression()),
    'SVM': _scaled(SVC()),
    # A decision tree does not need standardized inputs, so it has no scaler step.
    'DecisionTree': Pipeline(steps=[('classifier', DecisionTreeClassifier(random_state=42))]),
}

# Train every pipeline on the training split and report its accuracy on the
# held-out test split.
for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Prefix the score with the model name; without it the printed lines
    # cannot be matched to the pipelines they came from.
    print(f"{name} accuracy_score: {acc:.4f}")

accuracy_score: 1.0000
accuracy_score: 1.0000
accuracy_score: 1.0000
accuracy_score: 1.0000


使用 Pipeline 和 GridSearchCV 进行参数网格搜索，结合随机森林算法构建模型，对鸢尾花类别进行预测，并得出最优模型的 Accuracy 评估指标。

In [6]:
# Random-forest pipeline with a single 'classifier' step (seeded for repeatable fits).
rf_pipeline = Pipeline(steps=[('classifier', RandomForestClassifier(random_state=42))])

# Search space. The 'classifier__' prefix addresses parameters of the
# pipeline step named 'classifier'.
params = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 5, 10, 15],
    classifier__min_samples_split=[2, 5, 10],
)

# Grid search over every combination, scored by accuracy with 5-fold
# cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Report the winning parameter combination and its score from the search
# (computed on the training split, not on the test set).
print("最优参数组合:", grid_search.best_params_)
print("最优模型准确率:", grid_search.best_score_)

最优参数组合: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
最优模型准确率: 0.9428571428571428


In [7]:
# Evaluate the best estimator found by the grid search on the held-out test split.
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(X_test)
test_acc = accuracy_score(y_test, test_predictions)
print("测试集准确率:", test_acc)

测试集准确率: 1.0
