# 中等项目2：超参数调优系统

## 学习目标
- 理解不同超参数搜索方法的原理
- 使用网格搜索进行超参数调优
- 使用随机搜索进行超参数调优
- 使用贝叶斯优化进行超参数调优
- 分析超参数对模型性能的影响


In [None]:
# 导入必要的库
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris
from scipy.stats import randint, uniform
try:
    import optuna
    OPTUNA_AVAILABLE = True
except ImportError:
    OPTUNA_AVAILABLE = False
    print("警告: optuna未安装，贝叶斯优化功能将不可用。安装命令: pip install optuna")

# Plot style, CJK font and random-seed setup.
# Apply the seaborn style FIRST: sns.set_style() updates matplotlib rcParams,
# including 'font.sans-serif', so a font configured before this call would be
# overwritten and the Chinese labels would render as empty boxes.
sns.set_style("whitegrid")
# SimHei is usually only installed on Windows; the remaining entries are
# common CJK-capable fonts on other platforms, tried in order as fallbacks.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Heiti TC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
    'DejaVu Sans',
]
# Render the minus sign with an ASCII hyphen instead of the Unicode minus
# glyph, which many CJK fonts do not provide.
plt.rcParams['axes.unicode_minus'] = False
np.random.seed(42)

print("环境准备完成！")


## 1. 数据准备


In [None]:
# Load the Iris dataset and pull out the feature matrix and label vector
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as a test set, stratified by class label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the split sizes and dataset dimensions
print("\n".join([
    "=" * 60,
    "数据信息",
    "=" * 60,
    f"训练集样本数: {X_train.shape[0]}",
    f"测试集样本数: {X_test.shape[0]}",
    f"特征数: {X.shape[1]}",
    f"类别数: {np.unique(y).size}",
]))


## 2. 网格搜索（Grid Search）

网格搜索会遍历参数网格中的所有参数组合，并通过交叉验证选出得分最高的一组参数。


In [None]:
# Candidate values per hyperparameter (3 * 4 * 3 = 36 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, None],
    min_samples_split=[2, 5, 10],
)

# Base estimator handed to the search
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation
print("\n".join(["=" * 60, "开始网格搜索...", "=" * 60]))
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and how many were evaluated
n_grid_candidates = len(grid_search.cv_results_['params'])
print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
print(f"总参数组合数: {n_grid_candidates}")


## 3. 随机搜索（Random Search）

随机搜索从参数分布中随机采样，通常比网格搜索更高效。


In [None]:
# Parameter distributions to sample from.
# scipy.stats.randint(low, high) draws integers from [low, high), i.e. the
# upper bound is EXCLUDED. The bounds are therefore written as max + 1 so that
# n_estimators can reach 200 and min_samples_split can reach 10.
param_dist = {
    'n_estimators': randint(50, 201),
    'max_depth': [3, 5, 7, None],
    'min_samples_split': randint(2, 11)
}

# Randomized search: 20 sampled candidates, each scored with 5-fold CV
print("=" * 60)
print("开始随机搜索...")
print("=" * 60)
random_search = RandomizedSearchCV(rf, param_dist, n_iter=20, cv=5, 
                                   scoring='accuracy', random_state=42, 
                                   n_jobs=-1, verbose=1)
random_search.fit(X_train, y_train)

# Report the best sampled candidate and how many were tried
print(f"\n最佳参数: {random_search.best_params_}")
print(f"最佳交叉验证得分: {random_search.best_score_:.4f}")
print(f"尝试的参数组合数: {len(random_search.cv_results_['params'])}")


## 4. 贝叶斯优化（Optuna）

贝叶斯优化（此处使用 Optuna，其默认采样器为 TPE）利用历史试验结果来指导下一步的参数选择，通常能比随机搜索用更少的试验次数找到较优的参数。


In [None]:
if OPTUNA_AVAILABLE:
    def objective(trial):
        """Score one Optuna trial.

        Samples `n_estimators`, `max_depth` and `min_samples_split` from the
        trial, builds a RandomForestClassifier with those values (and a fixed
        random_state), and returns its mean 5-fold cross-validation accuracy
        on the training set.

        Args:
            trial: the Optuna trial used to suggest hyperparameter values.

        Returns:
            Mean cross-validation accuracy; the study maximizes this value.
        """
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'random_state': 42
        }

        model = RandomForestClassifier(**params)
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        return scores.mean()

    # Run the optimization
    print("=" * 60)
    print("开始贝叶斯优化...")
    print("=" * 60)
    # Seed the sampler explicitly: Optuna's sampler keeps its own random
    # state, so without a seed the suggested parameters (and therefore the
    # reported best result) change from run to run.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20, show_progress_bar=True)

    print(f"\n最佳参数: {study.best_params}")
    print(f"最佳得分: {study.best_value:.4f}")
    print(f"尝试的参数组合数: {len(study.trials)}")
else:
    # Keep `study` defined so later cells can test for it
    print("跳过贝叶斯优化（optuna未安装）")
    study = None


## 5. 结果对比和可视化

比较不同搜索方法找到的最佳模型在测试集上的性能。


In [None]:
# Best model found by each search method, plus its cross-validation score.
# The scores live in their own dict keyed by the same names, so they never
# have to be re-derived from the display name with nested conditionals.
# The search objects refit their best estimator on the full training set
# (refit defaults to True), so these two models are already fitted.
best_models = {
    '网格搜索': grid_search.best_estimator_,
    '随机搜索': random_search.best_estimator_
}
best_cv_scores = {
    '网格搜索': grid_search.best_score_,
    '随机搜索': random_search.best_score_
}

if OPTUNA_AVAILABLE and study is not None:
    # Optuna only returns parameters, so this model has to be built and
    # fitted here.
    best_models['贝叶斯优化'] = RandomForestClassifier(
        **study.best_params, random_state=42
    ).fit(X_train, y_train)
    best_cv_scores['贝叶斯优化'] = study.best_value

# Evaluate every best model on the held-out test set
results = {}
print("=" * 60)
print("测试集评估结果")
print("=" * 60)

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        'accuracy': accuracy,
        'cv_score': best_cv_scores[name]
    }
    print(f"{name}:")
    print(f"  交叉验证得分: {results[name]['cv_score']:.4f}")
    print(f"  测试集准确率: {accuracy:.4f}")

# Side-by-side bar charts: CV score (left) and test accuracy (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax1, ax2 = axes

names = list(results.keys())
cv_scores = [results[name]['cv_score'] for name in names]
test_accuracies = [results[name]['accuracy'] for name in names]
bar_colors = ['skyblue', 'lightgreen', 'lightcoral'][:len(names)]

# Both panels share the same layout; only the data and labels differ
panels = [
    (ax1, cv_scores, '交叉验证得分', '交叉验证得分对比'),
    (ax2, test_accuracies, '测试集准确率', '测试集准确率对比'),
]
for ax, values, ylabel, title in panels:
    ax.bar(names, values, alpha=0.7, color=bar_colors)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(True, axis='y', alpha=0.3)
    # Print each value just above its bar
    for i, value in enumerate(values):
        ax.text(i, value + 0.01, f'{value:.4f}', ha='center', va='bottom')

plt.tight_layout()
# Save before show(): the figure may be cleared once it has been displayed
plt.savefig('hyperparameter_tuning_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n" + "=" * 60)
print("总结")
print("=" * 60)
print("1. 网格搜索：遍历所有参数组合，最全面但最耗时")
print("2. 随机搜索：随机采样参数，通常比网格搜索更高效")
print("3. 贝叶斯优化：使用历史结果指导搜索，通常最高效")
print("\n根据数据规模和计算资源选择合适的搜索方法！")
