In [5]:
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# Load the training data.
train_df = pd.read_csv('./data/iris_train.csv')

# Name of the label column; every other column is used as a feature.
y = 'Species'  # assumes the label column is named 'Species'
X_train = train_df.drop(columns=[y])
y_train = train_df[y]

# Load the test data and split it into features / labels the same way.
test_df = pd.read_csv('./data/iris_test.csv')
X_test = test_df.drop(columns=[y])
y_test = test_df[y]  # assumes the test file also contains the label column

# Configure the TPOT classifier search.
tpot = TPOTClassifier(
    generations=5,  # number of generations of the genetic algorithm
    population_size=10,  # number of individuals in each generation
    verbosity=2,  # how much progress output to print
    random_state=42,  # seed for reproducibility
    scoring='accuracy',  # metric used to rank candidate pipelines
    n_jobs=-1,  # use all available CPU cores
)

# Run the pipeline search on the training data.
tpot.fit(X_train, y_train)

# Show the best pipeline that was found.
print("Best model found by TPOT:")
print(tpot.fitted_pipeline_)

# Predict on the test set with the best pipeline.
predictions = tpot.predict(X_test)

# Accuracy on the test set.
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy on the test set: {accuracy:.4f}")

# Save the test rows together with their predictions.
# The output directory is created first: to_csv does not create missing
# parent directories, and failing here would throw away the finished search.
test_df['predicted_species'] = predictions
output_path = Path('./output/iris_test_predictions.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
test_df.to_csv(output_path, index=False)

# Print the first few true labels next to the predictions. The label column
# is referenced through `y` so it stays in sync with the definition above.
print("Predictions on the test set:")
print(test_df[[y, 'predicted_species']].head())

                                                                            
Generation 1 - Current best internal CV score: 0.95
                                                                            
Generation 2 - Current best internal CV score: 0.9583333333333334
                                                                            
Generation 3 - Current best internal CV score: 0.9583333333333334
                                                                            
Generation 4 - Current best internal CV score: 0.9583333333333334
                                                                            
Generation 5 - Current best internal CV score: 0.9583333333333334
                                                                            
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.8, min_samples_leaf=19, min_samples_split=9, n_estimators=100)
Best model found by TPOT:
Pipeline(steps=[('extratreesclas