In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
import matplotlib.pyplot as plt


from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

from sklearn.model_selection import GridSearchCV, KFold

# 设置并固定随机种子

In [2]:
# Single seed shared by NumPy's global RNG and every seeded estimator below.
random_state = 42
np.random.seed(random_state)

# 加载并清洗数据

In [3]:
# 1. Data preprocessing
# Read the training split from disk.
train_df = pd.read_csv('./data/iris_train.csv')

# Cleaning: discard any row that contains a missing value.
train_df_cleaned = train_df.dropna()

# The label lives in the 'Species' column; every other column is used as a feature.
y_train = train_df_cleaned['Species']
X_train = train_df_cleaned.drop(columns=['Species'])

# Standardize the features; the fitted scaler is reused later for the inference data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)


# 保存清洗后的训练数据到指定文件 

In [4]:

# Rebuild a frame from the scaled feature matrix (it gets a fresh 0..n-1 index).
scaled_features = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# y_train still carries the row labels left over after dropna(). pd.concat(axis=1)
# aligns on the index, so the labels are re-indexed positionally first; otherwise
# any dropped row would shift labels onto the wrong samples and add NaN rows.
labels = y_train.reset_index(drop=True).rename('species')

preprocessed_train_data = pd.concat([scaled_features, labels], axis=1)
preprocessed_train_data.to_csv('./output/cleaned_train_data.csv', index=False)


# 训练标签转换

In [5]:
# Encode the string species labels as integer class ids.
le = LabelEncoder().fit(y_train)
y_train_int = le.transform(y_train)

# Sanity check: decoding the ids must give back the original labels.
y_train_int_reverse = le.inverse_transform(y_train_int)
round_trip_ok = (y_train_int_reverse == y_train).all()
print(round_trip_ok)

True


# 数据打乱

In [6]:
# Shuffle features and labels together (optional); the fixed seed keeps the
# resulting order identical from run to run.
X_train_scaled_s, y_train_int_s = shuffle(
    X_train_scaled,
    y_train_int,
    random_state=random_state,
)

# 模型选择

In [24]:
# Candidate classifiers, each paired with the hyper-parameter grid to search.
# Grid keys carry the 'model__' prefix because every estimator is wrapped as the
# 'model' step of a Pipeline further down.
models_and_params = [
    {
        'name': 'Logistic Regression',
        'model': LogisticRegression(max_iter=200, random_state=random_state),
        # Only the L2 penalty is searched; the ['l1', 'l2'] variant is left disabled.
        'params': {'model__C': [0.1, 1, 10], 'model__penalty': ['l2']},
    },
    {
        'name': 'Decision Tree',
        'model': DecisionTreeClassifier(random_state=random_state),
        'params': {'model__max_depth': [5, 10, None], 'model__min_samples_split': [2, 5]},
    },
    {
        'name': 'Random Forest',
        'model': RandomForestClassifier(random_state=random_state),
        'params': {'model__n_estimators': [100, 200], 'model__max_depth': [5, 10, None]},
    },
    {
        'name': 'SVM',
        'model': SVC(random_state=random_state),
        'params': {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']},
    },
    {
        'name': 'K-Nearest Neighbors',
        'model': KNeighborsClassifier(),
        'params': {'model__n_neighbors': [3, 5, 7], 'model__weights': ['uniform', 'distance']},
    },
    {
        'name': 'Naive Bayes',
        'model': GaussianNB(),
        # Empty grid: Naive Bayes is evaluated with its default settings only.
        'params': {},
    },
    {
        'name': 'Gradient Boosting',
        'model': GradientBoostingClassifier(random_state=random_state),
        'params': {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]},
    },
    {
        'name': 'XGBoost',
        'model': XGBClassifier(eval_metric='mlogloss', random_state=random_state),
        'params': {'model__n_estimators': [100, 200], 'model__learning_rate': [0.01, 0.1], 'model__max_depth': [3, 5]},
    },
    {
        'name': 'MLP',
        'model': MLPClassifier(max_iter=1000, random_state=random_state),
        'params': {'model__hidden_layer_sizes': [(50,), (100,)], 'model__alpha': [0.0001, 0.001]},
    },
]

# 10-fold cross-validation with shuffling. The split is seeded from the shared
# `random_state` constant (instead of a second hard-coded 42) so that changing
# the seed in one place affects the whole experiment consistently.
kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

# One summary record per candidate: its best grid point, CV score and fitted pipeline.
best_models = []

for model_info in models_and_params:
    # Wrap the estimator in a pipeline; the scaler step is fitted as part of
    # the pipeline each time the grid search fits it.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model_info['model']),
    ])

    # Exhaustive search over this candidate's grid, scored by accuracy.
    grid_search = GridSearchCV(pipeline, model_info['params'], cv=kf, scoring='accuracy')
    grid_search.fit(X_train_scaled_s, y_train_int_s)

    best_models.append({
        'name': model_info['name'],
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'best_estimator': grid_search.best_estimator_,
    })

# Rank the candidates from highest to lowest mean CV accuracy.
best_models_sorted = sorted(best_models, key=lambda entry: entry['best_score'], reverse=True)

# Report every candidate's best hyper-parameters and score.
for model_info in best_models_sorted:
    print(f"Model: {model_info['name']}")
    print(f"Best Parameters: {model_info['best_params']}")
    print(f"Best Score: {model_info['best_score']:.4f}")
    print("-" * 40)

# The first entry of the ranking is the overall winner.
best_model_info = best_models_sorted[0]
print(f"\nThe best model is {best_model_info['name']} with an average accuracy of {best_model_info['best_score']:.4f}")
print(f"Best Parameters: {best_model_info['best_params']}")

Model: SVM
Best Parameters: {'model__C': 10, 'model__kernel': 'linear'}
Best Score: 0.9750
----------------------------------------
Model: MLP
Best Parameters: {'model__alpha': 0.0001, 'model__hidden_layer_sizes': (100,)}
Best Score: 0.9667
----------------------------------------
Model: K-Nearest Neighbors
Best Parameters: {'model__n_neighbors': 3, 'model__weights': 'distance'}
Best Score: 0.9583
----------------------------------------
Model: Logistic Regression
Best Parameters: {'model__C': 10, 'model__penalty': 'l2'}
Best Score: 0.9583
----------------------------------------
Model: Naive Bayes
Best Parameters: {}
Best Score: 0.9500
----------------------------------------
Model: XGBoost
Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}
Best Score: 0.9500
----------------------------------------
Model: Gradient Boosting
Best Parameters: {'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 100}
Best Score: 0.95

# 某次“模型选择”输出的结果

In [25]:
# Scores copied from one earlier run of the model-selection cell, kept for
# reference only; the ranking may differ from the output of other runs.
# MLP: 0.9750
# SVM: 0.9667
# Logistic Regression: 0.9583
# Naive Bayes: 0.9500
# Gradient Boosting: 0.9500
# Random Forest: 0.9417
# XGBoost: 0.9417
# Decision Tree: 0.9333
# K-Nearest Neighbors: 0.9000

# The best model is MLP with an average accuracy of 0.9750

# 模型训练
> 数据分类在数据挖掘中是一种常见场景，您可以尝试使用不同的建模方式（决策树、随机森林、SVC 等）。

In [37]:
# 2. Modelling and training
# Use the top-ranked pipeline from the grid search instead of a hand-picked model.
best_model = best_model_info['best_estimator']

# Fit it on the shuffled, standardized features and the integer-encoded labels.
best_model.fit(X_train_scaled_s, y_train_int_s)

In [38]:
# Accuracy of the fitted model on the data it was trained on.
train_predictions = best_model.predict(X_train_scaled_s)
train_accuracy = accuracy_score(y_train_int_s, train_predictions)
print(f"训练集准确率: {train_accuracy:.2f}")

训练集准确率: 0.98


In [39]:
# 3. Inference
# Load the data to predict on (read from iris_test.csv).
val_df = pd.read_csv('./data/iris_test.csv')

# Cleaning: drop rows with missing values, as was done for the training data.
val_df_cleaned = val_df.dropna()

# Use every feature column. The 'Species' label is removed when it is present;
# errors='ignore' keeps this cell working on inference files that ship without
# a label column instead of raising KeyError.
X_val = val_df_cleaned.drop(columns=['Species'], errors='ignore')

# Apply the scaler that was fitted on the training features (no re-fitting).
X_val_scaled = scaler.transform(X_val)

# Predict. The model was trained on integer-encoded labels, so y_pred holds
# encoded class ids rather than species names.
y_pred = best_model.predict(X_val_scaled)

# 保存推理结果到指定目录(确保文件名，字段名正确)

In [40]:
# Save the predictions.
# best_model was trained on LabelEncoder codes, so its raw predictions are
# integers; decode them back to the original species names so that the
# 'Predicted_Species' column holds labels rather than encoder ids.
predicted_species = le.inverse_transform(y_pred)
results = pd.DataFrame({'Sample_ID': range(len(y_pred)), 'Predicted_Species': predicted_species})
results.to_csv('./output/test_data_predictions.csv', index=False)

# 试试检查您训练的结果
> 真实竞赛中会根据模型准确率评分，所以提供的推理数据不会包含标签列；您在竞赛中不需要完成下面的验证步骤。

In [41]:
# Side-by-side view of true labels and predicted class ids. It is built as a new
# frame so the cleaned input frame is not mutated (this avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run).
val_comparison = val_df_cleaned[['Species']].assign(pred=y_pred)

# Ground-truth labels of the evaluation samples.
y_true = val_df_cleaned['Species']

# NOTE: the model predicts integer class ids, so the string labels are encoded
# with the same fitted LabelEncoder before they are compared.
y_true_int = le.transform(y_true)

# Evaluate: overall accuracy plus the per-class precision/recall report.
accuracy = accuracy_score(y_true_int, y_pred)
print(f"模型准确率: {accuracy:.2f}")
print("分类报告:")
print(classification_report(y_true_int, y_pred))

模型准确率: 0.93
分类报告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.85      1.00      0.92        11
           2       1.00      0.78      0.88         9

    accuracy                           0.93        30
   macro avg       0.95      0.93      0.93        30
weighted avg       0.94      0.93      0.93        30



# <font color="red">最后您需要将这个notebook保存到output目录(可以通过左边的文件管理栏,或文件菜单中的另存为)，供评分使用</font>