In [22]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale
# Column layout of the raw file: the target letter followed by 16 numeric features.
names = [
    'letter',
    'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
    'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx',
]

# The names are supplied explicitly via `names`, so every row of the file is read as data.
dataset = pd.read_csv("letter-recognition.data", names=names)

# Separate the target column from the feature columns.
y = dataset['letter']
X = dataset.drop(columns="letter")

# Standardize each feature column.
# NOTE(review): the scaling statistics are computed on the full dataset before the
# split, so the held-out rows influence the transform — confirm this is acceptable.
X_scaled = scale(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=101,
)

In [41]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Define the three base classifiers.
# random_state pins the estimators' internal randomness (SVC's probability
# calibration, the tree's feature permutation) so re-running the cell is repeatable;
# 101 is the same seed used for the train/test split.
svm_clf = SVC(C=1000, gamma=0.01, kernel="rbf", probability=True, random_state=101)
dt_clf = DecisionTreeClassifier(max_depth=27, random_state=101)
# The `model` alias is kept from the original cell in case other cells refer to it.
knn_clf = model = KNeighborsClassifier(n_neighbors=1)

# Fit the three classifiers on the training split.
svm_clf.fit(X_train, y_train)
dt_clf.fit(X_train, y_train)
knn_clf.fit(X_train, y_train)

# Hard (label) predictions on the test split.
svm_pred = svm_clf.predict(X_test)
dt_pred = dt_clf.predict(X_test)
knn_pred = knn_clf.predict(X_test)

# Per-class probability estimates on the test split.
svm_prob = svm_clf.predict_proba(X_test)
dt_prob = dt_clf.predict_proba(X_test)
knn_prob = knn_clf.predict_proba(X_test)

# The columns of predict_proba follow each estimator's `classes_`. The weighted sum
# below is only meaningful if all three estimators use the same column ordering.
assert np.array_equal(svm_clf.classes_, dt_clf.classes_)
assert np.array_equal(svm_clf.classes_, knn_clf.classes_)
class_labels = svm_clf.classes_

# Weights used only when all three classifiers disagree: SVM, decision tree, KNN.
# KNN currently contributes nothing to the fallback (weight 0).
weights = [0.7, 0.3, 0]

# Ensemble prediction: 2-of-3 majority vote, with a weighted-probability fallback.
final_pred = []
for i in range(len(X_test)):
    preds = [svm_pred[i], dt_pred[i], knn_pred[i]]
    if preds[0] == preds[1] or preds[0] == preds[2]:
        # The SVM agrees with at least one other classifier.
        final_pred.append(preds[0])
    elif preds[1] == preds[2]:
        # The tree and KNN agree and outvote the SVM.
        final_pred.append(preds[1])
    else:
        # No two classifiers agree: combine the probability estimates.
        final_prob = (weights[0] * svm_prob[i] + weights[1] * dt_prob[i] + weights[2] * knn_prob[i])
        # argmax yields a column index; map it back to the class label so the
        # prediction is comparable with y_test (a bare index never equals a letter).
        final_pred.append(class_labels[np.argmax(final_prob)])

# Accuracy of the ensemble on the test split.
accuracy = accuracy_score(y_test, final_pred)
print(f'Ensemble model accuracy: {accuracy:.5f}')

In [42]:
from sklearn.model_selection import GridSearchCV

# Imports used by the ensemble below (Pipeline is imported but not used in this cell).
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

# Unconfigured base classifiers; their hyper-parameters come from the grid below.
# probability=True is required on the SVC so it can take part in soft voting.
svm_clf = SVC(probability=True)
dt_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

# Soft-voting ensemble: the names given here are the prefixes used in the grid keys.
voting_clf = VotingClassifier(
    estimators=[('svm', svm_clf), ('dt', dt_clf), ('knn', knn_clf)],
    voting='soft',
)

# Parameter grid, addressed as "<estimator name>__<parameter>".
# The SVM settings are fixed to a single value; only the tree depth and the
# number of neighbours vary, giving 3 x 4 = 12 candidate combinations.
# NOTE(review): gamma is not set here, unlike the gamma=0.01 used for the
# hand-built ensemble earlier in the notebook — confirm the default is intended.
param_grid = {
    'svm__C': [1000],
    'svm__kernel': ['rbf'],
    'dt__max_depth': [10, 20, 30],
    'knn__n_neighbors': [1, 3, 5, 7],
}

# Grid search with GridSearchCV's default settings, then predict on the test split.
grid_search = GridSearchCV(estimator=voting_clf, param_grid=param_grid)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)


In [43]:
# Test-split accuracy of the predictions produced by the grid search.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Ensemble model accuracy: {accuracy:.5f}')

In [45]:
# Tabulate the grid search's cross-validation results and display them.
cv_results = pd.DataFrame(data=grid_search.cv_results_)
cv_results

In [46]:

# Accuracy of the grid-search predictions on the held-out test split.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the winning parameter combination and its cross-validation score,
# followed by the test-split accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
print("Test Set Accuracy:", test_accuracy)