In [1]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rng = np.random.RandomState(2020)

## 加载数据

In [2]:
iris_data = pd.read_csv("./Dataset/iris.csv")
X = iris_data.drop('class', axis = 1)
y = iris_data['class']

## 类别标签编码

In [3]:
from sklearn.preprocessing import LabelEncoder
class_encoding = LabelEncoder()
y_numeric = class_encoding.fit_transform(y)

## 划分训练集测试集

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_numeric,
                                                    test_size=0.2, random_state=rng)

## 特征标准化

In [5]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

## 模型评估

In [6]:
from sklearn.metrics import accuracy_score
def print_performance(real, pred):
    print('误分样本数（Misclassified samples）: {}'.format((real != pred).sum()))
    print('准确率(accuracy): {:.2f}%'.format(accuracy_score(real, pred)*100))

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

## 近邻法

In [8]:
clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print_performance(y_test, y_test_pred)

误分样本数（Misclassified samples）: 3
准确率(accuracy): 90.00%


## 决策树

In [9]:
clf = DecisionTreeClassifier(criterion='gini', 
                              max_depth=4, 
                              random_state=1)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print_performance(y_test, y_test_pred)

误分样本数（Misclassified samples）: 4
准确率(accuracy): 86.67%


## 随机森林

In [10]:
clf = RandomForestClassifier(criterion='gini',
                                n_estimators=500, 
                                random_state=rng)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print_performance(y_test, y_test_pred)

误分样本数（Misclassified samples）: 5
准确率(accuracy): 83.33%


## 朴素贝叶斯

In [11]:
clf = GaussianNB()
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print_performance(y_test, y_test_pred)

误分样本数（Misclassified samples）: 5
准确率(accuracy): 83.33%


## 支持向量机

In [12]:
model = SVC(kernel='linear',
            C=1.0,
            random_state=rng)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
print_performance(y_test, y_test_pred)

误分样本数（Misclassified samples）: 3
准确率(accuracy): 90.00%
