In [1]:
  # 导包
  import numpy as np
  import pandas as pd
  import matplotlib.pyplot as plt

  from IPython.core.interactiveshell import InteractiveShell # 这个对象设置所有行全部输出

  # Setting ast_node_interactivity to "all" makes the notebook show the result
  # of every expression statement in a cell, not only the last one.
  InteractiveShell.ast_node_interactivity = "all"

  # Matplotlib text rendering overrides:
  #   axes.unicode_minus -> False : avoids garbled minus signs on axis ticks
  #   font.sans-serif    -> SimHei: avoids garbled Chinese characters in plots
  plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.sans-serif': ['Simhei'],
  })

  # Plot with the ggplot style sheet.
  plt.style.use('ggplot')



In [2]:
# Load the wine dataset that ships with scikit-learn.
from sklearn.datasets import load_wine

# The loader does not return a table directly: it returns a dict-like
# sklearn.utils._bunch.Bunch object, and the pieces of the dataset are read
# from it by key / attribute (data, target, feature_names).
wine_data = load_wine()

# Labels as an unnamed Series so they can be concatenated as a column.
target = pd.Series(wine_data.target)

# Column labels for the combined table: every feature name, then 'target'.
feature_names = np.array([*wine_data.feature_names, 'target'])

# Feature table with the label column appended on the right; the label column
# has no name after the concat, so the full set of column labels is assigned
# afterwards.
data = pd.concat(
  [pd.DataFrame(wine_data.data, columns=wine_data.feature_names), target],
  axis=1,
)
data.columns = feature_names

data


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [6]:
# Tools used in this cell: the train/test split helper and the KNN classifier.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Features and labels taken straight from the loaded dataset object.
x, y = wine_data.data, wine_data.target

# Split the original data by proportion: 20% is held out as the test set and
# random_state fixes the split. The call returns, in order: training features,
# test features, training labels, test labels.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
  x, y, test_size=0.2, random_state=56
)

# Build a KNN classifier with 3 neighbours and train it on the training split.
# Kept as an assignment (fit() hands back the estimator) so that no repr is
# displayed for this statement.
# NOTE(review): the features go in unscaled, while the data preview shows
# columns on very different scales (e.g. proline vs. hue) - confirm whether
# skipping standardisation here is intentional.
clf = KNeighborsClassifier(n_neighbors=3).fit(Xtrain, Ytrain)

# Inference: predicted labels for the test split.
y_pred = clf.predict(Xtest)

# Evaluation: score of the trained model on the test split.
clf.score(Xtest, Ytest)

# predict_proba: the predicted class-probability distribution for each test
# sample, one column per class.
pd.DataFrame(clf.predict_proba(Xtest))

# Number of samples per label class in the whole dataset.
pd.DataFrame(y).value_counts()

0.8611111111111112

Unnamed: 0,0,1,2
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.333333,0.333333,0.333333
4,0.0,1.0,0.0
5,1.0,0.0,0.0
6,0.0,1.0,0.0
7,0.0,0.0,1.0
8,0.666667,0.0,0.333333
9,1.0,0.0,0.0


1    71
0    59
2    48
Name: count, dtype: int64