In [10]:
from __future__ import print_function
import sys
import os
import math
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import datasets
%matplotlib inline

# Load the Iris dataset once at module level; main() reads iris.data and iris.target.
iris = datasets.load_iris()

def shuffle_data(X, y, seed=None):
    """Shuffle X and y along their first axis with one shared permutation.

    Parameters
    ----------
    X : numpy.ndarray
        Samples, indexed along the first axis.
    y : numpy.ndarray
        Labels, indexed along the first axis in the same order as X.
    seed : int or None
        When not None, NumPy's global random generator is seeded with this
        value first, making the permutation reproducible. 0 is a valid seed.

    Returns
    -------
    tuple
        ``(X[idx], y[idx])`` where ``idx`` is a random permutation of
        ``0 .. X.shape[0] - 1``; rows of X and y stay paired.
    """
    # Compare against None explicitly: a truthiness test would silently
    # ignore seed=0 and make that call non-reproducible.
    if seed is not None:
        np.random.seed(seed)
    # One index per sample (first axis of X).
    idx = np.arange(X.shape[0])
    # Shuffle the indices in place, then apply the same order to X and y.
    np.random.shuffle(idx)
    return X[idx], y[idx]



# 正规化数据集 X
def normalize(X, axis=-1, p=2):
    """Divide X by its Lp norm taken along `axis`.

    Parameters
    ----------
    X : numpy.ndarray
        Data to scale.
    axis : int
        Axis along which the norm is computed (default: last axis).
    p : int or float
        Order of the norm passed to ``np.linalg.norm`` (default: 2).

    Returns
    -------
    numpy.ndarray
        ``X`` divided by the norms, with the norms re-expanded along `axis`
        so they broadcast against X. Vectors whose norm is 0 are divided
        by 1 instead, so they are returned unchanged.
    """
    norms = np.atleast_1d(np.linalg.norm(X, ord=p, axis=axis))
    # Guard against division by zero: an all-zero vector keeps its values.
    zero_norm = norms == 0
    norms[zero_norm] = 1
    # Restore the reduced axis so the division broadcasts.
    divisor = np.expand_dims(norms, axis)
    return X / divisor


# 标准化数据集 X
def standardize(X):
    """Standardize each column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array; statistics are computed per column (``axis=0``).

    Returns
    -------
    numpy.ndarray
        New float array with the same shape as X holding
        ``(X - mean) / std`` column by column. Columns whose standard
        deviation is 0 are left as zeros to avoid dividing by zero.
        X itself is not modified.
    """
    # Per-column statistics (axis=0 reduces over the rows).
    mean = X.mean(axis=0)
    std = X.std(axis=0)

    # Output buffer; constant columns simply keep these zeros.
    X_std = np.zeros(X.shape)
    # Only divide where the standard deviation is non-zero.
    varying = std != 0
    # Read from X (the input), not from the zero-filled output buffer.
    X_std[:, varying] = (X[:, varying] - mean[varying]) / std[varying]

    return X_std


# 划分数据集为训练集和测试集
def train_test_split(X, y, test_size=0.2, shuffle=True, seed=None):
    """Split X and y into a leading training part and a trailing test part.

    Parameters
    ----------
    X, y : numpy.ndarray
        Samples and labels, indexed along the first axis.
    test_size : float
        Fraction of the samples assigned to the test part.
    shuffle : bool
        When true, X and y are shuffled with ``shuffle_data`` before the split.
    seed : int or None
        Forwarded to ``shuffle_data``.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    if shuffle:
        X, y = shuffle_data(X, y, seed)

    # Number of training samples, truncated towards zero.
    split_at = int(X.shape[0] * (1 - test_size))
    head, tail = slice(None, split_at), slice(split_at, None)

    return X[head], X[tail], y[head], y[tail]

def accuracy(y, y_pred):
    """Return the number of matching entries of y and y_pred divided by len(y).

    Both arrays are first reshaped to ``(n_samples, -1)`` so that a flat
    label vector and a column vector compare element by element.
    """
    truth = np.reshape(y, (y.shape[0], -1))
    guess = np.reshape(y_pred, (y_pred.shape[0], -1))
    n_matches = np.sum(truth == guess)
    return n_matches / len(truth)


class KNN():
    """K-nearest-neighbours classifier.

    Parameters:
    -----------
    k: int
        Number of nearest neighbours that vote on each prediction.
    """
    def __init__(self, k=5):
        self.k = k

    def euclidean_distance(self, one_sample, X_train):
        """Return the squared Euclidean distance from one sample to each training sample.

        `one_sample` is flattened to a single row and `X_train` to
        ``(n_samples, -1)``; the result is a 1-D array with one squared
        distance per training sample. The square root is not taken.
        """
        one_sample = one_sample.reshape(1, -1)
        X_train = X_train.reshape(X_train.shape[0], -1)
        # Broadcasting stretches the single row across all training rows,
        # so no explicit np.tile copy is needed.
        return np.power(X_train - one_sample, 2).sum(axis=1)

    def get_k_neighbor_labels(self, distances, y_train, k):
        """Return the labels of the k training samples with the smallest distances.

        Exactly ``min(k, len(distances))`` labels are returned as a 1-D
        array, ordered from nearest to farthest. Selecting by sorted index
        (rather than by matching distance values) ensures that samples with
        equal distances are each counted once and never push the number of
        voters above k.
        """
        labels = np.asarray(y_train).reshape(-1)
        # mergesort is stable: among tied distances the sample that comes
        # first in the training set is preferred, making results deterministic.
        nearest = np.argsort(distances, kind="mergesort")[:k]
        return labels[nearest]

    def vote(self, one_sample, X_train, y_train, k):
        """Predict the label of one sample by majority vote of its k nearest neighbours.

        When several labels share the highest count, the one encountered
        first (i.e. belonging to the nearest neighbour among them) wins.
        Returns 0 when there are no neighbours to vote (e.g. k == 0).
        """
        distances = self.euclidean_distance(one_sample, X_train)
        k_neighbor_labels = self.get_k_neighbor_labels(distances, y_train, k)
        if k_neighbor_labels.size == 0:
            # Keep the historical fallback value for an empty vote.
            return 0
        return Counter(k_neighbor_labels).most_common(1)[0][0]

    def predict(self, X_test, X_train, y_train):
        """Predict a label for every row of X_test.

        Returns a NumPy array with one predicted label per test sample,
        each obtained from ``vote`` with ``self.k`` neighbours.
        """
        return np.array([self.vote(sample, X_train, y_train, self.k)
                         for sample in X_test])


def main():
    """Run the KNN classifier on the Iris data and print its test accuracy.

    Prints the full feature matrix, splits the data with a shuffled
    67/33 train/test split, predicts the test labels with k=5 and prints
    the resulting accuracy.
    """
    features, targets = iris.data, iris.target
    print(features)

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.33, shuffle=True)

    classifier = KNN(k=5)
    predictions = classifier.predict(X_test, X_train, y_train)

    print("Accuracy:", accuracy(y_test, predictions))
    

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()



[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]
 [ 5.4  3.9  1.7  0.4]
 [ 4.6  3.4  1.4  0.3]
 [ 5.   3.4  1.5  0.2]
 [ 4.4  2.9  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.4  3.7  1.5  0.2]
 [ 4.8  3.4  1.6  0.2]
 [ 4.8  3.   1.4  0.1]
 [ 4.3  3.   1.1  0.1]
 [ 5.8  4.   1.2  0.2]
 [ 5.7  4.4  1.5  0.4]
 [ 5.4  3.9  1.3  0.4]
 [ 5.1  3.5  1.4  0.3]
 [ 5.7  3.8  1.7  0.3]
 [ 5.1  3.8  1.5  0.3]
 [ 5.4  3.4  1.7  0.2]
 [ 5.1  3.7  1.5  0.4]
 [ 4.6  3.6  1.   0.2]
 [ 5.1  3.3  1.7  0.5]
 [ 4.8  3.4  1.9  0.2]
 [ 5.   3.   1.6  0.2]
 [ 5.   3.4  1.6  0.4]
 [ 5.2  3.5  1.5  0.2]
 [ 5.2  3.4  1.4  0.2]
 [ 4.7  3.2  1.6  0.2]
 [ 4.8  3.1  1.6  0.2]
 [ 5.4  3.4  1.5  0.4]
 [ 5.2  4.1  1.5  0.1]
 [ 5.5  4.2  1.4  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 5.   3.2  1.2  0.2]
 [ 5.5  3.5  1.3  0.2]
 [ 4.9  3.1  1.5  0.1]
 [ 4.4  3.   1.3  0.2]
 [ 5.1  3.4  1.5  0.2]
 [ 5.   3.5  1.3  0.3]
 [ 4.5  2.3  1.3  0.3]
 [ 4.4  3.2  1.3  0.2]
 [ 5.   3.5