In [1]:
import time
import struct
import numpy as np
import pandas as pd
from numpy import *
from pyecharts.charts import Bar
from pyecharts import options as opts
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN

In [2]:
def read_image(file_name):
    #先用二进制方式把文件都读进来
    file_handle=open(file_name,"rb")  #以二进制打开文档
    file_content=file_handle.read()   #读取到缓冲区中
    offset=0
    head = struct.unpack_from('>IIII', file_content, offset)
    offset += struct.calcsize('>IIII')
    imgNum = head[1]  #图片数
    rows = head[2]   #宽度
    cols = head[3]  #高度
    
    #empty，是它所常见的数组内的所有元素均为空
    #没有实际意义，创建数组最快的方法
    images=np.empty((imgNum , 784))
    image_size=rows*cols
    fmt='>' + str(image_size) + 'B'

    for i in range(imgNum):#图片数
        images[i] = np.array(struct.unpack_from(fmt, file_content, offset))
        offset += struct.calcsize(fmt)
    # print(images)
    return images

#读取标签
def read_label(file_name):
    file_handle = open(file_name, "rb")  
    file_content = file_handle.read()  
    head = struct.unpack_from('>II', file_content, 0) 
    offset = struct.calcsize('>II')
    # print(head)
    labelNum = head[1]  # label数
    # print(labelNum)
    bitsString = '>' + str(labelNum) + 'B'  
    label = struct.unpack_from(bitsString, file_content, offset) 
    # print(np.array(label))
    return np.array(label)

def loadDataSet():
    #mnist
    train_x_filename="./data/t10k-images.idx3-ubyte" # 路径
    train_y_filename="./data/t10k-labels.idx1-ubyte"
    test_x_filename="./data/train-images.idx3-ubyte"
    test_y_filename="./data/train-labels.idx1-ubyte"

    #fashion mnist
    # train_x_filename="fashion-train-images-idx3-ubyte"
    # train_y_filename="fashion-train-labels-idx1-ubyte"
    # test_x_filename="fashion-t10k-images-idx3-ubyte"
    # test_y_filename="fashion-t10k-labels-idx1-ubyte"

    train_x=read_image(train_x_filename)#60000*784 的矩阵
    train_y=read_label(train_y_filename)#60000*1的矩阵
    test_x=read_image(test_x_filename)#10000*784
    test_y=read_label(test_y_filename)#10000*1

    train_x=normalize(train_x)
    test_x=normalize(test_x)

    # 减少数据集大小
    # train_x=train_x[0:1000,:]
    # train_y=train_y[0:1000]
    # test_x=test_x[0:500,:]
    # test_y=test_y[0:500]

    return train_x, test_x, train_y, test_y

def  normalize(data):#图片像素二值化，变成0-1分布
    m=data.shape[0]
    n=np.array(data).shape[1]
    for i in range(m):
        for j in range(n):
            if data[i,j]!=0:
                data[i,j]=1
            else:
                data[i,j]=0
    return data

In [3]:
#K近邻算法
if __name__=='__main__':
    print("开始处理读取数据...")
    time1 = time.time()
    
    #划分训练集和测试集
    x_train, x_test, y_train, y_test = loadDataSet()

    #图片像素二值化，变成0-1分布
    x_train = normalize(x_train)
    x_test = normalize(x_test)

    time2 = time.time()
    print("处理读取数据花费的时间：",time2-time1,"秒")
    print('----------------------------------------------')
    print("开始K近邻法的运算...")

    knn = KNN(n_neighbors=3)
    knn.fit(x_train,y_train)
    print("运算完成")
    print('----------------------------------------------')
    print("开始预测数据...")
    time3 = time.time()
    answer = knn.predict(x_test)
    time4 = time.time()
    print("预测数据花费的时间：",time4-time3,"秒")
    print('----------------------------------------------')
    print("开始计算精度...")
    scorek = accuracy_score(answer,y_test).round(4)
    time5 = time.time()
    timek = (time5 - time2)
    print('K近邻算法预测分类精度：{}'.format(scorek))
    print("计算精度花费的时间：",time5-time4,"秒")

开始处理读取数据...
处理读取数据花费的时间： 42.12405753135681 秒
----------------------------------------------
开始K近邻法的运算...
运算完成
----------------------------------------------
开始预测数据...
预测数据花费的时间： 16.198400735855103 秒
----------------------------------------------
开始计算精度...
K近邻算法预测分类精度：0.9425
计算精度花费的时间： 0.024614572525024414 秒


In [4]:
# 朴素贝叶斯模型
time1 = time.time()
nb_clf = GaussianNB()
nb_clf.fit(x_train,y_train)
y_pred = nb_clf.predict(x_test)
scoren = accuracy_score(y_pred,y_test).round(4)
time2 = time.time()
timen = (time2 - time1)

# 逻辑回归模型分类
time1 = time.time()
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
scorel = accuracy_score(y_pred,y_test).round(4)
time2 = time.time()
timel = (time2 - time1)

# 决策树模型分类
time1 = time.time()
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
scorej = accuracy_score(y_pred,y_test).round(4)
time2 = time.time()
timej = (time2 - time1)

# 随机森林模型分类
time1 = time.time()
model = RandomForestClassifier()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
scores = accuracy_score(y_pred,y_test).round(4)
time2 = time.time()
times = (time2 - time1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
print("随机森林模型预测结果与实际值对比：")
a = pd.DataFrame()
a['预测值'] = list(y_pred)
a['实际值'] = list(y_test)
a

随机森林模型预测结果与实际值对比：


Unnamed: 0,预测值,实际值
0,5,5
1,0,0
2,4,4
3,1,1
4,9,9
...,...,...
59995,8,8
59996,3,3
59997,5,5
59998,6,6


In [6]:
print('K近邻算法预测分类精度：{}'.format(scorek))
print('朴素贝叶斯模型预测分类精度：{}'.format(scoren))
print('逻辑回归模型预测分类精度：{}'.format(scorel))
print('决策树模型预测分类精度：{}'.format(scorej))
print('随机森林模型预测分类精度：{}'.format(scores))

K近邻算法预测分类精度：0.9425
朴素贝叶斯模型预测分类精度：0.5322
逻辑回归模型预测分类精度：0.8825
决策树模型预测分类精度：0.8134
随机森林模型预测分类精度：0.9476


In [7]:
label = ['K近邻','朴素贝叶斯','逻辑回归','决策树','随机森林']
score_kh = [scorek,scoren,scorel,scorej,scores]
time_kh = [timek,timen,timel,timej,times]

b=(
    Bar()
    .add_xaxis(label)
    .add_yaxis('精度',score_kh)
    .add_yaxis('分类花费时间',time_kh)
    .reversal_axis()
    .set_series_opts(
            label_opts=opts.LabelOpts(position="right"))
    .set_global_opts(
            title_opts=opts.TitleOpts(title="MNIST数据集分类")
)  
)
b.render_notebook()
