In [1]:
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.datasets import fetch_mldata
# Load the 'MNIST original' handwritten-digit dataset into `mnist`.
# NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and removed in
# 0.22; on newer releases fetch_openml('mnist_784') is the replacement —
# confirm against the installed scikit-learn version.
mnist = fetch_mldata('MNIST original')

SEED = 0             # fixed seed so the split (and PCA solver) are reproducible
N_TRAIN = 60000      # number of shuffled samples used for training
N_TEST = 10000       # number of shuffled samples used for testing
PIXEL_MAX = 255.0    # constant divisor applied to the PCA features below

# Seeded random permutation of every sample index. This replaces the unseeded
# random.shuffle() call, which gave a different split on every kernel run.
rng = np.random.RandomState(SEED)
index = rng.permutation(len(mnist.data))
train_index = index[:N_TRAIN]                    # first N_TRAIN shuffled indices -> training set
test_index = index[N_TRAIN:N_TRAIN + N_TEST]     # next N_TEST shuffled indices -> test set
X_train, y_train = mnist.data[train_index], mnist.target[train_index]
X_test, y_test = mnist.data[test_index], mnist.target[test_index]

# Reduce to 30 principal components. The projection is fitted on the training
# data only and then applied unchanged to the test data, so no test
# information leaks into the fit. random_state pins the solver in case the
# 'auto' setting selects the randomized SVD.
pca = PCA(n_components=30, random_state=SEED)

# Divide the projected features by a constant. This is a uniform rescale of
# the PCA output; it is NOT the same as sklearn's MinMaxScaler, which rescales
# each feature by its own observed min/max.
newX_train = pca.fit_transform(X_train) / PIXEL_MAX
newX_test = pca.transform(X_test) / PIXEL_MAX

# Support Vector Classification with default hyper-parameters; report the
# fraction of correctly classified samples on both splits.
clf = SVC()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.9913666666666666
測試資料辨識率: 0.9825


In [3]:
%%timeit -n 1 -r 1
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(60))
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.9906833333333334
測試資料辨識率: 0.9733
48.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [4]:
%%timeit -n 1 -r 1
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='sag', max_iter=1000)
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.8808833333333334
測試資料辨識率: 0.876
13.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [5]:
%%timeit -n 1 -r 1
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))



訓練資料辨識率: 0.8605833333333334
測試資料辨識率: 0.8512
785 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [7]:
%%timeit -n 1 -r 1
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.8581
測試資料辨識率: 0.8568
756 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [8]:
%%timeit -n 1 -r 1
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.9846666666666667
測試資料辨識率: 0.9774
3min 7s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [9]:
%%timeit -n 1 -r 1
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 1.0
測試資料辨識率: 0.8482
6.78 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [10]:
%%timeit -n 1 -r 1
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.9988333333333334
測試資料辨識率: 0.9283
6.39 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [11]:
%%timeit -n 1 -r 1
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.7102666666666667
測試資料辨識率: 0.7068
22.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [12]:
%%timeit -n 1 -r 1
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier()
clf.fit(newX_train, y_train)
print("訓練資料辨識率:",np.mean(clf.predict(newX_train) == y_train))
print("測試資料辨識率:",np.mean(clf.predict(newX_test) == y_test))

訓練資料辨識率: 0.99725
測試資料辨識率: 0.9128
37 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
