# Classifying handwritten digits (MNIST) with an SVM

Procedure

① Download MNIST data

② Convert the MNIST binary files to CSV

③ Write the CSV data back out as image files to check that the CSV was written correctly

④ Learning and evaluation with SVM

## Execution
① Download MNIST data
The MNIST data is already split into training and test sets, and each part can be downloaded as a gz file.

In [1]:
import os
from urllib.request import urlopen

def download(fname):
    """Fetch one MNIST archive from the server and save it as ``mnist/<fname>``.

    Args:
        fname: File name of the archive on the server, e.g.
            ``"train-images-idx3-ubyte.gz"``. The same name is used for the
            local copy.

    The ``mnist`` directory is created when it does not exist yet, so the
    function also works when called on its own.
    """
    print("%s downloading..." % fname)
    # exist_ok avoids both the missing-directory failure and the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs("mnist", exist_ok=True)
    # Read the whole archive and release the connection before writing to disk.
    with urlopen("http://yann.lecun.com/exdb/mnist/" + fname) as res:
        d = res.read()
    # Save the data in the mnist folder under the same file name.
    with open(os.path.join("mnist", fname), "wb") as f:
        f.write(d)

if __name__ == "__main__":

    # Make sure the download target directory exists.
    if not os.path.exists("mnist"):
        os.mkdir("mnist")

    # Fetched in this order: training images, training labels,
    # test images, test labels.
    archives = (
        "train-images-idx3-ubyte.gz",
        "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz",
        "t10k-labels-idx1-ubyte.gz",
    )
    for archive in archives:
        download(archive)

train-images-idx3-ubyte.gz downloading...
train-labels-idx1-ubyte.gz downloading...
t10k-images-idx3-ubyte.gz downloading...
t10k-labels-idx1-ubyte.gz downloading...


### ② Convert the MNIST binary files to CSV

The downloaded MNIST data are gzip-compressed binary files, so we convert them to CSV to make them usable for training.



In [2]:
import os
import gzip
import struct

def csv_image(fname, type_):
    """Convert a gzipped MNIST image file into ``csv/<type_>_image.csv``.

    Every output line holds one image: its pixel byte values in file order,
    written as decimal numbers separated by commas.

    Args:
        fname: Name of the gzip archive inside the ``mnist`` directory.
        type_: Prefix of the output file name, e.g. ``"training"``.
    """
    source_path = os.path.join("mnist", fname)
    target_path = os.path.join("csv", type_ + "_image.csv")

    with gzip.open(source_path, "rb") as archive:
        # The header is four big-endian unsigned 32-bit integers. The first
        # one is not used here; the others are the image count, the number
        # of rows and the number of columns.
        header = struct.unpack(">IIII", archive.read(16))
        cnt, rows, cols = header[1:]
        pixels_per_image = rows * cols
        # Iterating over a bytes object yields ints, so str() turns each
        # pixel into its decimal text.
        lines = [
            ",".join(map(str, archive.read(pixels_per_image)))
            for _ in range(cnt)
        ]

    # One image per line, no trailing newline.
    with open(target_path, "w") as out:
        out.write("\n".join(lines))


def csv_label(fname, type_):
    """Convert a gzipped MNIST label file into ``csv/<type_>_label.csv``.

    Every output line holds one label written as a decimal number.

    Args:
        fname: Name of the gzip archive inside the ``mnist`` directory.
        type_: Prefix of the output file name, e.g. ``"training"``.

    Raises:
        struct.error: If the file holds fewer labels than its header
            announces. No output file is written in that case.
    """
    # Read label data from the gzip file.
    with gzip.open(os.path.join("mnist", fname), "rb") as f:
        # Header: two big-endian unsigned 32-bit integers. The first is not
        # used here; the second is the number of labels.
        _, cnt = struct.unpack(">II", f.read(8))
        # Each label is one unsigned byte, so a single bulk read replaces
        # `cnt` separate one-byte reads.
        raw = f.read(cnt)

    if len(raw) != cnt:
        raise struct.error(
            "expected %d labels in %s, found %d" % (cnt, fname, len(raw))
        )

    # Output as CSV: one label per line, no trailing newline.
    with open(os.path.join("csv", type_ + "_label.csv"), "w") as f:
        f.write("\n".join(map(str, raw)))


if __name__ == "__main__":

    # Output directory for the generated CSV files.
    if not os.path.exists("csv"):
        os.mkdir("csv")

    # (image archive, label archive, output prefix) for the training data
    # and then the test data; images are converted before labels.
    conversions = (
        ("train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz", "training"),
        ("t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz", "test"),
    )
    for image_archive, label_archive, prefix in conversions:
        csv_image(image_archive, prefix)
        csv_label(label_archive, prefix)

#### ③ Export the CSV data as image files to check that the CSV was written correctly

In [3]:
import os

CNT = 100  # Number of images to export from the start of the CSV

if __name__ == "__main__":

    # Output directory for the exported PGM files.
    if not os.path.exists("image"):
        os.mkdir("image")

    with open(os.path.join("csv", "training_image.csv")) as f:
        rows = f.read().split("\n")

    # Plain-text PGM header: "P2" format symbol, width 28, height 28,
    # maximum gray value 255.
    header = "P2 28 28 255\n"
    for index, row in enumerate(rows[:CNT]):
        pgm_path = os.path.join("image", "%d.pgm" % index)
        with open(pgm_path, "w") as fw:
            # PGM separates pixel values with whitespace, not commas.
            fw.write(header + row.replace(",", " "))

##### The resulting images can be found in the project's folder

###### ④ Learning and evaluation with SVM
So we are prepared.

In [4]:
import os
from sklearn import svm, metrics
from sklearn.externals import joblib

# Number of training samples passed to load_data for fitting the classifier
SIZE_TRAINING = 5000

# Number of test samples passed to load_data for evaluating the classifier
SIZE_TEST = 500

def load_data(type_, size):
    """Load the first ``size`` samples of one data split from the CSV files.

    Reads ``csv/<type_>_image.csv`` and ``csv/<type_>_label.csv``.

    Args:
        type_: File-name prefix of the split, e.g. ``"training"`` or
            ``"test"``.
        size: Maximum number of samples to load.

    Returns:
        A tuple ``(images, labels)``. ``images`` is a list with one list of
        floats per image (each pixel value divided by 256); ``labels`` is a
        list of ints.
    """

    def read_head(path):
        # Stream the file and stop after `size` rows instead of loading the
        # whole CSV just to slice it. Blank lines (for example one caused by
        # a trailing newline) are skipped so int() never receives "".
        rows = []
        with open(path) as f:
            for line in f:
                if len(rows) >= size:
                    break
                line = line.strip()
                if line:
                    rows.append(line)
        return rows

    image_rows = read_head(os.path.join("csv", "%s_image.csv" % type_))
    label_rows = read_head(os.path.join("csv", "%s_label.csv" % type_))

    # Divide each pixel value (0 to 255) by 256 to scale it into [0, 1).
    images = [[int(px) / 256 for px in row.split(",")] for row in image_rows]
    labels = [int(label) for label in label_rows]

    return images, labels


if __name__ == "__main__":

    # Training phase: load the training subset and fit a default SVC on it.
    images, labels = load_data("training", SIZE_TRAINING)
    print("Learning start")
    clf = svm.SVC()
    clf.fit(images, labels)

    # Evaluation phase: the same names are rebound to the test subset.
    images, labels = load_data("test", SIZE_TEST)
    print("Start prediction")
    predicted = clf.predict(images)

    # Compute both metrics first, then report them.
    print("result")
    accuracy = metrics.accuracy_score(labels, predicted)
    report = metrics.classification_report(labels, predicted)
    print("Correct answer rate = ", accuracy)
    print(report)

    # Persist the fitted classifier.
    # NOTE(review): joblib is imported from sklearn.externals in this cell;
    # newer scikit-learn releases no longer ship that module, so confirm the
    # installed version (or import joblib directly).
    result_dir = "result"
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    joblib.dump(clf, os.path.join(result_dir, "svm.pkl"))



Learning start




Start prediction
result
Correct answer rate =  0.902
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        42
           1       0.94      1.00      0.97        67
           2       0.94      0.87      0.91        55
           3       0.90      0.82      0.86        45
           4       0.88      0.96      0.92        55
           5       0.82      0.92      0.87        50
           6       0.93      0.86      0.89        43
           7       0.86      0.86      0.86        49
           8       0.90      0.88      0.89        40
           9       0.94      0.87      0.90        54

    accuracy                           0.90       500
   macro avg       0.90      0.90      0.90       500
weighted avg       0.90      0.90      0.90       500



###### Discussion
So, our accuracy (correct answer rate) is 0.902.

But is there a way to improve result?

Of course there is.

We used only 5000 training samples and 500 test samples.

Let's increase the data sets.

In [5]:
import os
from sklearn import svm, metrics
from sklearn.externals import joblib

# Number of training samples passed to load_data for fitting the classifier
SIZE_TRAINING = 50000

# Number of test samples passed to load_data for evaluating the classifier
SIZE_TEST = 5000

def load_data(type_, size):
    """Load the first ``size`` samples of one data split from the CSV files.

    Reads ``csv/<type_>_image.csv`` and ``csv/<type_>_label.csv``.

    Args:
        type_: File-name prefix of the split, e.g. ``"training"`` or
            ``"test"``.
        size: Maximum number of samples to load.

    Returns:
        A tuple ``(images, labels)``. ``images`` is a list with one list of
        floats per image (each pixel value divided by 256); ``labels`` is a
        list of ints.
    """

    def read_head(path):
        # Stream the file and stop after `size` rows instead of loading the
        # whole CSV just to slice it. Blank lines (for example one caused by
        # a trailing newline) are skipped so int() never receives "".
        rows = []
        with open(path) as f:
            for line in f:
                if len(rows) >= size:
                    break
                line = line.strip()
                if line:
                    rows.append(line)
        return rows

    image_rows = read_head(os.path.join("csv", "%s_image.csv" % type_))
    label_rows = read_head(os.path.join("csv", "%s_label.csv" % type_))

    # Divide each pixel value (0 to 255) by 256 to scale it into [0, 1).
    images = [[int(px) / 256 for px in row.split(",")] for row in image_rows]
    labels = [int(label) for label in label_rows]

    return images, labels


if __name__ == "__main__":

    # Training phase: load the training subset and fit a default SVC on it.
    images, labels = load_data("training", SIZE_TRAINING)
    print("Learning start")
    clf = svm.SVC()
    clf.fit(images, labels)

    # Evaluation phase: the same names are rebound to the test subset.
    images, labels = load_data("test", SIZE_TEST)
    print("Start prediction")
    predicted = clf.predict(images)

    # Compute both metrics first, then report them.
    print("result")
    accuracy = metrics.accuracy_score(labels, predicted)
    report = metrics.classification_report(labels, predicted)
    print("Correct answer rate = ", accuracy)
    print(report)

    # Persist the fitted classifier.
    # NOTE(review): joblib is imported from sklearn.externals in this cell;
    # newer scikit-learn releases no longer ship that module, so confirm the
    # installed version (or import joblib directly).
    result_dir = "result"
    if not os.path.exists(result_dir):
        os.mkdir(result_dir)
    joblib.dump(clf, os.path.join(result_dir, "svm.pkl"))

Learning start




Start prediction
result
Correct answer rate =  0.9232
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       460
           1       0.95      0.98      0.97       571
           2       0.93      0.92      0.92       530
           3       0.89      0.92      0.91       500
           4       0.91      0.93      0.92       500
           5       0.90      0.90      0.90       456
           6       0.94      0.94      0.94       462
           7       0.93      0.88      0.90       512
           8       0.93      0.88      0.90       489
           9       0.91      0.89      0.90       520

    accuracy                           0.92      5000
   macro avg       0.92      0.92      0.92      5000
weighted avg       0.92      0.92      0.92      5000

