## I) Reading the data

In [1]:
# Author : Martin Thoma 
# URL : https://martin-thoma.com/classify-mnist-with-pybrain/

import gzip
from struct import unpack

from numpy import float32, frombuffer, uint8, zeros

def get_labeled_data(imagefile, labelfile):
    """Read gzip-compressed image and label files and return them as arrays.

    The image file is expected to start with four big-endian unsigned 32-bit
    integers (magic number, image count, rows, cols) followed by one unsigned
    byte per pixel. The label file is expected to start with two such integers
    (magic number, label count) followed by one unsigned byte per label.

    Parameters
    ----------
    imagefile : str
        Path to the gzip-compressed image file.
    labelfile : str
        Path to the gzip-compressed label file.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        ``x`` has shape (N, rows, cols) and dtype float32 and holds the raw
        pixel values; ``y`` has shape (N, 1) and dtype uint8 and holds the
        labels.

    Raises
    ------
    Exception
        If the image count and the label count in the two headers differ.
    ValueError
        If either file contains fewer data bytes than its header announces.
    """
    # The context managers guarantee both archives are closed, even when the
    # header check below raises.
    with gzip.open(imagefile, 'rb') as images, gzip.open(labelfile, 'rb') as labels:
        # Headers are big-endian unsigned ints, hence '>I'. The magic numbers
        # are read but not validated.
        _magic, number_of_images, rows, cols = unpack('>IIII', images.read(16))
        _magic, N = unpack('>II', labels.read(8))

        if number_of_images != N:
            raise Exception('number of labels did not match the number of images')

        x = zeros((N, rows, cols), dtype=float32)
        y = zeros((N, 1), dtype=uint8)

        pixels_per_image = rows * cols
        for i in range(N):
            if i % 1000 == 0:
                print("i: %i" % i)
            # One read per image instead of one read per pixel; the assignment
            # casts the unsigned bytes to float32.
            raw = images.read(pixels_per_image)
            x[i] = frombuffer(raw, dtype=uint8).reshape(rows, cols)

        # All labels are consecutive single bytes, so decode them in one go.
        y[:, 0] = frombuffer(labels.read(N), dtype=uint8)
    return (x, y)

# Load the training split (images + labels) from the gzip archives.
X_train, y_train = get_labeled_data(
    imagefile="train-images-idx3-ubyte.gz",
    labelfile="train-labels-idx1-ubyte.gz",
)
# Load the held-out test split the same way.
X_test, y_test = get_labeled_data(
    imagefile="t10k-images-idx3-ubyte.gz",
    labelfile="t10k-labels-idx1-ubyte.gz",
)



i: 0
i: 1000
i: 2000
i: 3000
i: 4000
i: 5000
i: 6000
i: 7000
i: 8000
i: 9000
i: 10000
i: 11000
i: 12000
i: 13000
i: 14000
i: 15000
i: 16000
i: 17000
i: 18000
i: 19000
i: 20000
i: 21000
i: 22000
i: 23000
i: 24000
i: 25000
i: 26000
i: 27000
i: 28000
i: 29000
i: 30000
i: 31000
i: 32000
i: 33000
i: 34000
i: 35000
i: 36000
i: 37000
i: 38000
i: 39000
i: 40000
i: 41000
i: 42000
i: 43000
i: 44000
i: 45000
i: 46000
i: 47000
i: 48000
i: 49000
i: 50000
i: 51000
i: 52000
i: 53000
i: 54000
i: 55000
i: 56000
i: 57000
i: 58000
i: 59000
i: 0
i: 1000
i: 2000
i: 3000
i: 4000
i: 5000
i: 6000
i: 7000
i: 8000
i: 9000


## II) Preprocessing

In [2]:
import cv2
import numpy as np

def view_image(image, label=""):
    """Print the label of an image, then draw the image in grayscale.

    NOTE(review): ``imshow``, ``cm`` and ``show`` are not imported in the
    visible part of this notebook; presumably they come from a pylab-style
    environment -- confirm before running on a fresh kernel.
    """
    caption = "Label: %s" % label
    print(caption)
    imshow(image, cmap=cm.gray)
    show()

def resize(images, output_size=(20, 20)):
    """Crop every image to its non-zero bounding box and rescale the crop.

    For each image the smallest axis-aligned rectangle containing all
    non-zero pixels is extracted (both bounds inclusive on both axes) and
    then passed to ``cv2.resize``. An image without any non-zero pixel is
    rescaled as a whole.

    Parameters
    ----------
    images : iterable of 2-D array-likes
        The images to process. They are not modified.
    output_size : tuple of two ints, optional
        Target size handed to ``cv2.resize`` as its ``dsize`` argument,
        i.e. ``(width, height)``. Defaults to ``(20, 20)``.

    Returns
    -------
    list of numpy arrays
        One rescaled image per input image, in input order.
    """
    images_resized = []
    dsize = tuple(output_size)
    for image in images:
        image = np.asarray(image)
        ink = image != 0

        # Indices of the rows / columns that contain at least one non-zero pixel.
        rows_with_ink = np.flatnonzero(ink.any(axis=1))
        cols_with_ink = np.flatnonzero(ink.any(axis=0))

        if rows_with_ink.size == 0:
            # Blank image: nothing to crop to, keep the full frame.
            cropped = image
        else:
            y_min, y_max = rows_with_ink[0], rows_with_ink[-1]
            x_min, x_max = cols_with_ink[0], cols_with_ink[-1]
            # Both upper bounds are inclusive; the "+ 1" on the column slice
            # keeps the right-most inked column and prevents an empty crop
            # for glyphs that are a single column wide.
            cropped = image[y_min:y_max + 1, x_min:x_max + 1]

        # Hand cv2 a contiguous copy rather than a strided view of the input.
        final_image = cv2.resize(np.ascontiguousarray(cropped), dsize)
        images_resized.append(final_image)
    return images_resized

def thresholding(X, threshold=127):
    """Binarize every image in ``X`` in place.

    Each pixel strictly greater than ``threshold`` becomes 1, every other
    pixel becomes 0.

    Parameters
    ----------
    X : iterable of 2-D numpy arrays
        The images to binarize (a 3-D array or a list of 2-D arrays). The
        arrays are overwritten.
    threshold : number, optional
        Cut-off value; defaults to 127.

    Returns
    -------
    None
        The result is written back into the arrays of ``X``.
    """
    for x in X:
        # One vectorised comparison per image; assigning through ``[...]``
        # writes into the existing array so the caller sees the change.
        x[...] = x > threshold

def flatten(X):
    """Turn every 2-D image in ``X`` into a flat list of its pixels.

    Pixels are emitted row by row, so an image of shape (rows, cols) yields a
    list of ``rows * cols`` values (784 for 28 x 28 images, 400 for 20 x 20).

    Parameters
    ----------
    X : sequence of 2-D arrays
        The images to flatten. They are not modified.

    Returns
    -------
    list of lists
        One flat list of pixel values per image, in input order.
    """
    # ravel() walks the array in row-major order, which is the order the
    # explicit (row, column) double loop would produce.
    return [list(np.asarray(image).ravel()) for image in X]

#print X_train[0]

# Resized versions of the images. This has to run before the thresholding
# step below, because thresholding() overwrites X_train / X_test in place.
print("Resizing...")
X_train_resized = resize(X_train)
X_test_resized = resize(X_test)

# Threshold the untouched and resized images.
# NOTE: thresholding() mutates its argument, so re-running this cell without
# reloading the data would resize already-binarized images.
print("Thresholding...")
thresholding(X_train)
thresholding(X_test)
thresholding(X_train_resized)
thresholding(X_test_resized)

# Flatten every image into one flat feature vector per sample
print("Flattenning...")
X_train_preprocessed = flatten(X_train)
X_test_preprocessed = flatten(X_test)
X_train_preprocessed_resized = flatten(X_train_resized)
X_test_preprocessed_resized = flatten(X_test_resized)
print("Preprocessing finished")
#print X_train_preprocessed[0]

Resizing...
Thresholding...


KeyboardInterrupt: 

## III) Random forest classifier

### 10 trees + 4 depth

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest (and therefore the reported score) is reproducible
# across kernel restarts.
random_forest_1 = RandomForestClassifier(n_estimators=10, max_depth=4, random_state=0)

def compute_accuracy(random_forest, X, y, normalize=False):
    """Count how many samples of ``X`` the classifier labels correctly.

    Parameters
    ----------
    random_forest : fitted classifier
        Any object with a ``predict(X)`` method that returns one label per
        sample.
    X : sequence of feature vectors
        The samples to classify; passed to ``predict`` as a single batch.
    y : array-like
        The true labels, one per sample. A column vector of shape (N, 1) is
        accepted and flattened.
    normalize : bool, optional
        If False (default) return the number of correct predictions. If True
        return the fraction of correct predictions instead.

    Returns
    -------
    int or float
        Number of correct predictions, or their fraction when ``normalize``
        is True. Returns 0 (or 0.0) when ``y`` is empty.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    y_true = np.asarray(y).ravel()
    if y_true.size == 0:
        return 0.0 if normalize else 0

    # One batched call instead of one predict() per sample.
    predictions = np.asarray(random_forest.predict(X)).ravel()
    if predictions.shape != y_true.shape:
        raise ValueError('got %d predictions for %d labels'
                         % (predictions.size, y_true.size))

    correct = int(np.count_nonzero(predictions == y_true))
    if normalize:
        return correct / float(y_true.size)
    return correct

print("10 trees + 4 depth")
# Untouched images
# y_train is an (N, 1) column; ravel() hands fit() the 1-D label vector.
random_forest_1.fit(X_train_preprocessed, y_train.ravel())
accuracy_1 = compute_accuracy(random_forest_1, X_test_preprocessed, y_test)
print("untouched images accuracy " + str(accuracy_1))

# Resized images (the same estimator object is refitted on the resized set)
random_forest_1.fit(X_train_preprocessed_resized, y_train.ravel())
accuracy_2 = compute_accuracy(random_forest_1, X_test_preprocessed_resized, y_test)
print("streched images accuracy " + str(accuracy_2))

### 10 trees + 16 depth

In [None]:
# Fixed seed so the forest (and therefore the reported score) is reproducible
# across kernel restarts.
random_forest_2 = RandomForestClassifier(n_estimators=10, max_depth=16, random_state=0)

# NOTE(review): this helper is declared in several cells of the notebook;
# consider keeping a single definition.
def compute_accuracy(random_forest, X, y, normalize=False):
    """Count how many samples of ``X`` the classifier labels correctly.

    Parameters
    ----------
    random_forest : fitted classifier
        Any object with a ``predict(X)`` method that returns one label per
        sample.
    X : sequence of feature vectors
        The samples to classify; passed to ``predict`` as a single batch.
    y : array-like
        The true labels, one per sample. A column vector of shape (N, 1) is
        accepted and flattened.
    normalize : bool, optional
        If False (default) return the number of correct predictions. If True
        return the fraction of correct predictions instead.

    Returns
    -------
    int or float
        Number of correct predictions, or their fraction when ``normalize``
        is True. Returns 0 (or 0.0) when ``y`` is empty.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    y_true = np.asarray(y).ravel()
    if y_true.size == 0:
        return 0.0 if normalize else 0

    # One batched call instead of one predict() per sample.
    predictions = np.asarray(random_forest.predict(X)).ravel()
    if predictions.shape != y_true.shape:
        raise ValueError('got %d predictions for %d labels'
                         % (predictions.size, y_true.size))

    correct = int(np.count_nonzero(predictions == y_true))
    if normalize:
        return correct / float(y_true.size)
    return correct

print("10 trees + 16 depth")
# Untouched images
# y_train is an (N, 1) column; ravel() hands fit() the 1-D label vector.
random_forest_2.fit(X_train_preprocessed, y_train.ravel())
accuracy_1 = compute_accuracy(random_forest_2, X_test_preprocessed, y_test)
print("untouched images accuracy " + str(accuracy_1))

# Resized images (the same estimator object is refitted on the resized set)
random_forest_2.fit(X_train_preprocessed_resized, y_train.ravel())
accuracy_2 = compute_accuracy(random_forest_2, X_test_preprocessed_resized, y_test)
print("streched images accuracy " + str(accuracy_2))

### 30 trees + 4 depth 

In [None]:
# Fixed seed so the forest (and therefore the reported score) is reproducible
# across kernel restarts.
random_forest_3 = RandomForestClassifier(n_estimators=30, max_depth=4, random_state=0)

# NOTE(review): this helper is declared in several cells of the notebook;
# consider keeping a single definition.
def compute_accuracy(random_forest, X, y, normalize=False):
    """Count how many samples of ``X`` the classifier labels correctly.

    Parameters
    ----------
    random_forest : fitted classifier
        Any object with a ``predict(X)`` method that returns one label per
        sample.
    X : sequence of feature vectors
        The samples to classify; passed to ``predict`` as a single batch.
    y : array-like
        The true labels, one per sample. A column vector of shape (N, 1) is
        accepted and flattened.
    normalize : bool, optional
        If False (default) return the number of correct predictions. If True
        return the fraction of correct predictions instead.

    Returns
    -------
    int or float
        Number of correct predictions, or their fraction when ``normalize``
        is True. Returns 0 (or 0.0) when ``y`` is empty.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    y_true = np.asarray(y).ravel()
    if y_true.size == 0:
        return 0.0 if normalize else 0

    # One batched call instead of one predict() per sample.
    predictions = np.asarray(random_forest.predict(X)).ravel()
    if predictions.shape != y_true.shape:
        raise ValueError('got %d predictions for %d labels'
                         % (predictions.size, y_true.size))

    correct = int(np.count_nonzero(predictions == y_true))
    if normalize:
        return correct / float(y_true.size)
    return correct

print("30 trees + 4 depth")
# Untouched images
# y_train is an (N, 1) column; ravel() hands fit() the 1-D label vector.
random_forest_3.fit(X_train_preprocessed, y_train.ravel())
accuracy_1 = compute_accuracy(random_forest_3, X_test_preprocessed, y_test)
print("untouched images accuracy " + str(accuracy_1))

# Resized images (the same estimator object is refitted on the resized set)
random_forest_3.fit(X_train_preprocessed_resized, y_train.ravel())
accuracy_2 = compute_accuracy(random_forest_3, X_test_preprocessed_resized, y_test)
print("streched images accuracy " + str(accuracy_2))

### 30 trees + 16 depth 

In [None]:
# Fixed seed so the forest (and therefore the reported score) is reproducible
# across kernel restarts.
random_forest_4 = RandomForestClassifier(n_estimators=30, max_depth=16, random_state=0)

# NOTE(review): this helper is declared in several cells of the notebook;
# consider keeping a single definition.
def compute_accuracy(random_forest, X, y, normalize=False):
    """Count how many samples of ``X`` the classifier labels correctly.

    Parameters
    ----------
    random_forest : fitted classifier
        Any object with a ``predict(X)`` method that returns one label per
        sample.
    X : sequence of feature vectors
        The samples to classify; passed to ``predict`` as a single batch.
    y : array-like
        The true labels, one per sample. A column vector of shape (N, 1) is
        accepted and flattened.
    normalize : bool, optional
        If False (default) return the number of correct predictions. If True
        return the fraction of correct predictions instead.

    Returns
    -------
    int or float
        Number of correct predictions, or their fraction when ``normalize``
        is True. Returns 0 (or 0.0) when ``y`` is empty.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of labels.
    """
    y_true = np.asarray(y).ravel()
    if y_true.size == 0:
        return 0.0 if normalize else 0

    # One batched call instead of one predict() per sample.
    predictions = np.asarray(random_forest.predict(X)).ravel()
    if predictions.shape != y_true.shape:
        raise ValueError('got %d predictions for %d labels'
                         % (predictions.size, y_true.size))

    correct = int(np.count_nonzero(predictions == y_true))
    if normalize:
        return correct / float(y_true.size)
    return correct

print("30 trees + 16 depth")
# Untouched images
# y_train is an (N, 1) column; ravel() hands fit() the 1-D label vector.
random_forest_4.fit(X_train_preprocessed, y_train.ravel())
accuracy_1 = compute_accuracy(random_forest_4, X_test_preprocessed, y_test)
print("untouched images accuracy " + str(accuracy_1))

# Resized images (the same estimator object is refitted on the resized set)
random_forest_4.fit(X_train_preprocessed_resized, y_train.ravel())
accuracy_2 = compute_accuracy(random_forest_4, X_test_preprocessed_resized, y_test)
print("streched images accuracy " + str(accuracy_2))