# Extracting HARR features from MNIST image dataset 

### Implement and run HAAR feature Extraction for each image on the Digit Dataset. Then repeat the classification task with the extracted features. 

In [166]:
import numpy as np
from sklearn.model_selection import train_test_split
from random import *
from sklearn.linear_model import LogisticRegression

In [167]:
# Fetch dataset
from sklearn.datasets import fetch_mldata
mnist_data = fetch_mldata('MNIST original')

In [168]:
def preprocess_harr(data, r, c):
    rect = np.zeros((r+1,c+1), dtype=int)
    for i in range(1, r+1):
        for j in range(1, c+1):
            rect[i, j] = rect[i, j-1] + rect[i-1, j] - rect[i-1, j-1]
            if data[i-1, j-1] != 0:
                rect[i, j] += 1
    return rect.flatten()

In [169]:
# Write a function that keeps and stores preprocessed values of black pixels for every image in dataset
def run_black_matrix(data):
    data_row, data_col = data.shape
    print (data.shape)
    black_count_matrix = np.zeros(shape = (data_row, 841))
    for c, row in enumerate(data):
        black_count_matrix[c,:] = preprocess_harr(row.reshape(28, 28), 28, 28)
        
    return black_count_matrix

In [170]:
mnist_train, mnist_test, mnist_train_l, mnist_test_l = train_test_split(mnist_data.data, mnist_data.target, test_size = 0.2, random_state = 42)

In [171]:
train_black_matrix = run_black_matrix(mnist_train)
test_black_matrix = run_black_matrix(mnist_test)

(56000, 784)
(14000, 784)


In [172]:
def gen_rects():
    all_rects = []
    while len(all_rects) < 100:
        x = randint(1, 28)
        y = randint(1, 28)
        l = randint(5, 28)
        b = randint(5, 28)

        if x+b > 28 or y+l > 28 or l*b < 130 or l*b > 170:
            continue
        else:
            all_rects.append((x,y,l,b))
    return all_rects

In [174]:
random_rects = gen_rects()

In [175]:
def get_haar(black_matrix, rectangles):
    haar_features = np.zeros((black_matrix.shape[0], 200))
    for c, digit in enumerate(black_matrix):
        digit_mat_blk = digit.reshape(29, 29)
        feat_num = 0
        for rectangle in rectangles:
            x, y, l, b = rectangle
            top_blk = digit_mat_blk[x+round(b/2), y+l] - digit_mat_blk[x+round(b/2), y] - digit_mat_blk[x,y+l] + digit_mat_blk[x, y]
            bot_blk = digit_mat_blk[x+b, y+l] - digit_mat_blk[x+b, y] - digit_mat_blk[x+round(b/2),y+l] + digit_mat_blk[x+round(b/2), y]

            haar_features[c, feat_num] = top_blk - bot_blk
            feat_num += 1

            left = digit_mat_blk[x+b, y+round(l/2)] - digit_mat_blk[x + b, y] - digit_mat_blk[x, y+round(l/2)] + digit_mat_blk[x,y]
            right = digit_mat_blk[x +b, y+l] - digit_mat_blk[x+b,y+round(l/2)] - digit_mat_blk[x,y+l] + digit_mat_blk[x, y+round(l/2)]

            haar_features[c, feat_num] = left - right
            feat_num += 1
    return haar_features

In [176]:
train_haar_features = get_haar(train_black_matrix, random_rects)
test_haar_features = get_haar(test_black_matrix, random_rects)

In [177]:
mnist_logreg = LogisticRegression(multi_class='multinomial', penalty = 'l2', solver = 'saga', tol = 0.1)   
mnist_logreg.fit(train_haar_features, mnist_train_l)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='saga',
          tol=0.1, verbose=0, warm_start=False)

In [178]:
mnist_logreg.score(train_haar_features, mnist_train_l)

0.9056607142857143

In [179]:
mnist_logreg.score(test_haar_features, mnist_test_l)

0.9078571428571428