In [1]:
from __future__ import print_function

import os
import sys
import random
import tarfile
import scipy.io
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from scipy import ndimage
from six.moves import cPickle as pickle
from IPython.display import display, Image
from sklearn.linear_model import SGDClassifier
from six.moves.urllib.request import urlretrieve
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

In [2]:
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage written by download_progress_hook; None until the first report.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
    """
    A hook to report the progress of a download. This is mostly intended for users with
    slow internet connections.

    Writes to stdout once per whole-percent change: the number (e.g. ``25%``) when the
    percentage is a multiple of 5, a single ``.`` otherwise.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block in bytes.
        totalSize: total size of the file in bytes. ``urlretrieve`` may pass a
            non-positive value when the server does not announce the size; in that
            case no percentage can be computed and nothing is printed.
    """
    global last_percent_reported

    if totalSize <= 0:
        # Unknown size: avoid dividing by zero or printing a negative percentage.
        return

    # The final block usually overshoots the file size, so cap the value at 100.
    percent = min(100, int(count * blockSize * 100 / totalSize))

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()

        last_percent_reported = percent
        
def maybe_download(filename, force=False, expected_bytes=None):
    """
    Download a file if not present, and optionally make sure it's the right size.

    Args:
        filename: name of the file, used both as the local path and as the suffix
            appended to the module-level ``url``.
        force: when True, download even if ``filename`` already exists locally.
        expected_bytes: if given, the size in bytes the local file must have.
            ``None`` (the default) skips the check.

    Returns:
        The local path of the file.

    Raises:
        IOError: if ``expected_bytes`` is given and the file on disk has a
            different size (for example after an interrupted download).
    """
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename) 
        filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    if expected_bytes is not None:
        actual_bytes = os.stat(filename).st_size
        if actual_bytes != expected_bytes:
            raise IOError('Failed to verify %s: expected %d bytes, found %d'
                          % (filename, expected_bytes, actual_bytes))
    return filename

In [3]:
# Fetch the two .tar.gz archives, then the three .mat files, in the same order as before.
train_filename, test_filename = [
    maybe_download(archive_name)
    for archive_name in ('train.tar.gz', 'test.tar.gz')
]

train_matfile, test_matfile, extra_matfile = [
    maybe_download(mat_name)
    for mat_name in ('train_32x32.mat', 'test_32x32.mat', 'extra_32x32.mat')
]

In [None]:
def _load_svhn_mat(path):
    """
    Read the ``X`` (images) and ``y`` (labels) variables from one .mat file.

    Both variables are requested in a single ``loadmat`` call so the file is only
    parsed once.

    Args:
        path: path of the .mat file to read.

    Returns:
        Tuple ``(X, y)`` of the two arrays stored in the file.

    Raises:
        KeyError: if the file does not contain ``X`` or ``y``.
    """
    mat = scipy.io.loadmat(path, variable_names=['X', 'y'])
    return mat['X'], mat['y']

train_data, train_labels = _load_svhn_mat('train_32x32.mat')
test_data, test_labels = _load_svhn_mat('test_32x32.mat')
extra_data, extra_labels = _load_svhn_mat('extra_32x32.mat')

print(train_data.shape, train_labels.shape)
print(test_data.shape, test_labels.shape)
print(extra_data.shape, extra_labels.shape)

In [None]:
# The digit 0 is stored under label 10; rewrite it to 0, in place, in every label array.
for _label_array in (train_labels, test_labels, extra_labels):
    _label_array[_label_array == 10] = 0

In [None]:
# Build the validation dataset and labels based on the method in this paper:
# [https://arxiv.org/pdf/1204.3968.pdf]

# Fixed seed so the shuffles below produce the same ordering on every
# Restart & Run All. Change the value to draw a different ordering.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Number of classes
n_labels = 10

# Per-class validation sample counts taken from each source set.
N_VALID_PER_CLASS_TRAIN = 400
N_VALID_PER_CLASS_EXTRA = 200

valid_index = []
train_index = []
extra_valid_index = []
extra_train_index = []


# Here we collect the indices of each class.
# The validation set will consist of 2/3 (400 per class) entries from the training set
# and 1/3 (200 per class) entries from the extra set, which contains easy samples
# for each class.
for i in np.arange(n_labels):
    # Look up each class once per source set, then split head / tail.
    train_class_idx = np.where(train_labels[:, 0] == i)[0]
    extra_class_idx = np.where(extra_labels[:, 0] == i)[0]

    valid_index.extend(train_class_idx[:N_VALID_PER_CLASS_TRAIN].tolist())
    train_index.extend(train_class_idx[N_VALID_PER_CLASS_TRAIN:].tolist())

    extra_valid_index.extend(extra_class_idx[:N_VALID_PER_CLASS_EXTRA].tolist())
    extra_train_index.extend(extra_class_idx[N_VALID_PER_CLASS_EXTRA:].tolist())

# Since we don't know much about how the data was collected, we shuffle these indices.
random.shuffle(valid_index)
random.shuffle(train_index)
random.shuffle(extra_valid_index)
random.shuffle(extra_train_index)

# Now we combine the training and extra sets along the sample axis (last axis)
# and move that axis to the front.
valid_data = np.concatenate((extra_data[:,:,:,extra_valid_index], train_data[:,:,:,valid_index]), axis=3).transpose((3,0,1,2))
valid_labels = np.concatenate((extra_labels[extra_valid_index,:], train_labels[valid_index,:]), axis=0)[:,0]

train_data_t = np.concatenate((extra_data[:,:,:,extra_train_index], train_data[:,:,:,train_index]), axis=3).transpose((3,0,1,2))
train_labels_t = np.concatenate((extra_labels[extra_train_index,:], train_labels[train_index,:]), axis=0)[:,0]

# NOTE: these two lines rebind test_data / test_labels to their transformed versions,
# so this cell is not idempotent: re-running it without reloading the .mat files
# would transpose test_data a second time and fail on the now 1-D test_labels.
test_data = test_data.transpose((3,0,1,2))
test_labels = test_labels[:,0]

print(train_data_t.shape, train_labels_t.shape)
print(test_data.shape, test_labels.shape)
print(valid_data.shape, valid_labels.shape)

In [None]:
image_size = 32  # Pixel width and height.
pixel_depth = 255.0  # Number of levels per pixel.

def make_gray(image):
    '''
    Convert colour images to grayscale.

    The input is cast to float and its last axis (expected to have length 3) is
    reduced to a single channel by the weighted sum
    0.2989 * c0 + 0.5870 * c1 + 0.1140 * c2.

    Returns a float array with the same leading axes as `image` and a last axis
    of length 1.
    '''
    channel_weights = np.array([[0.2989], [0.5870], [0.1140]])
    return np.dot(image.astype(float), channel_weights)

# Convert each split to grayscale and drop the length-1 channel axis that
# make_gray leaves at the end. The *_c names are the ones the following cells use.
train_data_c = make_gray(train_data_t)[:,:,:,0]
test_data_c = make_gray(test_data)[:,:,:,0]
valid_data_c = make_gray(valid_data)[:,:,:,0]

# Keep the previous gray_* names bound to the same arrays (no copy is made) so
# that any code still referring to them continues to work.
gray_train_data = train_data_c
gray_test_data = test_data_c
gray_valid_data = valid_data_c

print(train_data_c.shape, train_labels_t.shape)
print(test_data_c.shape, test_labels.shape)
print(valid_data_c.shape, valid_labels.shape)

In [None]:
def GCN(image, min_divisor=1e-4):
    """
    Global Contrast Normalization.

    For every entry along the first axis, subtract that entry's mean and divide by
    its sample standard deviation (ddof=1), both taken over axes 1 and 2. A standard
    deviation below `min_divisor` is replaced by 1, so such entries are only
    mean-centred. Returns a new float array with the same shape as `image`.
    """
    per_image_mean = np.mean(image, axis=(1,2), dtype=float)
    per_image_std = np.std(image, axis=(1,2), dtype=float, ddof=1)
    # Guard against dividing by a (near-)zero spread.
    safe_std = np.where(per_image_std < min_divisor, 1., per_image_std)

    normalized = np.zeros(image.shape, dtype=float)
    for idx, (mu, sigma) in enumerate(zip(per_image_mean, safe_std)):
        normalized[idx,:,:] = (image[idx,:,:] - mu) / sigma

    return normalized

# Contrast-normalize the three grayscale splits (train, test, validation order).
train_data_GCN, test_data_GCN, valid_data_GCN = [
    GCN(gray_split)
    for gray_split in (train_data_c, test_data_c, valid_data_c)
]

# Report data / label shapes for each split.
for _data, _labels in ((train_data_GCN, train_labels_t),
                       (test_data_GCN, test_labels),
                       (valid_data_GCN, valid_labels)):
    print(_data.shape, _labels.shape)

In [None]:
# Show ten randomly drawn normalized training images in one row, each titled with its label.
plt.rcParams['figure.figsize'] = (15.0, 15.0)
f, ax = plt.subplots(nrows=1, ncols=10)

sample_ids = np.random.randint(0, train_labels_t.shape[0], size=10)
for panel, j in zip(ax, sample_ids):
    panel.axis('off')
    panel.set_title(train_labels_t[j], loc='center')
    panel.imshow(train_data_GCN[j,:,:])

In [None]:
# Persist the labels plus the validation and test images. The training images are
# not stored here; they are written in slices to SVHN1/2/3.pickle.
pickle_file = 'SVHN.pickle'

try:
    save = {
        #'train_dataset': train_data_GCN,
        'train_labels': train_labels_t,
        'valid_dataset': valid_data_GCN,
        'valid_labels': valid_labels,
        'test_dataset': test_data_GCN,
        'test_labels': test_labels,
    }
    # The context manager closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size of the written file in bytes.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

In [None]:
# First slice of the normalized training images: rows [0, 200000).
pickle_file = 'SVHN1.pickle'

try:
    save = {
        'train_dataset1': train_data_GCN[:200000],
    }
    # The context manager closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size of the written file in bytes.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

In [None]:
# Second slice of the normalized training images: rows [200000, 400000).
pickle_file = 'SVHN2.pickle'

try:
    save = {
        'train_dataset2': train_data_GCN[200000:400000],
    }
    # The context manager closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size of the written file in bytes.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

In [None]:
# Final slice of the normalized training images: rows from 400000 to the end.
pickle_file = 'SVHN3.pickle'

try:
    save = {
        'train_dataset3': train_data_GCN[400000:],
    }
    # The context manager closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Size of the written file in bytes.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

In [None]:
# Baseline: a linear classifier trained with SGD. With loss='hinge' this is a
# linear SVM, not a logistic regression.

# Flatten every image to one feature vector. Both matrices come from the
# contrast-normalized data so that training and evaluation use the same features.
train_features = train_data_GCN.reshape(train_data_GCN.shape[0], -1)
valid_features = valid_data_GCN.reshape(valid_data_GCN.shape[0], -1)

clf = SGDClassifier(loss='hinge', penalty='l2', alpha=0.001, shuffle=True, verbose=0,
                    n_jobs=4, random_state=None, learning_rate='optimal')
# Fit on the normalized features; the model was previously fitted on the
# un-normalized grayscale data while being evaluated on the normalized data.
clf.fit(train_features, train_labels_t)
train_prediction = clf.predict(train_features)
valid_prediction = clf.predict(valid_features)

print('Training score is', clf.score(train_features, train_labels_t))
print('Validation score is', clf.score(valid_features, valid_labels))

print('Classification report of training data:\n', classification_report(train_labels_t, train_prediction))
print('Confusion Matrix of training data:\n', confusion_matrix(train_labels_t, train_prediction))

print('Classification report of validation data:\n', classification_report(valid_labels, valid_prediction))
print('Confusion Matrix of validation data:\n', confusion_matrix(valid_labels, valid_prediction))