In [1]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
import tensorflow as tf
from IPython.display import Image
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
%matplotlib inline

In [2]:
# Base URL of the directory that hosts the dataset archives.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Percentage most recently written by download_progress_hook (None before the first report).
last_percent_reported = None


def download_progress_hook(count, blockSize, totalSize):
    """
    Report download progress on stdout; meant to be passed as ``reporthook``
    to ``urlretrieve``.

    Output is produced only when the integer percentage differs from the one
    reported last time: multiples of 5 are written as ``N%``, every other
    value is written as a single ``.``.

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block, in bytes.
        totalSize: Total size of the download, in bytes. When it is zero or
            negative (size unknown) nothing is reported.
    """
    global last_percent_reported

    if totalSize <= 0:
        # Unknown size: a percentage cannot be computed (and 0 would raise
        # ZeroDivisionError), so stay silent.
        return

    # count * blockSize overshoots totalSize on the final, partly filled
    # block, hence the cap at 100.
    percent = min(int(count * blockSize * 100 / totalSize), 100)

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()

        last_percent_reported = percent
        
        
def maybe_download(filename, force=False):
    """
    Download ``filename`` from ``url`` unless a local copy already exists.

    The transfer is written to ``<filename>.part`` and moved into place only
    once it has finished, so an interrupted download is not mistaken for a
    complete file on the next run. No size or checksum validation is done.

    Args:
        filename: Name of the file, relative to ``url``; also used as the
            local path.
        force: When True, download even if the local file exists.

    Returns:
        The local path of the file (equal to ``filename``).
    """
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        partial = filename + '.part'
        try:
            urlretrieve(url + filename, partial, reporthook=download_progress_hook)
        except Exception:
            # Do not leave a truncated temporary file behind.
            if os.path.exists(partial):
                os.remove(partial)
            raise
        if os.path.exists(filename):
            # os.rename refuses to overwrite an existing file on Windows.
            os.remove(filename)
        os.rename(partial, filename)
        print('\nDownload Complete!')
    else:
        print(filename, 'is already downloaded. Skipped.')
    return filename

In [3]:
# Fetch train.tar.gz unless a local copy already exists; keep the local file name.
train_filename = maybe_download('train.tar.gz')

train.tar.gz is already downloaded. Skipped.


In [4]:
# Fetch test.tar.gz unless a local copy already exists; keep the local file name.
test_filename = maybe_download('test.tar.gz')

test.tar.gz is already downloaded. Skipped.


In [5]:
# Fetch extra.tar.gz unless a local copy already exists; keep the local file name.
extra_filename = maybe_download('extra.tar.gz')

extra.tar.gz is already downloaded. Skipped.


In [6]:
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably so later random steps (e.g. shuffling) are repeatable - confirm.
np.random.seed(133)


def maybe_extract(file_, force=False):
    """
    Unpack a ``.tar.gz`` archive next to itself unless that was already done.

    The target directory name is ``file_`` with its last two extensions
    removed. Extraction is skipped when that directory exists and ``force``
    is False.

    Args:
        file_: Path of the archive.
        force: When True, extract even if the target directory exists.

    Returns:
        The archive path without its ``.tar.gz`` suffix.
    """
    target_dir = os.path.splitext(os.path.splitext(file_)[0])[0]  # remove .tar.gz

    if os.path.isdir(target_dir) and not force:
        # You may override by setting force=True.
        print('%s is already presented - Skipping extraction of %s.' % (target_dir, file_))
    else:
        print('Extracting %s file data. Please wait...' % file_)
        sys.stdout.flush()
        # Extract into the archive's own directory so the result lands where
        # target_dir points, even when the archive is not in the working directory.
        # NOTE(review): assumes the archive holds one top-level folder named
        # after the archive - confirm.
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted source.
        with tarfile.open(file_) as tar:
            tar.extractall(os.path.dirname(file_) or os.curdir)
        print('File %s is successfully extracted into %s directory.' % (file_, target_dir))

    return target_dir

In [7]:
# Each *_folders variable holds the directory name returned by maybe_extract
# (the archive name without its .tar.gz suffix).
train_folders = maybe_extract(train_filename)

train is already presented - Skipping extraction of train.tar.gz.


In [8]:
# Unpack the test archive (skipped when the directory exists) and keep its directory name.
test_folders = maybe_extract(test_filename)

test is already presented - Skipping extraction of test.tar.gz.


In [9]:
# Unpack the extra archive (skipped when the directory exists) and keep its directory name.
extra_folders = maybe_extract(extra_filename)

extra is already presented - Skipping extraction of extra.tar.gz.


In [10]:
def remove_anomaly_samples(data, max_class_length = 5):
    """
    Remove, in place, every sample whose label sequence is longer than
    ``max_class_length``.

    Args:
        data: List of dicts, each with a 'label' sequence. The list object
            itself is modified.
        max_class_length: Largest label length that is kept.

    Returns:
        The same list object that was passed in, after filtering.
    """
    print("\nDataset size before update:", len(data))

    kept = []
    for index, sample in enumerate(data):
        label_length = len(sample['label'])
        if label_length > max_class_length:
            print("\nAnomaly at index %d detected. Class size: %d" % (index, label_length))
        else:
            kept.append(sample)

    # Deleting while walking the list by index skips the element that follows
    # each deletion. Rebuild instead, and slice-assign so callers that ignore
    # the return value still see the filtered list.
    data[:] = kept

    print("\nDataset size after update:", len(data))
    return data

In [11]:
import h5py

# DigitStructsWrapper is a thin wrapper around the h5py data. It keeps references to:
#     file_:    the opened input h5 (MATLAB) file
#     names:    the h5 dataset holding the references to all file names
#     bboxes:   the h5 dataset holding the references to all bounding-box structs
class DigitStructsWrapper(object):
    """
    Read-only accessor for a ``digitStruct.mat`` file opened through h5py.

    Attributes:
        file_: The open ``h5py.File``.
        names: The ``digitStruct/name`` dataset (references to file names).
        bboxes: The ``digitStruct/bbox`` dataset (references to box structs).
        collectionSize: Number of entries in ``names``.

    The instance can be used as a context manager; leaving the ``with`` block
    closes the file.
    """

    def __init__(self, file_):
        self.file_ = h5py.File(file_, 'r')
        self.names = self.file_['digitStruct']['name']
        self.bboxes = self.file_['digitStruct']['bbox']
        self.collectionSize = len(self.names)
        print("\n%s file structure contain %d entries" % (file_, self.collectionSize))

    def close(self):
        """Close the underlying file; the datasets cannot be read afterwards."""
        self.file_.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def bboxHelper(self, keys_):
        """
        Return the values of one bbox attribute as a list.

        Handles the coding difference between exactly one bbox (the value is
        stored directly) and several bboxes (an array of references that must
        each be dereferenced).
        """
        # ds[()] reads the whole dataset; it replaces the Dataset.value
        # attribute, which no longer exists in h5py 3.
        if (len(keys_) > 1):
            refs = keys_[()]  # read once instead of once per element
            val = [self.file_[refs[j].item()][()][0][0] for j in range(len(keys_))]
        else:
            val = [keys_[()][0][0]]
        return val

    def getBbox(self, n):
        """Return a dict of value lists ('height', 'left', 'top', 'width', 'label') for the n(th) entry."""
        group = self.file_[self.bboxes[n].item()]
        bbox = {}
        for key in ('height', 'left', 'top', 'width', 'label'):
            bbox[key] = self.bboxHelper(group[key])
        return bbox

    def getName(self, n):
        """
        Return the file name of the n(th) entry.

        Each character is stored as a one-element row holding its character
        code, so the codes are converted back with ``chr`` and joined.
        """
        return ''.join([chr(c[0]) for c in self.file_[self.names[n][0]][()]])

    def getNumberStructure(self, n):
        """Return the bbox dict of the n(th) entry with its file name added under 'name'."""
        s = self.getBbox(n)
        s['name'] = self.getName(n)
        return s

    def getAllNumbersStructure(self):
        """
        Return a list with the structure (positions, labels, name) of every
        entry in the file.
        """
        return [self.getNumberStructure(i) for i in range(self.collectionSize)]

    # Return a restructured version of the dataset (one object per digit in 'boxes').
    #
    #   Return a list of dicts :
    #      'name' : filename of the sample
    #      'boxes' : list of dicts (one by digit) :
    #          'label' : 1 to 9 corresponding digits. 10 for digit '0' in image.
    #          'left', 'top' : position of bounding box
    #          'width', 'height' : dimension of bounding box
    #
    # Samples rejected by remove_anomaly_samples are left out.
    # Note: We may turn this to a generator, if memory issues arise.
    def getAllNumbersRestructured(self):
        numbersData = self.getAllNumbersStructure()
        if numbersData:  # nothing to show for an empty file
            print("\nObject structure before transforming: ", numbersData[0])
        remove_anomaly_samples(numbersData)

        result = []
        for numData in numbersData:
            boxes = [
                {
                    'height': numData['height'][i],
                    'label': numData['label'][i],
                    'left': numData['left'][i],
                    'top': numData['top'][i],
                    'width': numData['width'][i],
                }
                for i in range(len(numData['height']))
            ]
            result.append({'boxes': boxes, 'name': numData['name']})

        if result:
            print("\nObject structure after transforming: ", result[0])

        return result

In [12]:
# Hard-coded directory name; this overrides the value assigned from maybe_extract earlier.
train_folders = 'train'

# Read the per-image digit metadata for the training set from digitStruct.mat.
file_ = os.path.join(train_folders, 'digitStruct.mat')
dsf = DigitStructsWrapper(file_)
train_data = dsf.getAllNumbersRestructured()


train/digitStruct.mat file structure contain 33402 entries

Object structure before transforming:  {'name': '1.png', 'top': [77.0, 81.0], 'label': [1.0, 9.0], 'width': [81.0, 96.0], 'height': [219.0, 219.0], 'left': [246.0, 323.0]}

Dataset size before update: 33402

Anomaly at index 29929 detected. Class size: 6

Dataset after before update: 33401

Object structure after transforming:  {'boxes': [{'width': 81.0, 'top': 77.0, 'label': 1.0, 'left': 246.0, 'height': 219.0}, {'width': 96.0, 'top': 81.0, 'label': 9.0, 'left': 323.0, 'height': 219.0}], 'name': '1.png'}


In [13]:
# Hard-coded directory name; this overrides the value assigned from maybe_extract earlier.
test_folders = 'test'

# Read the per-image digit metadata for the test set (file_ and dsf are rebound here).
file_ = os.path.join(test_folders, 'digitStruct.mat')
dsf = DigitStructsWrapper(file_)
test_data = dsf.getAllNumbersRestructured()


test/digitStruct.mat file structure contain 13068 entries

Object structure before transforming:  {'name': '1.png', 'top': [7.0], 'label': [5.0], 'width': [19.0], 'height': [30.0], 'left': [43.0]}

Dataset size before update: 13068

Dataset after before update: 13068

Object structure after transforming:  {'boxes': [{'width': 19.0, 'top': 7.0, 'label': 5.0, 'left': 43.0, 'height': 30.0}], 'name': '1.png'}


In [14]:
from PIL import Image
# One (width, height) row per training image; every row is filled by the loop below.
train_imgSize = np.ndarray([len(train_data),2])

for i in np.arange(len(train_data)):
    filename = train_data[i]['name']
    filepath = os.path.join(train_folders, filename)
    # Use a context manager so each file handle is released as soon as the
    # size has been read, instead of whenever the image object is collected.
    with Image.open(filepath) as img:
        train_imgSize[i, :] = img.size

# Largest width and height found in the training set.
train_max_width = np.amax(train_imgSize[:,0])
train_max_height = np.amax(train_imgSize[:,1])
print(train_max_width, train_max_height)

# Smallest width and height found in the training set.
train_min_width = np.amin(train_imgSize[:,0])
train_min_height = np.amin(train_imgSize[:,1])
print(train_min_width, train_min_height)

876.0 501.0
25.0 12.0


In [15]:
# One (width, height) row per test image; every row is filled by the loop below.
test_imgSize = np.ndarray([len(test_data),2])

for i in np.arange(len(test_data)):
    filename = test_data[i]['name']
    filepath = os.path.join(test_folders, filename)
    # Use a context manager so each file handle is released as soon as the
    # size has been read, instead of whenever the image object is collected.
    with Image.open(filepath) as img:
        test_imgSize[i, :] = img.size

# Largest width and height found in the test set.
test_max_width = np.amax(test_imgSize[:,0])
test_max_height = np.amax(test_imgSize[:,1])
print(test_max_width, test_max_height)

# Smallest width and height found in the test set.
test_min_width = np.amin(test_imgSize[:,0])
test_min_height = np.amin(test_imgSize[:,1])
print(test_min_width, test_min_height)

1083.0 516.0
31.0 13.0


In [16]:
# Indices of the images whose width equals the largest / smallest width found,
# first for the training set, then for the test set.
print(np.where(train_imgSize[:,0]==train_max_width))
print(np.where(train_imgSize[:,0]==train_min_width))
print(np.where(test_imgSize[:,0]==test_max_width))
print(np.where(test_imgSize[:,0]==test_min_width))

(array([  410,  4163, 15855, 30483]),)
(array([9747]),)
(array([ 1722,  2949,  6233, 12862]),)
(array([  459,  5352,  7776, 11257, 12191]),)


In [21]:
# Side length, in pixels, of the square images produced by prepare_images.
img_size = 32

def prepare_images(samples, folder):
    """
    Crop each sample to the region containing all of its digits, then resize,
    grayscale and normalise it.

    The crop is the union of the digit boxes, enlarged by 10% of its width and
    height on every side and clipped to the image borders.

    Args:
        samples: Sequence of dicts with a 'name' (image file name) and 'boxes'
            (list of dicts with 'label', 'top', 'left', 'height', 'width').
            A sample may hold at most 5 boxes; it must hold at least one.
        folder: Directory that contains the image files.

    Returns:
        A tuple ``(prepared_images, actual_numbers)``:
        * prepared_images - float32 array of shape
          (len(samples), img_size, img_size, 1); each image has its mean
          subtracted and is divided by its standard deviation (or by 1.0 when
          the deviation is below 0.0001).
        * actual_numbers - int array of shape (len(samples), 6); column 0 is
          the number of digits, the following columns hold the digits (label
          10 is stored as 0), unused columns stay 0.
    """
    print("Started preparing images for convnet...")

    prepared_images = np.ndarray([len(samples),img_size,img_size,1], dtype='float32')
    actual_numbers = np.zeros([len(samples),6], dtype=int)

    for i in range(len(samples)):
        boxes = samples[i]['boxes']
        number_length = len(boxes)

        # at 0 index we store length of a label. 3 -> 1; 123-> 3, 12543 -> 5
        actual_numbers[i,0] = number_length

        for j in range(number_length):
            # here we use j+1 since first entry used by label length
            label = boxes[j]['label']
            actual_numbers[i,j+1] = 0 if label == 10 else label  # Replacing 10 with 0

        top = np.array([box['top'] for box in boxes], dtype='float32')
        left = np.array([box['left'] for box in boxes], dtype='float32')
        height = np.array([box['height'] for box in boxes], dtype='float32')
        width = np.array([box['width'] for box in boxes], dtype='float32')

        # Union of all digit boxes. The far edge is the maximum of
        # (position + size) over the boxes, which is not necessarily reached
        # by the box that starts last.
        img_min_top = np.amin(top)
        img_min_left = np.amin(left)
        img_height = np.amax(top + height) - img_min_top
        img_width = np.amax(left + width) - img_min_left

        filepath = os.path.join(folder, samples[i]['name'])
        # The context manager releases the file handle once the pixels are read.
        with Image.open(filepath) as source:
            # 10% margin on every side, clipped to the picture on all four edges.
            raw_left = np.floor(img_min_left - 0.1 * img_width)
            raw_top = np.floor(img_min_top - 0.1 * img_height)
            img_right = np.amin([np.ceil(raw_left + 1.2 * img_width), source.size[0]])
            img_bottom = np.amin([np.ceil(raw_top + 1.2 * img_height), source.size[1]])
            img_left = np.amax([raw_left, 0])
            img_top = np.amax([raw_top, 0])
            crop_box = (int(img_left), int(img_top), int(img_right), int(img_bottom))

            # convert('RGB') guarantees the three channels the grayscale
            # weights below expect. LANCZOS is the filter formerly also
            # exposed as ANTIALIAS (that alias is gone in Pillow 10).
            image = source.convert('RGB').crop(crop_box).resize([img_size, img_size], Image.LANCZOS) # Resize image to 32x32

        image = np.dot(np.array(image, dtype='float32'), [[0.2989],[0.5870],[0.1140]]) # Convert image to the grayscale
        if i == 0:
            # Show the per-image shape once as a sanity check.
            print(image.shape)
        mean = np.mean(image, dtype='float32')
        std = np.std(image, dtype='float32', ddof=1)
        if std < 0.0001:
            # (Almost) constant image: avoid dividing by ~0.
            std = 1.0
        prepared_images[i] = (image - mean) / std

    print("Completed. Images cropped, resized and grayscaled")

    return prepared_images, actual_numbers

In [22]:
# Build the image tensor and the label matrix for the training set and show their shapes.
train_dataset, train_labels = prepare_images(train_data, train_folders)
print(train_dataset.shape)
print(train_labels.shape)

Started preparing images for convnet...
(32, 32, 1)
Completed. Images cropped, resized and grayscaled
(33401, 32, 32, 1)
(33401, 6)


In [23]:
# Build the image tensor and the label matrix for the test set and show their shapes.
test_dataset, test_labels = prepare_images(test_data, test_folders)
print(test_dataset.shape, test_labels.shape)

Started preparing images for convnet...
(32, 32, 1)
Completed. Images cropped, resized and grayscaled
(13068, 32, 32, 1) (13068, 6)


In [24]:
from sklearn.utils import shuffle

# Shuffle each dataset together with its labels; the names are rebound to the shuffled copies.
# NOTE(review): no random_state is passed - presumably this relies on the NumPy seed set
# earlier for repeatability; confirm.
train_dataset, train_labels = shuffle(train_dataset, train_labels)
test_dataset, test_labels = shuffle(test_dataset, test_labels)

In [25]:
# File that receives the prepared train/test arrays.
pickle_file = 'SVHN_multi.pickle'

try:
    save = {
        'train_dataset': train_dataset,
        'train_labels': train_labels,
        'test_dataset': test_dataset,
        'test_labels': test_labels,
        }
    # The with-block closes the file even when pickling fails part-way.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Report the on-disk size in bytes (no compression is applied when writing).
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

Compressed pickle size: 192567884
