In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import h5py
import os
from six.moves import cPickle as pickle

## Load Mat
First, we have to load `digitStruct.mat` and prepare it for the analysis.

In [3]:
# Open the annotation file explicitly read-only: nothing in this notebook writes
# to it, and older h5py releases default to a read/write mode when none is given.
train_digitStruct = h5py.File('train/digitStruct.mat', 'r')
print(train_digitStruct['digitStruct']['bbox'].shape)

(33402, 1)


The file has 33402 entries. In the next step, I will create some helper functions to read the attributes of every image, such as name, label, top, left, width and height.

In [4]:
#Helper Methods to read attributes of digitStruct.mat
def getSingleAttribute(struct, index, attr):
    """Return one bounding-box attribute of an image as a 1-D array.

    Args:
        struct: the opened digitStruct file (indexable by name and by reference).
        index: position of the image in ``struct['digitStruct']['bbox']``.
        attr: name of the attribute to read, e.g. 'label', 'top', 'left',
            'width' or 'height'.

    Returns:
        A 1-D numpy array with one value per entry stored for the image.
    """
    bbox_ref = struct['digitStruct']['bbox'][index][0]
    # ``dataset[()]`` reads the whole dataset; the older ``.value`` property was
    # removed in h5py 3.0.
    attribute = struct[bbox_ref][attr][()].squeeze()
    if attribute.dtype == 'float64':
        # The values are stored inline.
        return attribute.reshape(-1)
    # Otherwise every element is a reference to a dataset holding one value.
    # Flatten on both sides so the result is always 1-D and ``len()`` works for
    # callers even when only a single reference is present.
    references = attribute.reshape(-1)
    return np.array([struct[ref][()] for ref in references]).reshape(-1)

def getName(struct, index):
    """Return the file name of the image at ``index`` as text.

    The name dataset is read as an array of character codes (presumably uint16,
    as MATLAB v7.3 files store char arrays -- confirm against the data). Dumping
    the raw buffer, as ``tostring()`` did, yields bytes with a NUL after every
    character for such arrays, so the codes are decoded one by one instead.

    Args:
        struct: the opened digitStruct file.
        index: position of the image in ``struct['digitStruct']['name']``.

    Returns:
        The image file name as a string, e.g. '1.png'.
    """
    name_ref = struct['digitStruct']['name'][index][0]
    # ``dataset[()]`` replaces the ``.value`` property removed in h5py 3.0.
    char_codes = np.asarray(struct[name_ref][()]).reshape(-1)
    return ''.join(chr(int(code)) for code in char_codes)

def getAttributes(struct, index):
    """Collect every stored attribute of the image at ``index``.

    Returns:
        A tuple ``(image, label, top, left, width, height, size)`` where
        ``image`` is the value returned by ``getName``, the next five items are
        the arrays returned by ``getSingleAttribute`` and ``size`` is the number
        of entries in ``top``.
    """
    image = getName(struct, index)

    label = getSingleAttribute(struct, index, 'label')
    # A stored label of 10 is rewritten to 0 (presumably the dataset encodes the
    # digit zero as 10 -- confirm against the dataset description).
    for position, value in enumerate(label):
        if value == 10:
            label[position] = 0

    top, left, width, height = [
        getSingleAttribute(struct, index, key)
        for key in ('top', 'left', 'width', 'height')
    ]
    return image, label, top, left, width, height, len(top)



Before I extract the features of the dataset, I need to know the max size of each attribute in order to prepare the numpy arrays.

In [5]:
# Find the longest digit sequence in the training set; it determines how wide the
# per-image arrays have to be. The number of images is read from the file rather
# than hard-coded, so the cell stays correct if the data changes.
num_train_images = train_digitStruct['digitStruct']['bbox'].shape[0]

label_maxsize = 0
for i in range(num_train_images):
    label_length = len(getSingleAttribute(train_digitStruct, i, 'label'))
    label_maxsize = max(label_maxsize, label_length)

# The value is a maximum; the original (misleading) name is kept as an alias in
# case another cell still refers to it.
label_minsize = label_maxsize

print(label_maxsize)

KeyboardInterrupt: 

Through this analysis we know that the maximum array size of each feature is 6.

In [6]:
import time

def openFile(path):
    """Open the HDF5-based .mat file at ``path`` read-only and return the handle.

    The mode is passed explicitly because older h5py releases default to a
    read/write mode when none is given; the notebook only reads these files.
    The caller is responsible for closing the returned file.
    """
    return h5py.File(path, 'r')

def initDataStructures(size, max_digits=6):
    """Allocate the output arrays for ``size`` images.

    Args:
        size: number of images (rows) to allocate.
        max_digits: number of columns reserved per image in the per-digit
            arrays. Defaults to 6, the width used throughout this notebook.

    Returns:
        A tuple ``(images, labels, tops, lefts, widths, heights, sizes)``:
        ``images`` is an object array for the file names, ``labels`` is a float
        array pre-filled with 10, ``tops``/``lefts``/``widths``/``heights`` are
        float arrays of zeros, and ``sizes`` is an int array of zeros.
    """
    # dtype=str would create a '<U1' array and silently cut every file name down
    # to its first character; an object array stores the names unchanged.
    images = np.empty(shape=size, dtype=object)
    # 10 marks an unused digit slot (real labels are written over it later).
    labels = np.full(shape=(size, max_digits), fill_value=10, dtype=float)
    tops = np.zeros(shape=(size, max_digits), dtype=float)
    lefts = np.zeros(shape=(size, max_digits), dtype=float)
    widths = np.zeros(shape=(size, max_digits), dtype=float)
    heights = np.zeros(shape=(size, max_digits), dtype=float)
    # Zero-initialised instead of np.empty so no uninitialised memory is exposed.
    sizes = np.zeros(shape=size, dtype=int)
    return images, labels, tops, lefts, widths, heights, sizes

def processing(indices, digitStruct, start=0):
    """Read ``indices`` consecutive images from ``digitStruct`` into arrays.

    Args:
        indices: number of images to read (a count, despite the name).
        digitStruct: the opened digitStruct file.
        start: position of the first image to read.

    Returns:
        The tuple ``(images, labels, tops, lefts, widths, heights, sizes)``
        created by ``initDataStructures`` and filled row by row.
    """
    buffers = initDataStructures(indices)
    images, sizes = buffers[0], buffers[6]
    per_digit_buffers = buffers[1:6]  # labels, tops, lefts, widths, heights

    for row in range(indices):
        record = getAttributes(digitStruct, row + start)
        images[row] = record[0]
        # Copy each attribute into the leading columns of its row; the remaining
        # columns keep their initial fill value.
        for target, values in zip(per_digit_buffers, record[1:6]):
            target[row, :values.shape[0]] = values
        sizes[row] = record[6]

    return buffers

def processData(path, validSize=0, predictSize=0, extra=False):
    """Load the annotations stored in the digitStruct file at ``path``.

    Args:
        path: location of the digitStruct.mat file.
        validSize: with ``extra`` set, number of images for the first split.
        predictSize: with ``extra`` set, number of images for the second split,
            read directly after the first ``validSize`` images.
        extra: when true, return the two splits described above instead of the
            whole file.

    Returns:
        ``(validData, predictData)`` when ``extra`` is true, otherwise a single
        ``(images, labels, tops, lefts, widths, heights, sizes)`` tuple covering
        every entry in the file.
    """
    digitStruct = openFile(path)
    try:
        if extra:
            validData = processing(validSize, digitStruct)
            predictData = processing(predictSize, digitStruct, start=validSize)
            return validData, predictData

        indices = digitStruct['digitStruct']['bbox'].shape[0]
        return processing(indices, digitStruct)
    finally:
        # Everything needed has been copied into numpy arrays by now, so the
        # file handle is released instead of staying open for the kernel's life.
        digitStruct.close()

In [None]:
# Number of images taken from the start of the 'extra' set for each split.
VALID_SIZE = 15000
PREDICT_SIZE = 5000

# The train and test sets are read in full.
trainData = processData('train/digitStruct.mat')
testData = processData('test/digitStruct.mat')

# Validation uses the first VALID_SIZE images, prediction the PREDICT_SIZE after.
validData, predictData = processData(
    'extra/digitStruct.mat',
    validSize=VALID_SIZE,
    predictSize=PREDICT_SIZE,
    extra=True,
)

In [None]:
def createFeaturesDict(dataset):
    """Turn a positional dataset sequence into a dict keyed by feature name.

    ``dataset`` is indexed at positions 0 through 6, which are stored under
    'images', 'labels', 'tops', 'lefts', 'widths', 'heights' and 'sizes'
    respectively.
    """
    feature_names = (
        'images', 'labels', 'tops', 'lefts', 'widths', 'heights', 'sizes',
    )
    return {name: dataset[position] for position, name in enumerate(feature_names)}

def maybePickle(train, test, valid, predict):
    """Write the four datasets to 'svhn.pickle' unless that file already exists.

    Each argument is a dataset tuple accepted by ``createFeaturesDict``; the
    pickle holds a dict with the keys 'train', 'valid', 'test' and 'predict'.
    A failure to write is reported with ``print`` rather than raised.
    """
    pickleFile = 'svhn.pickle'
    if os.path.exists(pickleFile):
        print('The file ' + pickleFile + ' is already present.')
        return

    print('The file ' + pickleFile + ' will be created.')
    data = {
        'train': createFeaturesDict(train),
        'valid': createFeaturesDict(valid),
        'test': createFeaturesDict(test),
        'predict': createFeaturesDict(predict)
    }
    try:
        with open(pickleFile, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        print('Unable to save data to', pickleFile, ':', e)
        # Remove any partially written file; otherwise the existence check above
        # would treat a truncated pickle as valid on the next run.
        try:
            os.remove(pickleFile)
        except OSError:
            # Nothing was created, or it cannot be removed; the failure has
            # already been reported above.
            pass

In [8]:
# Keyword arguments make the train/test/valid/predict order explicit.
maybePickle(
    train=trainData,
    test=testData,
    valid=validData,
    predict=predictData,
)

The file svhn.pickle will be created.
