# Preparation
This notebook performs the data preparation needed for the later steps: it extracts the attributes stored in the `digitStruct.mat` files and saves them as a pickle file for further processing.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as sio
import h5py
import os
from six.moves import cPickle as pickle

In [2]:
# Open read-only: the file is only inspected here, and h5py releases before 3.0
# otherwise default to a read/write mode that creates the file when it is missing.
train_digitStruct = h5py.File('train/digitStruct.mat', 'r')

(33402, 1)


In [1]:
"""
Different helper methods to extract the information of the attributes 
from the digitStruct.mat file
"""

def getSingleAttribute(struct, index, attr):
    """Return the values of one bbox attribute for the entry at ``index``.

    Args:
        struct: Opened digitStruct file (an ``h5py.File``-like mapping).
        index: Position of the entry inside ``struct['digitStruct']['bbox']``.
        attr: Name of the attribute to read (the notebook uses 'label',
            'top', 'left', 'width' and 'height').

    Returns:
        A NumPy array with the attribute values. When the attribute is stored
        directly as float64 it is flattened to one dimension; otherwise every
        element is treated as a reference into ``struct`` and the referenced
        values are collected and squeezed.
    """
    bbox = struct[struct['digitStruct']['bbox'][index][0]]
    # ``dataset[()]`` reads the whole dataset. It replaces the ``.value``
    # property, which was deprecated and later removed from h5py.
    attribute = bbox[attr][()].squeeze()
    if attribute.dtype == 'float64':
        # Value stored inline; reshape so that a single value is still 1-D.
        return attribute.reshape(-1)
    else:
        # Elements are references: dereference each one through ``struct``.
        return np.array([struct[x][()] for x in attribute]).squeeze()

def getName(struct, index):
    """Return the image file name stored at ``index`` of the digitStruct.

    Args:
        struct: Opened digitStruct file (an ``h5py.File``-like mapping).
        index: Position of the entry inside ``struct['digitStruct']['name']``.

    Returns:
        The file name as a text string.
    """
    # ``dataset[()]`` replaces the ``.value`` property removed from h5py.
    codes = struct[struct['digitStruct']['name'][index][0]][()]
    # NOTE(review): assumes the dataset holds one integer character code per
    # character (how HDF5-based .mat files usually store char arrays) - confirm.
    # Decoding the codes directly avoids ``tostring()``, which returns bytes on
    # Python 3 and made the following ``replace`` with str arguments fail.
    # Zero codes are skipped, matching the original removal of "\x00".
    return ''.join(chr(int(code)) for code in codes.ravel() if code != 0)

def getAttributes(struct, index):
    """Collect every attribute of the digitStruct entry at ``index``.

    Returns:
        A tuple ``(image, label, top, left, width, height, size)`` where
        ``image`` is the file name, the next five items are the arrays read
        with ``getSingleAttribute`` and ``size`` is the length of ``top``.
    """
    fileName = getName(struct, index)
    digits = getSingleAttribute(struct, index, 'label')
    # Label value 10 is rewritten to 0 in place (presumably the dataset's
    # encoding of the digit zero - confirm against the dataset description).
    for pos, digit in enumerate(digits):
        if digit == 10:
            digits[pos] = 0
    top, left, width, height = [
        getSingleAttribute(struct, index, key)
        for key in ('top', 'left', 'width', 'height')
    ]
    return fileName, digits, top, left, width, height, len(top)



In [5]:
def openFile(path, mode='r'):
    """Open the HDF5-based .mat file at ``path`` and return the h5py handle.

    Args:
        path: Location of the file to open.
        mode: h5py open mode. Defaults to read-only, because h5py releases
            before 3.0 otherwise fall back to a read/write mode that creates
            the file when it does not exist.

    Returns:
        The opened ``h5py.File``; the caller is responsible for closing it.
    """
    return h5py.File(path, mode)

def initDataStructures(size):
    """Create the empty containers that hold the extracted attributes.

    Args:
        size: Number of entries (rows) to reserve.

    Returns:
        A tuple ``(images, labels, tops, lefts, widths, heights, sizes)``:
        an empty list, a ``(size, 6)`` float array filled with 10, four
        ``(size, 6)`` float arrays filled with 0, and an uninitialised integer
        array of length ``size``.
    """
    boxShape = (size, 6)
    images = []
    # 10 is the initial value of every label slot; slots that are never
    # overwritten keep it.
    labels = np.full(boxShape, 10, dtype=float)
    tops, lefts, widths, heights = (np.zeros(boxShape, dtype=float) for _ in range(4))
    sizes = np.empty(size, dtype=int)
    return images, labels, tops, lefts, widths, heights, sizes

def processing(indices, digitStruct, start=0):
    """Extract the attributes of ``indices`` consecutive digitStruct entries.

    Args:
        indices: Number of entries to read.
        digitStruct: Opened digitStruct file passed on to ``getAttributes``.
        start: Offset of the first entry to read inside the digitStruct.

    Returns:
        The tuple ``(images, labels, tops, lefts, widths, heights, sizes)``
        created by ``initDataStructures`` and filled row by row.
    """
    images, labels, tops, lefts, widths, heights, sizes = initDataStructures(indices)

    for row in range(indices):
        image, label, top, left, width, height, size = getAttributes(digitStruct, row + start)
        images.append(image)
        # Copy each attribute into the leading columns of its row; the
        # remaining columns keep their initial value.
        for target, values in ((labels, label), (tops, top), (lefts, left),
                               (widths, width), (heights, height)):
            target[row, :values.shape[0]] = values
        sizes[row] = size

    return images, labels, tops, lefts, widths, heights, sizes

def processData(path, validSize=0, predictSize=0, extra=False):
    """Extract the attributes stored in the digitStruct file at ``path``.

    Args:
        path: Location of the digitStruct.mat file.
        validSize: Number of entries for the validation set (used only when
            ``extra`` is set); they are read from the start of the file.
        predictSize: Number of entries for the prediction set (used only when
            ``extra`` is set); they are read starting at offset ``validSize``.
        extra: When true, build the validation and prediction sets instead of
            extracting every entry of the file.

    Returns:
        ``(validData, predictData)`` when ``extra`` is set, otherwise the
        tuple ``(images, labels, tops, lefts, widths, heights, sizes)`` for
        all entries of the file.
    """
    # The results are plain lists and NumPy arrays filled by ``processing``,
    # so the file can be closed as soon as extraction finishes; the ``with``
    # block also closes it when extraction raises.
    with openFile(path) as digitStruct:
        if extra:
            predictData = processing(predictSize, digitStruct, start=validSize)
            validData = processing(validSize, digitStruct)
            return validData, predictData

        indices = digitStruct['digitStruct']['bbox'].shape[0]
        return processing(indices, digitStruct)

In [6]:
# The train and test files are extracted completely.
trainData = processData(path='train/digitStruct.mat')
testData = processData(path='test/digitStruct.mat')
# The extra file supplies the validation and prediction sets.
validData, predictData = processData(
    path='extra/digitStruct.mat',
    extra=True,
    validSize=15000,
    predictSize=5000,
)

In [7]:
def createFeaturesDict(dataset):
    """Map the first seven items of ``dataset`` to their feature names.

    Args:
        dataset: Indexable sequence ordered as images, labels, tops, lefts,
            widths, heights, sizes.

    Returns:
        A dict with the keys 'images', 'labels', 'tops', 'lefts', 'widths',
        'heights' and 'sizes'.
    """
    featureNames = ('images', 'labels', 'tops', 'lefts', 'widths', 'heights', 'sizes')
    # Index explicitly so that a too-short dataset still raises IndexError.
    return {name: dataset[pos] for pos, name in enumerate(featureNames)}

def maybePickle(train, test, valid, predict, pickleFile='svhn.pickle'):
    """Save the extracted datasets to a pickle file unless it already exists.

    Args:
        train: Dataset tuple stored under the key 'train'.
        test: Dataset tuple stored under the key 'test'.
        valid: Dataset tuple stored under the key 'valid'.
        predict: Dataset tuple stored under the key 'predict'.
        pickleFile: Path of the pickle file to write.

    Each dataset is converted with ``createFeaturesDict`` before saving.
    Failures while writing are reported on stdout and not re-raised.
    """
    if os.path.exists(pickleFile):
        print('The file ' + pickleFile + ' is already present.')
        return

    data = {
        'train': createFeaturesDict(train),
        'valid': createFeaturesDict(valid),
        'test': createFeaturesDict(test),
        'predict': createFeaturesDict(predict)
    }
    try:
        with open(pickleFile, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        # Build a single string so the message prints the same on Python 2 and 3.
        print('Unable to save data to ' + pickleFile + ' : ' + str(e))
        # Remove the incomplete file; otherwise the next call would find it,
        # report it as already present and never write the data again.
        try:
            os.remove(pickleFile)
        except OSError:
            # The file was never created or cannot be removed; the failure
            # has already been reported above.
            pass
    else:
        # Reported only after the file has been written and closed.
        print('The file ' + pickleFile + ' is created.')

In [8]:
# Write all four datasets to disk (skipped when the pickle file already exists).
maybePickle(
    train=trainData,
    test=testData,
    valid=validData,
    predict=predictData,
)

The file svhn.pickle will be created.
