# Preprocessing
This notebook preprocesses the extracted data for training the model. To create the training data, the images are resized without losing the important information.

In [1]:
import numpy as np
from six.moves import cPickle as pickle
from scipy import ndimage
from scipy.misc import imresize, imsave
from PIL import Image

# Target size (in pixels) that every cropped image is resized to.
pixels_width = 32
pixels_height = 32
# Value used to centre and scale the pixel intensities after resizing
# (presumably the maximum 8-bit intensity -- confirm against the resize output).
pixels_depth = 255.0

In [2]:
# The stored pickle file is restored.
# Pickle data is binary, so the file is opened in 'rb' mode (text mode breaks
# on Python 3), and the context manager closes the handle after loading.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
with open('svhn.pickle', 'rb') as pickleFile:
    data = pickle.load(pickleFile)

# One entry per data split.
training = data['train']
testing = data['test']
valid = data['valid']
predict = data['predict']

In [4]:
# Different helper methods to extract the values from the bounding box
"""
This method removes the zero (and negative) values of the attributes.
It is very important to remove these values, because otherwise
the wrong values of the bounding boxes will be picked.
"""
def removeZeros(arr):
    """Return a new list holding only the strictly positive entries of ``arr``.

    Zero and negative entries are dropped; the relative order of the
    remaining entries is preserved and ``arr`` itself is not modified.
    """
    return [value for value in arr if value > 0]
  

def getLeftBound(lefts):
    """Return the smallest left coordinate, truncated to an int.

    An empty ``lefts`` yields 0.
    """
    return int(min(lefts)) if len(lefts) > 0 else 0

def getTopBound(tops):
    """Return the smallest top coordinate, truncated to an int.

    An empty ``tops`` yields 0.
    """
    return int(min(tops)) if len(tops) > 0 else 0

def getBottomBound(top, heights):
    """Return the bottom coordinate: ``top`` plus the largest height, as an int.

    When ``heights`` is empty there is no height to add, so ``int(top)`` is
    returned instead of letting ``max()`` raise ``ValueError``. This matches
    the other bound helpers, which also fall back to a default on empty input.
    """
    if len(heights) == 0:
        return int(top)
    return int(max(heights) + top)


def getRightBound(lefts, widths):
    """Return the right coordinate as an int.

    The result is the largest value in ``lefts`` plus the sum of the absolute
    values in ``widths``. An empty sequence contributes 0 to that sum.
    """
    rightmost_left = 0 if len(lefts) == 0 else max(lefts)

    if len(widths) == 0:
        total_width = 0
    else:
        absolute_widths = np.fabs(widths)
        total_width = np.sum(absolute_widths)

    return int(rightmost_left + total_width)

In [5]:
"""
This method is very important for the processing.
First, the correct bounds are extracted from the bounding-box data of the image.
Then the bounds are used to crop away the unimportant parts of the image.
In the next step, the remaining pixels of the image are resized to 32x32 pixels.
In the last step, the pixel values are normalized (centered and scaled by the pixel depth). Note that this is input normalization, not regularization.
"""

def manipulateImage(path, dataset, index):
    """Load one image, crop it to its bounding box, resize and normalise it.

    Args:
        path: Prefix that is concatenated directly with the image file name.
        dataset: Mapping with the entries 'images', 'lefts', 'widths',
            'tops' and 'heights', each indexable by ``index``.
        index: Position of the image inside ``dataset``.

    Returns:
        The resized image as a float array, shifted by ``pixels_depth / 2``
        and divided by ``pixels_depth``.
    """
    # Average over the third axis (presumably the colour channels -- confirm)
    # to obtain a single-plane image.
    pixels = np.average(ndimage.imread(path + dataset['images'][index]), axis=2)

    # Non-positive entries are stripped before the bounds are derived.
    lefts, widths, tops, heights = (
        removeZeros(dataset[key][index])
        for key in ('lefts', 'widths', 'tops', 'heights')
    )

    left = getLeftBound(lefts)
    right = getRightBound(lefts, widths)
    top = max(getTopBound(tops), 0)
    bottom = getBottomBound(top, heights)

    cropped = pixels[top:bottom, left:right]
    if cropped.size == 0:
        # An empty crop falls back to the whole image.
        cropped = pixels

    resized = imresize(cropped, (pixels_height, pixels_width)).astype(float)
    return (resized - pixels_depth / 2) / pixels_depth


def generateNewImages(path, dataset):
    """Create the preprocessed image stack for one dataset split.

    Args:
        path: Prefix handed to ``manipulateImage``, which concatenates it
            directly with each image file name.
        dataset: Mapping whose 'images' entry determines how many images are
            processed; it is passed through to ``manipulateImage``.

    Returns:
        A float32 array of shape
        ``(len(dataset['images']), pixels_height, pixels_width)``.
    """
    count = len(dataset['images'])
    # Uninitialised storage is fine here: every slot is overwritten below.
    generatedData = np.empty((count, pixels_height, pixels_width), dtype=np.float32)
    for index in range(count):
        generatedData[index, :, :] = manipulateImage(path, dataset, index)

    return generatedData

In [6]:
# Preprocess the images of every split. Both the validation and the
# prediction split read their image files from the "extra/" folder.
training_X = generateNewImages(path="train/", dataset=training)
testing_X = generateNewImages(path="test/", dataset=testing)
valid_X = generateNewImages(path="extra/", dataset=valid)
predict_X = generateNewImages(path="extra/", dataset=predict)

In [8]:
# The labels are taken over unchanged from the restored data;
# only the images are preprocessed.
training_Y = training['labels']
testing_Y = testing['labels']
valid_Y = valid['labels']
predict_Y = predict['labels']

In [10]:
# In the last code section the processed information will be saved into a pickle file.

def createDataLabelDict(X, Y):
    """Pair the data ``X`` with the labels ``Y`` in a dict with the keys 'data' and 'label'."""
    return dict(data=X, label=Y)

# Bundle every split with its labels and write the result into one pickle file.
tensorflow_file = 'tensorflow_data.pickle'
try:
    with open(tensorflow_file, 'wb') as f:
        dump = {
            name: createDataLabelDict(X, Y)
            for name, X, Y in (
                ('train', training_X, training_Y),
                ('test', testing_X, testing_Y),
                ('valid', valid_X, valid_Y),
                ('predict', predict_X, predict_Y),
            )
        }
        pickle.dump(dump, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    # Best effort: a failed save is reported but does not abort the notebook.
    print('Unable to save data to', tensorflow_file, ':', e)