In [31]:
from __future__ import print_function
import numpy as np
import tensorflow as tf
import os
import sys
from six.moves.urllib.request import urlretrieve
from shutil import copyfile
import IPython.core.debugger
import tarfile
import h5py
import random
from PIL import Image
from six.moves import cPickle as pickle

In [2]:
# Interactive debugger handle. The only uses visible in this notebook are the
# commented-out `dbg.set_trace()` calls, so this looks like a leftover debugging aid.
dbg = IPython.core.debugger.Pdb()

In [4]:
# Base address; maybe_download() appends an archive file name to it.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Archive file names handed to maybe_download().
TRAIN = 'train.tar.gz'
TEST = 'test.tar.gz'
EXTRA = 'extra.tar.gz'
def maybe_download(filename, work_dir):
    """Download ``filename`` from the module-level ``url`` unless it's already here.

        Args:
            filename: string, name of the file in the directory; it is also
                appended to ``url`` to build the download address.
            work_dir: string, path to working directory. It is created,
                including missing parent directories, when it does not exist.
        Returns:
            Path to resulting file.
        Raises:
            Whatever ``urlretrieve`` raises when the download fails. The
            partially downloaded file is removed first, so a later call
            retries instead of reusing a truncated archive.

    """
    if not os.path.exists(work_dir):
        # makedirs rather than mkdir so that a nested work_dir also works.
        os.makedirs(work_dir)
    file_path = os.path.join(work_dir, filename)
    if not os.path.exists(file_path):
        # Download next to the destination and rename only after the transfer
        # finished: no stray temp file is left behind, and an interrupted
        # download can never be mistaken for a complete archive.
        partial_path = file_path + '.part'
        try:
            urlretrieve(url + filename, partial_path)
            os.rename(partial_path, file_path)
        except BaseException:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise

    # NOTE: despite the wording of the message, only the file size is
    # reported; no checksum or expected-size comparison is performed.
    print('Found and verified:', filename, os.stat(file_path).st_size, 'bytes')
    return file_path


In [7]:
# Directory that receives the downloaded archives; later cells also extract
# into it and write the tfrecords / pickle files there.
work_dir = 'SVHN/'
# maybe_download() returns the local path and skips files that already exist.
train_path = maybe_download(TRAIN,work_dir)
test_path = maybe_download(TEST,work_dir)
extra_path = maybe_download(EXTRA, work_dir)

Found and verified: train.tar.gz 404141560 bytes
Found and verified: test.tar.gz 276555967 bytes
Found and verified: extra.tar.gz 1955489752 bytes
SVHN/test.tar.gz


In [13]:
def maybe_extract(filename):
    """Extract a ``.tar.gz`` archive next to itself unless that was already done.

    Args:
        filename: string, path to the archive, e.g. ``SVHN/train.tar.gz``.
    Returns:
        ``filename`` with its last two extensions stripped
        (``SVHN/train.tar.gz`` -> ``SVHN/train``). Extraction is skipped when
        that path is already a directory.
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    if os.path.isdir(root):
        print('%s already present - Skipping extraction of %s' %(root, filename))
    else:
        print('Extracting data for %s. This may take a while. Please wait.' %root)
        sys.stdout.flush()
        # Extract beside the archive so the result lands where `root` points,
        # independent of any module-level working-directory variable.
        dest_dir = os.path.dirname(filename) or '.'
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        # The context manager guarantees the archive handle is closed
        # (the previous `tar.close` without parentheses never closed it).
        with tarfile.open(filename) as tar:
            tar.extractall(dest_dir)
        print('Extracting done!')
    return root
# Unpack the train and test archives; the `extra` archive is downloaded above
# but not extracted in this cell.
train_dir = maybe_extract(train_path)
test_dir = maybe_extract(test_path)

SVHN/train already present - Skipping extraction of SVHN/train.tar.gz
SVHN/test already present - Skipping extraction of SVHN/test.tar.gz


In [26]:
class ExampleReader(object):
    """Converts image files plus their digitStruct annotations into tf.train.Example protos.

    The reader keeps an internal pointer into ``path_to_image_files``; every
    call to ``read_and_convert`` consumes one or more paths from that list.
    """

    def __init__(self, path_to_image_files):
        """Store the list of image paths and reset the read pointer."""
        self._path_to_image_files = path_to_image_files
        self._num_examples = len(self._path_to_image_files)
        self._example_pointer = 0

    @staticmethod
    def _get_attrs(digit_struct_mat_file, index):
        """
        Returns a dictionary which contains keys: label, left, top, width and height, each key has multiple values.

        Args:
            digit_struct_mat_file: open h5py file (or an object indexable the
                same way) holding the ``digitStruct`` group.
            index: zero-based position of the entry inside ``digitStruct/bbox``.
        Returns:
            dict mapping each key to a list with one value per stored entry.
        """
        attrs = {}
        f = digit_struct_mat_file
        item = f['digitStruct']['bbox'][index].item()
        for key in ['label', 'left', 'top', 'width', 'height']:
            attr = f[item][key]
            # Datasets are indexed directly instead of through `.value`,
            # which was deprecated and then removed in h5py 3.0.
            if len(attr) > 1:
                # Several rows: each row holds a reference that has to be
                # dereferenced through the file to reach the scalar.
                values = [f[attr[i].item()][0][0] for i in range(len(attr))]
            else:
                # Single row: the scalar is stored inline.
                values = [attr[0][0]]
            attrs[key] = values
        return attrs

    @staticmethod
    def _preprocess(image, bbox_left, bbox_top, bbox_width, bbox_height):
        """Crop ``image`` to the bounding box enlarged by 30% and resize it to 64x64.

        The crop origin is shifted up/left by 15% of the box size and the box
        is scaled by 1.3, i.e. a 15% margin is added on every side.
        """
        cropped_left = int(round(bbox_left - 0.15 * bbox_width))
        cropped_top = int(round(bbox_top - 0.15 * bbox_height))
        cropped_width = int(round(bbox_width * 1.3))
        cropped_height = int(round(bbox_height * 1.3))
        image = image.crop((cropped_left, cropped_top,
                            cropped_left + cropped_width, cropped_top + cropped_height))
        image = image.resize((64, 64))
        return image

    @staticmethod
    def _int64_feature(value):
        """Wrap a single integer in a tf.train.Feature."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    @staticmethod
    def _float_feature(value):
        """Wrap a single float in a tf.train.Feature."""
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    @staticmethod
    def _bytes_feature(value):
        """Wrap a single bytes value in a tf.train.Feature."""
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def read_and_convert(self, digit_struct_mat_file):
        """
        Read and convert to example, returns None if no data is available.

        Images annotated with more than five digits are skipped; the next
        image is tried instead. The returned tf.train.Example has the
        features ``image`` (raw bytes of the 64x64 crop), ``length`` (number
        of digits) and ``digits`` (five ints, padded with 10 for "no digit").
        """
        # Iterative skipping (instead of recursion) keeps the stack flat no
        # matter how many consecutive examples have to be skipped.
        while self._example_pointer < self._num_examples:
            path_to_image_file = self._path_to_image_files[self._example_pointer]
            # File names are 1-based ("1.png", "2.png", ...), annotation
            # indices are 0-based. basename() handles any path separator.
            index = int(os.path.basename(path_to_image_file).split('.')[0]) - 1
            self._example_pointer += 1

            attrs = ExampleReader._get_attrs(digit_struct_mat_file, index)
            label_of_digits = attrs['label']
            length = len(label_of_digits)
            if length > 5:
                # skip this example
                continue

            digits = [10, 10, 10, 10, 10]   # digit 10 represents no digit
            for idx, label_of_digit in enumerate(label_of_digits):
                # label 10 is essentially digit zero
                digits[idx] = int(label_of_digit if label_of_digit != 10 else 0)

            lefts = [int(v) for v in attrs['left']]
            tops = [int(v) for v in attrs['top']]
            widths = [int(v) for v in attrs['width']]
            heights = [int(v) for v in attrs['height']]

            # Union of all per-digit boxes.
            min_left = min(lefts)
            min_top = min(tops)
            max_right = max(left + width for left, width in zip(lefts, widths))
            max_bottom = max(top + height for top, height in zip(tops, heights))

            # Square box around the centre of the union, sized to its longer side.
            center_x = (min_left + max_right) / 2.0
            center_y = (min_top + max_bottom) / 2.0
            max_side = max(max_right - min_left, max_bottom - min_top)
            bbox_left = center_x - max_side / 2.0
            bbox_top = center_y - max_side / 2.0

            cropped = ExampleReader._preprocess(Image.open(path_to_image_file),
                                                bbox_left, bbox_top, max_side, max_side)
            image = np.array(cropped).tobytes()

            example = tf.train.Example(features=tf.train.Features(feature={
                'image': ExampleReader._bytes_feature(image),
                'length': ExampleReader._int64_feature(length),
                'digits': tf.train.Feature(int64_list=tf.train.Int64List(value=digits))
            }))
            return example
        return None

In [18]:
def convert_to_tfrecords(dataset_and_digit_struct, tfrecords, writer_callback):
    """Convert image directories into TFRecord files.

    Args:
        dataset_and_digit_struct: iterable of ``(image_dir, digit_struct_path)``
            pairs; every ``*.png`` inside ``image_dir`` is a candidate example.
        tfrecords: list of output TFRecord paths; one writer is opened per path.
        writer_callback: called with ``tfrecords`` once per example and must
            return the index of the writer that receives the example.
    Returns:
        List with the number of examples written to each path, in the same
        order as ``tfrecords``.
    """
    num_examples = []
    writers = []

    # try/finally guarantees every opened writer is closed (and its file
    # flushed) even when reading or converting an example raises.
    try:
        for record in tfrecords:
            num_examples.append(0)
            writers.append(tf.python_io.TFRecordWriter(record))
        for dataset, digit_struct in dataset_and_digit_struct:
            image_files = tf.gfile.Glob(os.path.join(dataset, '*.png'))
            total_files = len(image_files)
            print('%d files found in %s' %(total_files,dataset))
            with h5py.File(digit_struct,'r') as f:
                example_reader = ExampleReader(image_files)
                for index, image_file in enumerate(image_files):
                    # NOTE: the reader advances its own pointer and may skip
                    # files, so after a skip `image_file` in this progress
                    # message is no longer the file actually being converted.
                    if index%10 == 0:
                        print('(%d/%d) processing %s' % (index+1, total_files, image_file))
                    example = example_reader.read_and_convert(f)
                    if example is None:
                        # Reader exhausted (happens early when files were skipped).
                        break
                    idx = writer_callback(tfrecords)
                    writers[idx].write(example.SerializeToString())
                    num_examples[idx] += 1
    finally:
        for writer in writers:
            writer.close()
    return num_examples

In [None]:
def create_tfrecords_meta_file(num_train_examples, num_val_examples, num_test_examples,
                               path_to_tfrecords_meta_file):
    """Store the three example counts in a ``Meta`` object and save it.

    Args:
        num_train_examples: number of training examples.
        num_val_examples: number of validation examples.
        num_test_examples: number of test examples.
        path_to_tfrecords_meta_file: destination passed to ``Meta.save``.
    """
    # print() as a function: the notebook imports print_function from
    # __future__, so the former print statement was a SyntaxError.
    print('Saving meta file to %s...' % path_to_tfrecords_meta_file)
    # NOTE(review): `Meta` is not defined in the visible part of this notebook;
    # it must be defined or imported before this function is called.
    meta = Meta()
    meta.num_train_examples = num_train_examples
    meta.num_val_examples = num_val_examples
    meta.num_test_examples = num_test_examples
    meta.save(path_to_tfrecords_meta_file)

In [28]:
train_struct_mat = os.path.join(train_dir, 'digitStruct.mat')
test_struct_mat = os.path.join(test_dir, 'digitStruct.mat')
train_tfrecords = os.path.join(work_dir, 'train.tfrecords')
valid_tfrecords = os.path.join(work_dir, 'valid.tfrecords')
test_tfrecords = os.path.join(work_dir, 'test.tfrecords')


def _count_tfrecords(path_to_tfrecords):
    """Return the number of serialized records stored in a TFRecord file."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path_to_tfrecords))


# The train and validation files are produced together by a single
# convert_to_tfrecords() call, so both are regenerated when either is missing.
if not (os.path.exists(train_tfrecords) and os.path.exists(valid_tfrecords)):
    print ('Processing training and validation data...')
    # Roughly 90% of the examples go to writer 0 (train), the rest to writer 1
    # (validation).
    # NOTE(review): `random` is not seeded in the visible cells, so the split
    # differs between runs.
    [num_train_examples, num_val_examples] = convert_to_tfrecords([(train_dir,train_struct_mat)],[train_tfrecords, valid_tfrecords], lambda paths:0 if random.random()>0.1 else 1)
else:
    print('The file %s already exists'% train_tfrecords)
    # Recover the counts from the existing files so this cell also works on a
    # fresh kernel instead of relying on variables left over from an earlier run.
    num_train_examples = _count_tfrecords(train_tfrecords)
    num_val_examples = _count_tfrecords(valid_tfrecords)
print (num_train_examples, num_val_examples)
if not os.path.exists(test_tfrecords):
    print('Processing testing data...')
    [num_test_examples] = convert_to_tfrecords([(test_dir,test_struct_mat)],[test_tfrecords],lambda paths: 0)
else:
    print('The file %s already exists'% test_tfrecords)
    num_test_examples = _count_tfrecords(test_tfrecords)
print(num_test_examples)
    

The file SVHN/train.tfrecords already exists
29964 3437
The file SVHN/test.tfrecords already exists
13068


In [34]:
# Example counts produced by the tfrecords conversion cell; this dict is what
# gets pickled below.
metadata = {
    'num_train_examples' : num_train_examples,
    'num_val_examples' : num_val_examples,
    'num_test_examples' : num_test_examples
}
# root = '.'

def maybe_pickle(dataset,dest_dir,filename, force= False):
    """
    Pickle ``dataset`` to ``<dest_dir>/<filename>.pickle`` unless that file exists.

    Args:
         dataset: the object to pickle.
         dest_dir: path where you save the pickle files.
         filename: str, name of the pickle file without the ``.pickle`` suffix.
         force: when True, overwrite an existing pickle file.
    Return:
         Path of the pickle file. The path is returned even when saving
         failed; in that case the error is printed and no file is left behind.
    """

    file_path = os.path.join(dest_dir, filename) + '.pickle'
    if os.path.exists(file_path) and not force:
        print('%s already present - Skipping pickling.' % filename)
    else:
        print('Pickling %s.' % file_path)
        try:
            with open(file_path, 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', filename, ':', e)
            # open(..., 'wb') has already created/truncated the file; remove
            # it so the next call does not skip on an empty or partial pickle.
            if os.path.exists(file_path):
                try:
                    os.remove(file_path)
                except OSError:
                    pass
    return file_path

# Writes <work_dir>/metadata.pickle. With the default force=False an existing
# file is kept as-is, so a stale pickle is not refreshed when the counts change.
meta_pickle = maybe_pickle(metadata, work_dir, 'metadata' )


metadata already present - Skipping pickling.


In [10]:
class A(object):
    """Toy class contrasting instance methods, classmethods and staticmethods."""

    def foo(self, x):
        """Instance method: the instance arrives as ``self``."""
        message = "executing foo(%s, %s)" % (self, x)  # self --> __main__.A object at...
        print(message)

    @classmethod
    def class_foo(cls, x):
        """Class method: the class itself arrives as ``cls`` (cls --> __main__.A)."""
        message = "executing class_foo(%s,%s)" % (cls, x)
        print(message)

    @staticmethod
    def static_foo(x):
        """Static method: neither the instance nor the class is passed in."""
        message = "executing static_foo(%s)" % x
        print(message)
a=A()
# Plain instance call: `a` is passed implicitly as `self`.
a.foo(1)
"""With classmethods, the class of the object instance is
   implicityly passed as the first argument instead of self.
   If you define something to be a classmethod it is probably
   because you intend to call it from the class rather than
   from a class instance."""
# class_foo receives the class A whether it is called on the instance or on the class.
a.class_foo(1)
A.class_foo(2)
# A.foo(2)
# static_foo receives only the explicit argument, from the instance or the class.
a.static_foo(1)
A.static_foo('hi')
# Print the bound/unbound method objects to show what each lookup returns.
print(a.foo)
print(a.class_foo)
print(A.class_foo)
print(a.static_foo)
print(A.static_foo)

"""classmethod must have a reference to a class object as the first parameter,
   whereas staticmethod can have no parameters at all."""

executing foo(<__main__.A object at 0x121f3add8>, 1)
executing class_foo(<class '__main__.A'>,1)
executing class_foo(<class '__main__.A'>,2)
executing static_foo(1)
executing static_foo(hi)
<bound method A.foo of <__main__.A object at 0x121f3add8>>
<bound method A.class_foo of <class '__main__.A'>>
<bound method A.class_foo of <class '__main__.A'>>
<function A.static_foo at 0x121f98ea0>
<function A.static_foo at 0x121f98ea0>


In [55]:
# Manual probe of the digitStruct layout, mirroring the lookups done in
# ExampleReader._get_attrs. The path is hard-coded rather than built from train_dir.
with h5py.File('SVHN/train/digitStruct.mat', 'r') as digit_struct_mat_file:
    # Entry 0 of digitStruct/bbox, unwrapped with .item() so it can index the file.
    item = digit_struct_mat_file['digitStruct']['bbox'][0].item()
    label = digit_struct_mat_file[item]['label']
    # Dereference the second row of `label` and print its scalar
    # (assumes entry 0 has at least two label rows — TODO confirm).
    print(digit_struct_mat_file[label[1].item()][0][0])
#     for i in digit_struct_mat_file[item].__iter__():
#         print(i)

9.0
