# 2.1 CIFAR-10 dataset

## 2.1.1 CIFAR-10 summary
- 3 color channels
- width 32 height 32
- 10 classes

## 2.1.2 Download CIFAR-10 dataset

In [6]:
import cifar10
import tensorflow as tf

import os
import sys
import tarfile
import urllib

In [7]:
# Handle to TensorFlow's flag container; the download helper below reads FLAGS.data_dir.
FLAGS = tf.app.flags.FLAGS
# Destination folder for the downloaded archive and the extracted batches.
# NOTE(review): presumably overrides a data_dir flag declared by the imported cifar10 module - confirm.
FLAGS.data_dir = 'cifar10_data/'

In [11]:
# Instead of calling the library helper directly, i.e.
#     cifar10.maybe_download_and_extract()
# we re-implement it below so that we can study how it works step by step.

def maybe_download_and_extract():
    """Download the CIFAR-10 binary tarball and unpack it, skipping finished steps.

    The archive is stored in ``FLAGS.data_dir`` (created when missing).  The
    download is skipped when the tarball is already on disk, and the extraction
    is skipped when the ``cifar-10-batches-bin`` directory already exists.

    Raises:
        IOError / OSError: if the download, the rename or the extraction fails.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2;
    # importing it locally keeps the function usable on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    dest_directory = FLAGS.data_dir
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)

    DATA_URL = 'http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            # Some servers do not report a size (total_size <= 0); skip the
            # percentage instead of dividing by zero.
            if total_size <= 0:
                return
            # The last block usually overshoots the real size, so clamp at 100%.
            percent = min(100.0, float(count * block_size) / float(total_size) * 100.)
            sys.stdout.write('\r>> Downloading %s %.1f%%' % (filename, percent))
            sys.stdout.flush()

        # Download to a temporary name and rename when complete, so that an
        # interrupted transfer is never mistaken for a finished archive.
        partial_path = filepath + '.part'
        urlretrieve(DATA_URL, partial_path, _progress)
        os.rename(partial_path, filepath)

        # Terminate the progress line (a bare print() shows "()" on Python 2).
        sys.stdout.write('\n')
        statinfo = os.stat(filepath)
        # Single pre-formatted argument: prints the same text on Python 2 and 3.
        print('Successfully downloaded %s %d bytes.' % (filename, statinfo.st_size))

    extracted_dir_path = os.path.join(dest_directory, 'cifar-10-batches-bin')
    if not os.path.exists(extracted_dir_path):
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive, and the archive is fetched over plain HTTP.  Only use this
        # with a source you trust.
        with tarfile.open(filepath, 'r:gz') as archive:
            archive.extractall(dest_directory)

In [12]:
# Fetch and unpack the dataset into FLAGS.data_dir; steps already completed are skipped.
maybe_download_and_extract()

>> Downloading cifar-10-binary.tar.gz 100.0%()
('Successfully downloaded', 'cifar-10-binary.tar.gz', 170052171, 'bytes.')


## 2.1.3 TensorFlow data reading mechanism

In [13]:
# Output folder for the image copies written by the reader demo below.
# isdir (rather than exists) makes a stray regular *file* named 'read' fail
# right here in makedirs instead of later, at the first open() call.
if not os.path.isdir('read'):
    os.makedirs('read/')

In [None]:
# Demo: push three file names through a queue for five epochs and copy every
# file that comes out of the reader into read/test_<n>.jpg.
with tf.Session() as sess:
    # 3 images to read
    filename = ['A.jpg', 'B.jpg', 'C.jpg']

    # string_input_producer will produce a queue of file names;
    # shuffle=False keeps the A, B, C order inside every epoch
    filename_queue = tf.train.string_input_producer(filename, shuffle=False, num_epochs=5)

    # reader reads the whole content of the next file in the queue
    reader = tf.WholeFileReader()
    key, value = reader.read(filename_queue)

    # num_epochs is tracked in a local variable, which has to be initialised
    # before the queue runners start
    tf.local_variables_initializer().run()

    # the coordinator lets us stop and join the queue-runner threads cleanly
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    i = 0
    try:
        while not coord.should_stop():
            image_data = sess.run(value)
            # count only files that were actually read
            i += 1
            with open('read/test_%d.jpg' % i, 'wb') as f:
                f.write(image_data)
    except tf.errors.OutOfRangeError:
        # raised once all epochs are consumed, which means the queue is
        # finished; this is the normal way for the loop to end
        print('Queue finished after %d files.' % i)
    finally:
        coord.request_stop()
        coord.join(threads)

In [18]:
# Show what string_input_producer returned in the previous cell.
filename_queue

<tensorflow.python.ops.data_flow_ops.FIFOQueue at 0x7f39cc3d6710>

## 2.1.4 Save CIFAR-10 dataset as images

In [15]:
import scipy.misc

In [None]:
def read_cifar10(filename_queue):
    """Build the ops that read and parse one example from the CIFAR-10 binary files.

    Args:
        filename_queue: queue of file names to read from.

    Returns:
        An object with the following attributes:
            height, width, depth: image dimensions (32, 32, 3).
            key: tensor returned by the reader alongside the record.
            label: int32 tensor holding the label byte of the record.
            uint8image: uint8 tensor laid out as [height, width, depth].
    """
    class CIFAR10Record(object):
        """Plain attribute container for one parsed example."""

    # Every record is one label byte followed by the raw image bytes.
    label_bytes = 1
    height, width, depth = 32, 32, 3
    image_bytes = height * width * depth

    result = CIFAR10Record()
    result.height = height
    result.width = width
    result.depth = depth

    # Fixed-length reader: each read() returns exactly one record.
    reader = tf.FixedLengthRecordReader(record_bytes=label_bytes + image_bytes)
    result.key, value = reader.read(filename_queue)

    # Turn the record string into a flat vector of bytes.
    raw_record = tf.decode_raw(value, tf.uint8)

    # The first byte is the label.
    label_slice = tf.strided_slice(raw_record, [0], [label_bytes])
    result.label = tf.cast(label_slice, tf.int32)

    # The remaining bytes are reshaped to [depth, height, width] ...
    image_slice = tf.strided_slice(raw_record, [label_bytes], [label_bytes + image_bytes])
    depth_major = tf.reshape(image_slice, [depth, height, width])
    # ... and then reordered to [height, width, depth].
    result.uint8image = tf.transpose(depth_major, [1, 2, 0])

    return result


def inputs_origin(data_dir):
    """Build a tensor that yields one unprocessed CIFAR-10 training image per evaluation.

    Args:
        data_dir: directory containing data_batch_1.bin ... data_batch_5.bin.

    Returns:
        The uint8image tensor produced by read_cifar10, cast to float32.

    Raises:
        ValueError: if any of the five batch files is missing.
    """
    # range instead of xrange: identical result for five items, and it also
    # exists on Python 3 (the export loop in this notebook already uses range).
    filenames = [os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in range(1, 6)]

    # Fail early with a readable message instead of an error from the reader.
    for f in filenames:
        if not tf.gfile.Exists(f):
            raise ValueError('Failed to find file: ' + f)

    # Queue of file names feeding the record reader.
    filename_queue = tf.train.string_input_producer(filenames)
    read_input = read_cifar10(filename_queue)

    reshaped_image = tf.cast(read_input.uint8image, tf.float32)

    return reshaped_image


# Save the first 30 images delivered by the input pipeline as JPEG files.
with tf.Session() as sess:
    reshaped_image = inputs_origin('cifar10_data/cifar-10-batches-bin')

    # Initialise before any queue-runner thread is started.
    sess.run(tf.global_variables_initializer())

    # The coordinator lets us stop and join the queue-runner threads before
    # the session is closed, instead of leaving them running.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    if not os.path.exists('cifar10_data/images/'):
        os.makedirs('cifar10_data/images/')

    try:
        for i in range(30):
            image_array = sess.run(reshaped_image)
            # image_array is float32, and toimage() stretches float data from
            # its own min/max to 0..255 by default.  Pinning cmin/cmax to the
            # byte range keeps the pixel values of the dataset unchanged.
            scipy.misc.toimage(image_array, cmin=0, cmax=255).save('cifar10_data/images/%d.jpg' % i)
    finally:
        coord.request_stop()
        coord.join(threads)