# Reading Writing Converting Examples

This notebook provides a list of small examples for writing and reading data in TensorFlow from different sources (NumPy arrays, CSV files, ...).

More in the official [documentation](https://www.tensorflow.org/how_tos/reading_data/)

Also a good [introduction](https://warmspringwinds.github.io/tensorflow/tf-slim/2016/12/21/tfrecords-guide/)

In [3]:
import argparse, os
import tensorflow as tf
import numpy as np
from pprint import pprint 

# Declared up front so the name exists before the command line is parsed.
FLAGS = None

# The only option is the working directory for downloaded data files and
# for the converted records.
parser = argparse.ArgumentParser()
parser.add_argument('--directory', type=str, default='data',
                    help='Directory to download data files and write the converted result')

# parse_known_args() hands back unrecognised arguments in `unparsed`
# instead of treating them as an error.
FLAGS, unparsed = parser.parse_known_args()


## Standard Tensorflow Format



In [17]:
def _int64_feature(value, verbose=False):
    """Wrap a single value in a ``tf.train.Feature`` backed by an ``Int64List``.

    Args:
        value: Stored as the only element of the int64 list.
        verbose: When true, print the raw value followed by the built feature.

    Returns:
        The ``tf.train.Feature`` built from ``value``.
    """
    int64_list = tf.train.Int64List(value=[value])
    feature = tf.train.Feature(int64_list=int64_list)
    if verbose:
        print(value, feature)
    return feature


def _bytes_feature(value, verbose=False):
    """Wrap a single value in a ``tf.train.Feature`` backed by a ``BytesList``.

    Args:
        value: Stored as the only element of the bytes list.
        verbose: When true, print the raw value followed by the built feature.

    Returns:
        The ``tf.train.Feature`` built from ``value``.
    """
    bytes_list = tf.train.BytesList(value=[value])
    feature = tf.train.Feature(bytes_list=bytes_list)
    if verbose:
        print(value, feature)
    return feature

def fmat(x):
    """Return a random binary vector of length ``x``.

    Each entry is 1 where a uniform sample drawn with ``np.random.rand``
    exceeds 0.5 and 0 otherwise; the result is stored as ``np.uint8``.
    """
    return (np.random.rand(x) > 0.5).astype(np.uint8)


# Toy dataset: 20 random binary vectors of length 10 ...
list_of_data = [fmat(10) for _ in range(20)]
# ... and one random 0/1 label per vector.
list_of_labels = [0 if np.random.rand() > 0.5 else 1 for _ in range(20)]

#pprint(list_of_data)
#pprint(list_of_labels)

Writing a simple list of matrices
-----------------------------
Adapted from this [repo](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/how_tos/reading_data/convert_to_records.py)

In [18]:
# Set to True to print every feature as it is built.
verbose = False

num_examples = len(list_of_data)

# Every matrix needs exactly one label.
assert len(list_of_data) == len(list_of_labels)

size = list_of_data[0].shape[0]

# Make sure the output directory exists before the writer opens the file.
if FLAGS.directory:
    os.makedirs(FLAGS.directory, exist_ok=True)

filename = os.path.join(FLAGS.directory, 'simple_list_of_matrix.tfrecords')
print('Writing', filename)

# The context manager closes the writer even if building or serialising an
# example fails part-way through.
with tf.python_io.TFRecordWriter(filename) as writer:
    for raw_matrix, raw_label in zip(list_of_data, list_of_labels):
        # tobytes() is the supported spelling of the deprecated tostring().
        data_raw = raw_matrix.tobytes()
        example = tf.train.Example(features=tf.train.Features(feature={
            'label': _int64_feature(int(raw_label), verbose),
            'mat_raw': _bytes_feature(data_raw, verbose)}))
        writer.write(example.SerializeToString())

Writing data/simple_list_of_matrix.tfrecords


Reading a simple list of matrices
-----------------------------
Adapted from this [repo](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py)

In [19]:
# Collects one (matrix, label) tuple per record read back from the file.
reconstructed_matrix = []

record_iterator = tf.python_io.tf_record_iterator(path=filename)

for record in record_iterator:
    # Each record is a serialised tf.train.Example; parse it back.
    example = tf.train.Example()
    example.ParseFromString(record)
    feature_map = example.features.feature
    label = int(feature_map['label'].int64_list.value[0])
    mat_string = feature_map['mat_raw'].bytes_list.value[0]
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    # It returns a read-only view of the bytes, so copy to keep the
    # writable array that np.fromstring used to produce.
    mat_1d = np.frombuffer(mat_string, dtype=np.uint8).copy()
    # If the stored data were multi-dimensional, reshape here, e.g.
    # mat_1d.reshape((height, width, -1)).
    reconstructed_matrix.append((mat_1d, label))

In [20]:
# Check that every matrix/label pair survived the write/read round trip.
# Indexing (rather than zip) keeps the IndexError if fewer records were
# read back than were written.
for i, original_mat in enumerate(list_of_data):
    original_lab = list_of_labels[i]
    reconstructed_mat, reconstructed_lab = reconstructed_matrix[i][0], reconstructed_matrix[i][1]
    mats_match = np.allclose(original_mat, reconstructed_mat)
    labs_match = np.allclose(original_lab, reconstructed_lab)
    print(mats_match, labs_match)

True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True
True True


## Batch of Records

In [24]:
def read_and_decode(filename_queue, mat_size=10, batch_size=2):
    """Build the graph ops that read, decode and shuffle-batch the records.

    Args:
        filename_queue: Queue of TFRecord file names passed to
            ``tf.TFRecordReader.read``.
        mat_size: Number of uint8 values stored in each ``'mat_raw'``
            feature. Defaults to 10, the vector length used in this file.
        batch_size: Number of examples per batch. Defaults to 2.

    Returns:
        A ``(mat, label)`` pair of batched tensors: ``mat`` is uint8 with
        shape ``[batch_size, mat_size]`` and ``label`` is int64 with shape
        ``[batch_size, 1]``.
    """
    reader = tf.TFRecordReader()

    _, serialized_example = reader.read(filename_queue)

    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'label': tf.FixedLenFeature([], tf.int64),
            'mat_raw': tf.FixedLenFeature([], tf.string),
        })

    # Decode the scalar byte string back into a uint8 vector and pin its
    # length to mat_size, so the tensor handed to shuffle_batch has a
    # known shape.
    mat = tf.decode_raw(features['mat_raw'], tf.uint8)
    mat = tf.reshape(mat, [mat_size])

    # Turn the scalar label into a one-element vector.
    label = tf.cast(features['label'], tf.int64)
    label = tf.reshape(label, [1])

    # Randomize and batch the examples.
    mat, label = tf.train.shuffle_batch([mat, label],
                                        batch_size=batch_size,
                                        capacity=30,
                                        num_threads=2,
                                        min_after_dequeue=10)

    return mat, label


In [26]:
# Queue that yields the record file name for 10 epochs.
filename_queue = tf.train.string_input_producer([filename], num_epochs=10)

# Even when reading in multiple threads, share the filename
# queue.
matrix, label = read_and_decode(filename_queue)

# The op for initializing the variables (both global and local).
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

with tf.Session() as sess:

    sess.run(init_op)

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    try:
        # Let's read off 3 batches just for example
        for i in range(3):
            mat, lab = sess.run([matrix, label])
            pprint("---- {}".format(i))
            # Print each example of the batch followed by its label.
            for row, row_label in zip(mat, lab):
                pprint(row)
                pprint(row_label)

            print('current batch')
    finally:
        # Always stop and join the queue-runner threads, even if reading
        # a batch raised.
        coord.request_stop()
        coord.join(threads)



'---- 0'
array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1], dtype=uint8)
array([1])
array([1, 1, 1, 0, 0, 0, 0, 1, 1, 0], dtype=uint8)
array([1])
current batch
'---- 1'
array([1, 0, 1, 1, 0, 0, 1, 0, 1, 1], dtype=uint8)
array([0])
array([1, 0, 0, 0, 0, 0, 1, 0, 1, 1], dtype=uint8)
array([0])
current batch
'---- 2'
array([1, 0, 0, 0, 1, 0, 1, 0, 1, 1], dtype=uint8)
array([1])
array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0], dtype=uint8)
array([1])
current batch


Notes on tf.train.shuffle_batch
----
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard2/tf.train.shuffle_batch.md