# Feed Dataset through Queue to TensorFlow from HDFS

## Populate HDFS with Sample Dataset

In [None]:
%%bash

# Upload the local sample dataset directory to the HDFS root (-> /linear).
# -f overwrites files that already exist in HDFS, so this cell can be re-run
# (e.g. Restart & Run All) without failing with "File exists".
hadoop fs -copyFromLocal -f /root/datasets/linear /

In [None]:
%%bash

# List the contents of /linear in HDFS to confirm the upload.
hadoop fs -ls /linear

## Feed TensorFlow from HDFS through a `FIFOQueue`
`tf.train.string_input_producer()` uses `tf.FIFOQueue` internally.

In [None]:
import tensorflow as tf
import numpy as np

# Start from an empty default graph. Without this, every (re-)run of the cell
# adds another reader and queue runner to the same graph, and
# `tf.train.start_queue_runners` starts all runners collected in that graph.
tf.reset_default_graph()

# Queue of input file names: a single CSV file stored in HDFS.
hdfs_queue = tf.train.string_input_producer([
    "hdfs://127.0.0.1:39000/linear/training.csv",
])

# Each execution of the read op yields one line of text from the file.
reader = tf.TextLineReader()
_, value = reader.read(hdfs_queue)
# The record defaults declare two float columns per CSV line.
x_observed, y_observed = tf.decode_csv(value, [[0.0],[0.0]])

with tf.Session() as sess:
    # The coordinator lets us stop and join the background queue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord,
                                           sess=sess)
    try:
        n = 10
        print("First %d:\n" % n)
        for _ in range(n):
            # One sess.run dequeues and parses exactly one record.
            x, y = sess.run([x_observed, y_observed])
            print(np.asarray([x, y]))
    finally:
        # Always shut the queue threads down, even if reading failed.
        coord.request_stop()
        coord.join(threads)

## Feed Batched Data from HDFS to TensorFlow
`tf.train.batch` uses `tf.FIFOQueue` internally - similar to `tf.train.string_input_producer()`

`capacity` is the maximum number of elements the queue can hold, so it is also the prefetch maximum.  (Unlike `tf.train.shuffle_batch`, `tf.train.batch` has no `min_after_dequeue` parameter.)

`capacity` = `batch_size` * `num_threads`

In [None]:
import tensorflow as tf

# Start from an empty default graph. Otherwise the pipelines built by earlier
# cells (or earlier runs of this cell) are still registered, and
# `tf.train.start_queue_runners` below would start their reader threads too.
tf.reset_default_graph()

# Queue of input file names: a single CSV file stored in HDFS.
hdfs_queue = tf.train.string_input_producer([
    "hdfs://127.0.0.1:39000/linear/training.csv",
])

# Each execution of the read op yields one line of text from the file.
reader = tf.TextLineReader()
_, value = reader.read(hdfs_queue)
# The record defaults declare two float columns per CSV line.
x_observed, y_observed = tf.decode_csv(value, [[0.0],[0.0]])

# Group single records into batches of 10 through an internal queue.
# capacity (80) = batch_size (10) * num_threads (8).
x_observed_batch, y_observed_batch = \
    tf.train.batch([x_observed, y_observed],
                   batch_size=10,
                   capacity=80,
                   num_threads=8)

with tf.Session() as sess:
    # The coordinator lets us stop and join the background queue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord,
                                           sess=sess)
    try:
        print("Batch:\n")
        # One sess.run dequeues one full batch.
        x, y = sess.run([x_observed_batch, y_observed_batch])
        print(x, y)
    finally:
        # Always shut the queue threads down, even if reading failed.
        coord.request_stop()
        coord.join(threads)

## Feed Shuffled Batch Data from HDFS to TensorFlow
`tf.train.shuffle_batch` uses `tf.RandomShuffleQueue` internally.

`min_after_dequeue` defines the buffer size when randomly sampling.  Larger buffers require more RAM, but provide better shuffling characteristics.

`capacity` must be larger than `min_after_dequeue`.  The difference in size becomes the prefetch maximum.

`capacity` = `batch_size` * (`num_threads` + `some_safety_margin`) + `min_after_dequeue`

In [None]:
import tensorflow as tf

# Start from an empty default graph. Otherwise the pipelines built by earlier
# cells (or earlier runs of this cell) are still registered, and
# `tf.train.start_queue_runners` below would start their reader threads too.
tf.reset_default_graph()

# Queue of input file names: a single CSV file stored in HDFS.
shuffled_hdfs_queue = tf.train.string_input_producer([
    "hdfs://127.0.0.1:39000/linear/training.csv",
])

# Each execution of the read op yields one line of text from the file.
reader = tf.TextLineReader()
_, value = reader.read(shuffled_hdfs_queue)
# The record defaults declare two float columns per CSV line.
x_observed, y_observed = tf.decode_csv(value, [[0.0],[0.0]])

# Group records into randomly sampled batches of 10.
# capacity (100) = batch_size (10) * (num_threads (8) + margin (1))
#                  + min_after_dequeue (10).
x_observed_shuffled_batch, y_observed_shuffled_batch = \
    tf.train.shuffle_batch([x_observed, y_observed],
                           batch_size=10,
                           capacity=100,
                           min_after_dequeue=10,
                           num_threads=8)

with tf.Session() as sess:
    # The coordinator lets us stop and join the background queue threads.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord,
                                           sess=sess)
    try:
        print("Shuffled batch:\n")
        # One sess.run dequeues one full shuffled batch.
        x, y = sess.run([x_observed_shuffled_batch, y_observed_shuffled_batch])
        print(x, y)
    finally:
        # Always shut the queue threads down, even if reading failed.
        coord.request_stop()
        coord.join(threads)