# TF Dataset API

Adapted from: https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html

In [None]:
# Root directory under which this notebook stores the files it creates.
# Windows users: You only need to change PATH, rest is platform independent
PATH = "/tmp/tf_dataset_and_estimator_apis"

In [None]:
import os
import shutil
import tensorflow as tf
# Compare (major, minor) numerically: comparing the raw version strings
# would wrongly reject e.g. "1.10", which sorts before "1.4" as text.
assert tuple(map(int, tf.__version__.split(".")[:2])) >= (1, 4), (
    "TensorFlow r1.4 or later is needed")

In [None]:
# Remote locations of the Iris training and test CSV files
URL_TRAIN = "http://download.tensorflow.org/data/iris_training.csv"
URL_TEST = "http://download.tensorflow.org/data/iris_test.csv"

# Local copies are kept in a "dataset" folder below PATH
PATH_DATASET = os.path.join(PATH, "dataset")
FILE_TRAIN, FILE_TEST = (
    os.path.join(PATH_DATASET, csv_name)
    for csv_name in ("iris_training.csv", "iris_test.csv")
)

In [None]:
import six.moves.urllib.request as request

In [None]:
def downloadDataset(url, file):
    """Download `url` to the local path `file` unless that file already exists.

    The PATH_DATASET directory is created first when it is missing.

    Args:
        url: Address of the resource to fetch.
        file: Destination path. If it already exists, nothing is requested
            and the existing file is left untouched.
    """
    if not os.path.exists(PATH_DATASET):
        os.makedirs(PATH_DATASET)
    if os.path.exists(file):
        return  # Already downloaded, nothing to do

    response = request.urlopen(url)
    try:
        data = response.read()
    finally:
        # Release the connection even when reading the body fails
        response.close()

    # The `with` block closes the file on exit; no explicit close() is needed
    with open(file, "wb") as f:
        f.write(data)

In [None]:
# Fetch the training and test CSV files; downloadDataset leaves a file that
# already exists locally untouched.
downloadDataset(URL_TRAIN, FILE_TRAIN)
downloadDataset(URL_TEST, FILE_TEST)

The CSV features in our training & test data

In [None]:
# Names of the four feature columns; the order matters because the names are
# paired positionally with the parsed CSV columns.
feature_names = [
    "SepalLength",
    "SepalWidth",
    "PetalLength",
    "PetalWidth",
]

Create an input function reading a file using the Dataset API. Then provide the results to the Estimator API.

Estimators require that you create a function of the following format:

```python
def input_fn():
    ...<code>...
    return ({ 'SepalLength':[values], ..<etc>.., 'PetalWidth':[values] },
            [IrisFlowerType])
```

The return value must be a two-element tuple organized as follows:

The first element must be a dict in which each key is an input feature name and each value is a list of that feature's values for the training batch. The second element is a list of labels for the training batch.

Notice how the function to parse the csv file is defined inside `my_input_fn`. 

In [None]:
def my_input_fn(file_path, perform_shuffle=False, repeat_count=1,
                batch_size=32, shuffle_buffer_size=256):
    """Build tensors that yield batches of (features, labels) from a CSV file.

    Args:
        file_path: Path of the CSV file to read; its first line is skipped
            as a header row.
        perform_shuffle: If True, shuffle the examples using a buffer of
            `shuffle_buffer_size` elements.
        repeat_count: Number of times the dataset is repeated (passed to
            `Dataset.repeat`).
        batch_size: Number of consecutive examples combined into each batch.
        shuffle_buffer_size: Size of the in-memory shuffle buffer; only used
            when `perform_shuffle` is True.

    Returns:
        A `(batch_features, batch_labels)` tuple. `batch_features` is a dict
        keyed by the entries of `feature_names`; `batch_labels` holds the
        matching labels.
    """
    def decode_csv(line):
        """Parse one CSV line into a (features dict, label) pair."""
        # Four float columns followed by one integer label column
        columns = tf.decode_csv(line, [[0.], [0.], [0.], [0.], [0]])
        # Last column is the label, everything before it is a feature
        features, label = columns[:-1], columns[-1]
        return dict(zip(feature_names, features)), label

    dataset = (tf.data.TextLineDataset(file_path)  # Read text file
               .skip(1)  # Skip header row
               .map(decode_csv))  # Transform each elem by applying decode_csv fn
    if perform_shuffle:
        # Randomizes input using a window of elements (read into memory)
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.repeat(repeat_count)  # Repeats dataset this # times
    dataset = dataset.batch(batch_size)  # Batch size to use
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

In [None]:
# Pair of tensors that yields shuffled batches of up to 32 training examples
next_batch = my_input_fn(FILE_TRAIN, perform_shuffle=True)

Now let's try it out, retrieving and printing one batch of data. Although this code looks strange, you don't need to understand the details.

In [None]:
# Evaluate the input tensors once inside a session to pull out a single batch
with tf.Session() as sess:
    first_batch = sess.run(next_batch)

# first_batch holds the evaluated (features, labels) pair
print(first_batch)

Create the feature_columns, which specify the input to our model. All our input features are numeric, so use numeric_column for each one.

In [None]:
# One numeric feature column per entry in feature_names
feature_columns = list(map(tf.feature_column.numeric_column, feature_names))

Create a deep neural network classifier. Use the DNNClassifier pre-made estimator.

In [None]:
# Directory that is handed to the estimator as its model_dir
PATH_MODEL = os.path.join(PATH, 'model')

# Start from an empty directory: remove whatever an earlier run left behind
if os.path.exists(PATH_MODEL):
    shutil.rmtree(PATH_MODEL)

os.makedirs(PATH_MODEL)

In [None]:
# Pre-made DNN classifier over the Iris feature columns
model = tf.estimator.DNNClassifier(
    hidden_units=[10, 10],  # Two hidden layers of 10 neurons each
    n_classes=3,  # Number of label classes
    feature_columns=feature_columns,  # The input features to our model
    model_dir=PATH_MODEL,  # Path to where checkpoints etc are stored
)

## Exercise 1

Train the model using the previous function `my_input_fn`.

The input to training is a file with training examples.

Stop training after 8 passes over the training data (epochs).

You will need to define a new input function with no arguments, as explained in the `model.train` documentation:


```python
"""
Signature: model.train(input_fn, hooks=None, steps=None, max_steps=None, saving_listeners=None)
Docstring:
Trains a model given training data input_fn.

Args:
  input_fn: Input function returning a tuple of:
      features - `Tensor` or dictionary of string feature name to `Tensor`.
      labels - `Tensor` or dictionary of `Tensor` with labels.
"""
```


In [None]:
# your code here
# model.train(...)

## Exercise 2

Evaluate the model using the examples contained in FILE_TEST using `model.evaluate`. Here too you will need to define a new function with no arguments.

In [None]:
# your code here
# evaluate_result = ...

In [None]:
print("Evaluation results")
# Show each entry of evaluate_result together with its value
for key, value in evaluate_result.items():
    print("   {}, was: {}".format(key, value))

## Exercise 3

Predict the type of some Iris flowers. Let's predict the examples in FILE_TEST, repeat only once.

In [None]:
# your code here
# predict_results = ...

In [None]:
print("Predictions on test file")
for prediction in predict_results:
    # Will print the predicted class, i.e: 0, 1, or 2 if the prediction
    # is Iris Setosa, Versicolor, Virginica, respectively.
    print(prediction["class_ids"][0])

## Reading from data in memory

Let's create a dataset for prediction. We've taken the first 3 examples in FILE_TEST.

In [None]:
# Three unlabeled examples; the expected class of each is noted alongside
prediction_input = [
    [5.9, 3.0, 4.2, 1.5],  # -> 1, Iris Versicolor
    [6.9, 3.1, 5.4, 2.1],  # -> 2, Iris Virginica
    [5.1, 3.3, 1.7, 0.5],  # -> 0, Iris Setosa
]

## Exercise 4

Complete the function `new_input_fn()` by completing the inner `decode` function:

```python
def new_input_fn():
    def decode(x):
        # your code here
        return # make sure you return data in the correct form

    dataset = tf.data.Dataset.from_tensor_slices(prediction_input)
    dataset = dataset.map(decode)
    iterator = dataset.make_one_shot_iterator()
    next_feature_batch = iterator.get_next()
    return next_feature_batch, None  # In prediction, we have no labels
```
    
    

Predict all our prediction_input

In [None]:
# new_input_fn is the function completed in Exercise 4
predict_results = model.predict(input_fn=new_input_fn)

In [None]:
# Print results: one line per example in prediction_input, naming the species
# that matches the predicted class index.
print("Predictions:")
for idx, prediction in enumerate(predict_results):
    # Named class_id rather than `type` so the builtin is not shadowed
    class_id = prediction["class_ids"][0]  # Get the predicted class (index)
    if class_id == 0:
        print("  I think: {}, is Iris Setosa".format(prediction_input[idx]))
    elif class_id == 1:
        print("  I think: {}, is Iris Versicolor".format(prediction_input[idx]))
    else:
        print("  I think: {}, is Iris Virginica".format(prediction_input[idx]))

*Copyright &copy; 2017 Francesco Mosconi & CATALIT LLC. All rights reserved.*