In [None]:
import numpy as np
import tensorflow as tf


### data.Dataset Examples with numpy arrays

In [None]:
# Build two back-to-back integer ranges as NumPy arrays:
# num_list1 counts from 0 up to num_items - 1,
# num_list2 continues from num_items up to 2 * num_items - 1.
num_items = 11
num_list1 = np.arange(0, num_items)
num_list2 = np.arange(num_items, 2 * num_items)

In [None]:
# Wrap each NumPy array in a tf.data.Dataset that yields one array element
# per step.
num_list1_dataset, num_list2_dataset = (
    tf.data.Dataset.from_tensor_slices(values)
    for values in (num_list1, num_list2)
)

In [None]:
# Create one single-pass (one-shot) iterator per dataset.
iterator1, iterator2 = map(
    tf.compat.v1.data.make_one_shot_iterator,
    (num_list1_dataset, num_list2_dataset),
)

In [None]:
# Pull one value from the one-shot iterator for every element of the dataset.
# NOTE: running this cell a second time without restarting the kernel raises
# 'OutOfRangeError: End of sequence [Op:IteratorGetNextSync]', because the
# iterator made by make_one_shot_iterator() has already been consumed.
for item in num_list1_dataset:
    next_element = iterator1.get_next()
    print(next_element.numpy())

In [None]:
# Same pattern for the second dataset: one get_next() call per dataset element.
for item in num_list2_dataset:
    print(iterator2.get_next().numpy())

In [None]:
# Batched variant of the first dataset: elements are grouped three at a time.
# drop_remainder=False (the default) is spelled out for illustration.
elementwise_dataset = tf.data.Dataset.from_tensor_slices(num_list1)
num_list1_dataset = elementwise_dataset.batch(3, drop_remainder=False)
iterator = tf.compat.v1.data.make_one_shot_iterator(num_list1_dataset)

In [None]:
# The dataset is batched, so every get_next() call returns a whole batch.
for item in num_list1_dataset:
    batch = iterator.get_next()
    print(batch.numpy())

In [None]:
# zipping datasets examples

In [None]:
# Rebuild the two element-wise datasets, then pair them up so that each
# element of zipped_datasets is a (num_list1 item, num_list2 item) tuple.
num_list1_dataset = tf.data.Dataset.from_tensor_slices(num_list1)
num_list2_dataset = tf.data.Dataset.from_tensor_slices(num_list2)
dataset_pair = (num_list1_dataset, num_list2_dataset)
zipped_datasets = tf.data.Dataset.zip(dataset_pair)

In [None]:
# Zip a dataset of integers with a dataset of strings; each zipped element is
# an (int tensor, string tensor) pair.
dataset1 = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5])
dataset2 = tf.data.Dataset.from_tensor_slices(['a', 'e', 'i', 'o', 'u'])
zipped_datasets = tf.data.Dataset.zip((dataset1, dataset2))

# Print one pair from the one-shot iterator per element of the zipped dataset.
iterator = tf.compat.v1.data.make_one_shot_iterator(zipped_datasets)
for item in zipped_datasets:
    print(iterator.get_next())

In [None]:
# concatenate datasets example

In [None]:
# Keep the source values in named lists so the number of elements pulled from
# the iterator is derived from the data instead of being hard-coded.
first_values = [1, 2, 3, 5, 7, 11, 13, 17]
second_values = [19, 23, 29, 31, 37, 41]

ds1 = tf.data.Dataset.from_tensor_slices(first_values)
ds2 = tf.data.Dataset.from_tensor_slices(second_values)
# concatenate() yields every element of ds1 followed by every element of ds2
ds3 = ds1.concatenate(ds2)
print(ds3)

iterator = tf.compat.v1.data.make_one_shot_iterator(ds3)
# One get_next() call per element; asking for more would raise OutOfRangeError.
for i in range(len(first_values) + len(second_values)):
    num = iterator.get_next()
    print(num)

In [None]:
# An explicit iterator is not actually required: a plain `for` loop over the
# dataset works just as well, and it can be repeated (here once per epoch)
# without throwing an OutOfRangeError.
epochs = 2
for epoch in range(epochs):
    for item in ds3:
        print(item)


### Use of comma-separated (CSV) files

In [None]:
import tensorflow as tf

# Read columns 1 and 2 of the CSV file (skipping its header row) as two
# required float columns.
filename = ["./size_1000.csv"]
record_defaults = [tf.float32, tf.float32]
dataset = tf.data.experimental.CsvDataset(
    filenames=filename,
    record_defaults=record_defaults,
    header=True,
    select_cols=[1, 2],
)

In [None]:
# Each element of the CSV dataset is one parsed row (a tuple of tensors).
for row in dataset:
    print(row)

In [None]:
# more examples of csv files, see files for structures

In [None]:
# Read columns 1-3 of a header-less CSV file.
filename = "mycsvfile.txt"
record_defaults = [
    tf.float32,                              # a bare dtype marks the column as required
    tf.constant([0.0], dtype=tf.float32),    # a tensor supplies the default for missing values
    tf.int32,
]
dataset = tf.data.experimental.CsvDataset(
    filenames=filename,
    record_defaults=record_defaults,
    header=False,
    select_cols=[1, 2, 3],
)

for row in dataset:
    print(row)

In [None]:
# Read a header-less CSV file with two float columns and one string column.
filename = "file1.txt"
record_defaults = [tf.float32, tf.float32, tf.string]
dataset = tf.data.experimental.CsvDataset(
    filenames=filename,
    record_defaults=record_defaults,
    header=False,
)
for first, second, text in dataset:
    # String tensors come back as bytes, so decode before printing.
    print(first.numpy(), second.numpy(), text.numpy().decode())

## Another popular storage format is the TFRecord

### Example 1

In [None]:
import numpy as np
import tensorflow as tf

data = np.array([10., 11., 12., 13., 14., 15.])


def npy_to_tfrecords(fname, data):
    """Write `data` to a TFRecord file as a single tf.train.Example.

    Args:
        fname: path of the TFRecord file to create.
        data: iterable of floats, stored as a float list under the key 'data'.
    """
    # Build and serialize the record before opening the file, so a failure
    # here does not leave an open writer behind.
    feature = {
        'data': tf.train.Feature(float_list=tf.train.FloatList(value=data)),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    serialized = example.SerializeToString()

    # The context manager closes the writer even if write() raises.
    with tf.io.TFRecordWriter(fname) as writer:
        writer.write(serialized)


npy_to_tfrecords("./myfile.tfrecords", data)

In [None]:
def parse_function(example_proto):
    """Parse one serialized tf.train.Example and return its 'data' feature."""
    feature_spec = {
        'data': tf.io.FixedLenSequenceFeature([], dtype=tf.float32, allow_missing=True),
    }
    parsed = tf.io.parse_single_example(serialized=example_proto, features=feature_spec)
    return parsed['data']


# Read the serialized records back and decode each one with parse_function.
dataset = tf.data.TFRecordDataset("./myfile.tfrecords").map(parse_function)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)

# The array comes back as a single item.
item = iterator.get_next()
print(item)             # the tensor itself
print(item.numpy())     # converted to a NumPy array
print(item[2].numpy())  # the element at index 2

### Example 2

In [None]:

# Raw values for one student record, and the TFRecord file it is written to.
filename = './students.tfrecords'
data = dict(
    ID=61553,
    Name=['Jones', 'Felicity'],
    Scores=[45.6, 97.2],
)

In [None]:
# Wrap each field in the tf.train.Feature list type that matches its values.
id_list = tf.train.Int64List(value=[data['ID']])
ID = tf.train.Feature(int64_list=id_list)

# BytesList holds bytes, so the name strings are UTF-8 encoded first.
encoded_names = [n.encode('utf-8') for n in data['Name']]
Name = tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded_names))

score_list = tf.train.FloatList(value=data['Scores'])
Scores = tf.train.Feature(float_list=score_list)

# Bundle the three features into a single Example.
student_features = tf.train.Features(feature={'ID': ID, 'Name': Name, 'Scores': Scores})
example = tf.train.Example(features=student_features)


In [None]:
# Write the serialized Example; the context manager closes the writer even
# if write() raises.
with tf.io.TFRecordWriter(filename) as writer:
    writer.write(example.SerializeToString())

In [None]:
# Read the student record back from disk.
dataset = tf.data.TFRecordDataset("./students.tfrecords")


def parse_function(example_proto):
    """Parse one serialized student Example into an (ID, Name, Scores) tuple.

    'ID' is declared as a fixed-length scalar; 'Name' and 'Scores' are declared
    as variable-length features.
    """
    feature_spec = {
        'ID': tf.io.FixedLenFeature([], dtype=tf.int64),
        'Name': tf.io.VarLenFeature(dtype=tf.string),
        'Scores': tf.io.VarLenFeature(dtype=tf.float32),
    }
    parsed = tf.io.parse_single_example(serialized=example_proto, features=feature_spec)
    student_id = parsed["ID"]
    names = parsed["Name"]
    scores = parsed["Scores"]
    return student_id, names, scores

In [None]:
# Bind the parsed view to its own name instead of rebinding `dataset`, so
# re-running this cell does not map parse_function over records that have
# already been parsed.
students_dataset = dataset.map(parse_function)

iterator = tf.compat.v1.data.make_one_shot_iterator(students_dataset)
item = iterator.get_next()
# record is retrieved as one item
print(item)

In [None]:
# Unpack the parsed record into its three fields.
student_id, name_field, scores_field = item
print("ID: ", student_id.numpy())

# The variable-length fields expose their contents through `.values`;
# names come back as bytes and are decoded before printing.
name = name_field.values.numpy()
name1, name2 = name[0].decode(), name[1].decode()
print("Name:", name1, ",", name2)
print("Scores: ", scores_field.values.numpy())

### one-hot encoding

In [None]:
# This example uses the Fashion-MNIST dataset,
# which is a drop-in replacement for MNIST.

In [None]:
import tensorflow as tf

# Use the public Keras API: `tensorflow.python.*` is a private namespace and
# is not part of TensorFlow's supported public API.
fashion_mnist = tf.keras.datasets.fashion_mnist

width, height = 28, 28
n_classes = 10
# load the dataset
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# normalise the features for better training
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.

# flatten the features for use by the training algorithm;
# -1 lets NumPy infer the number of samples instead of hard-coding it
x_train = x_train.reshape((-1, width * height))
x_test = x_test.reshape((-1, width * height))

split = 50000
# split the training set into training and validation sets
(x_train, x_valid) = x_train[:split], x_train[split:]
(y_train, y_valid) = y_train[:split], y_train[split:]

# one-hot encode the labels using TensorFlow,
# then convert back to numpy as we cannot combine numpy
# and tensors as input to keras later
y_train_ohe = tf.one_hot(y_train, depth=n_classes).numpy()
y_valid_ohe = tf.one_hot(y_valid, depth=n_classes).numpy()
y_test_ohe = tf.one_hot(y_test, depth=n_classes).numpy()
# or use tf.keras.utils.to_categorical(y_train, 10), for example

# show difference between original label and one-hot-encoded label
i = 5
print(y_train[i])  # 'ordinary' number value of label at index i

print(y_train_ohe[i])  # same value as a 1. in correct position in a length 10 1D numpy array

In [None]:
#one-hot encoding is also useful where the labels are text categorical,
# e.g. red, blue, green could be coded as [0,0,1], [0,1,0] and [1,0,0] for example

In [None]:
# automatic differentiation

In [None]:
# by default, you can only call tape.gradient once in a GradientTape context
weight1 = tf.Variable(2.0)


def weighted_sum(x1):
    """Return weight1 * x1."""
    return weight1 * x1


with tf.GradientTape() as tape:
    # Only the forward computation needs to be recorded on the tape.
    total = weighted_sum(7.)

# Request the gradient after leaving the context so the gradient computation
# itself is not recorded on the tape. The result is named `total` rather than
# `sum` so the Python builtin is not shadowed.
[weight1_grad] = tape.gradient(total, [weight1])
print(weight1_grad.numpy())  # 7.0: d(weight1 * x1)/d(weight1) = x1 = 7

In [None]:
# if you need to call tape.gradient more than once
# use GradientTape(persistent=True)
weight1 = tf.Variable(2.0)
weight2 = tf.Variable(3.0)
weight3 = tf.Variable(5.0)


def weighted_sum(x1, x2, x3):
    """Return weight1*x1 + weight2*x2 + weight3*x3."""
    return weight1*x1 + weight2*x2 + weight3*x3


with tf.GradientTape(persistent=True) as tape:
    # Named `total` rather than `sum` so the Python builtin is not shadowed.
    total = weighted_sum(7., 5., 6.)

# A persistent tape allows several gradient() calls on the same recording.
[weight1_grad] = tape.gradient(total, [weight1])
[weight2_grad] = tape.gradient(total, [weight2])
[weight3_grad] = tape.gradient(total, [weight3])
# Drop the reference to the persistent tape once the gradients are computed
# so the resources it holds can be released.
del tape

print(weight1_grad.numpy())  # x1, 7
print(weight2_grad.numpy())  # x2, 5
print(weight3_grad.numpy())  # x3, 6
