In [1]:
import tensorflow as tf
import numpy as np

In [2]:
num_items = 11
num_list1 = np.arange(num_items)
num_list2 = np.arange(num_items, num_items*2)

In [5]:
num_list1_dataset = tf.data.Dataset.from_tensor_slices(num_list1)

In [8]:
iterator = tf.compat.v1.data.make_one_shot_iterator(num_list1_dataset)

In [11]:
for item in num_list1_dataset:
    num = iterator.get_next().numpy()
    print(num)
#Executing the same code twice raises an error.

OutOfRangeError: End of sequence

In [14]:
#Acess a whole batch at a time
num_list1_dataset = tf.data.Dataset.from_tensor_slices(num_list1).batch(3, drop_remainder =False)
iterator = tf.compat.v1.data.make_one_shot_iterator(num_list1_dataset)
for item in num_list1_dataset:
    num = iterator.get_next().numpy()
    print(num)

#Drop remainder assures that only batches of exactly batch size are passing through

[0 1 2]
[3 4 5]
[6 7 8]
[ 9 10]


In [15]:
#To present features and labels together
dataset1 = [1,2,3, 4, 5]
dataset2 = list('aeiou')
dataset1 = tf.data.Dataset.from_tensor_slices(dataset1)
dataset2 = tf.data.Dataset.from_tensor_slices(dataset2)
zipped_datasets = tf.data.Dataset.zip((dataset1, dataset2))
iterator = tf.compat.v1.data.make_one_shot_iterator(zipped_datasets)
for item in zipped_datasets:
    #since we aren't dealing with numpy arrays ?! why is there no .numpy()
    num = iterator.get_next()
    print(num)

['a', 'e', 'i', 'o', 'u']


In [None]:
ds1 = tf.data.Dataset.from_tensor_slices

## Concatenate two datasets

In [18]:
ds1 = tf.data.Dataset.from_tensor_slices([1, 2, 3, 5, 7, 11, 13, 17])
ds2 = tf.data.Dataset.from_tensor_slices([19, 23, 29, 31, 37, 41])
ds3 = ds1.concatenate(ds2)
print(ds3)
iterator = tf.compat.v1.data.make_one_shot_iterator(ds3)
for i in range(14):
    num = iterator.get_next().numpy()
    print(num)

<ConcatenateDataset shapes: (), types: tf.int32>
1
2
3
5
7
11
13
17
19
23
29
31
37
41


## Using CSV files in datasets

In [21]:
filename = ["./size_1000.csv"]
record_defaults = [tf.float32]*2
dataset = tf.data.experimental.CsvDataset(filename, record_defaults, header=True, select_cols = [1, 2])
for item in dataset:
    print(item)


 <tf.Tensor: shape=(), dtype=float32, numpy=-9.620903>)
(<tf.Tensor: shape=(), dtype=float32, numpy=17.725624>, <tf.Tensor: shape=(), dtype=float32, numpy=-2.7907386>)
(<tf.Tensor: shape=(), dtype=float32, numpy=11.908596>, <tf.Tensor: shape=(), dtype=float32, numpy=-7.098449>)
(<tf.Tensor: shape=(), dtype=float32, numpy=14.175741>, <tf.Tensor: shape=(), dtype=float32, numpy=2.6668468>)
(<tf.Tensor: shape=(), dtype=float32, numpy=1.788308>, <tf.Tensor: shape=(), dtype=float32, numpy=-6.0094914>)
(<tf.Tensor: shape=(), dtype=float32, numpy=15.963848>, <tf.Tensor: shape=(), dtype=float32, numpy=-4.835417>)
(<tf.Tensor: shape=(), dtype=float32, numpy=-5.4839935>, <tf.Tensor: shape=(), dtype=float32, numpy=5.4366775>)
(<tf.Tensor: shape=(), dtype=float32, numpy=-4.657263>, <tf.Tensor: shape=(), dtype=float32, numpy=-4.5396028>)
(<tf.Tensor: shape=(), dtype=float32, numpy=16.127203>, <tf.Tensor: shape=(), dtype=float32, numpy=7.793842>)
(<tf.Tensor: shape=(), dtype=float32, numpy=2.0148365>

AttributeError: 'tuple' object has no attribute 'numpy'

In [2]:
filename = "mycsvfile.txt"
record_defaults = [tf.float32, tf.constant([0.0], dtype=tf.float32), tf.int32]
dataset = tf.data.experimental.CsvDataset(filename, record_defaults, header=False, select_cols = [1, 2, 3])
for item in dataset:
    print(item)
#Record_deafults sets the default value and type of elements in the corresponding tensors

(<tf.Tensor: shape=(), dtype=float32, numpy=428000.0>, <tf.Tensor: shape=(), dtype=float32, numpy=555.0>, <tf.Tensor: shape=(), dtype=int32, numpy=42>)
(<tf.Tensor: shape=(), dtype=float32, numpy=-5.3>, <tf.Tensor: shape=(), dtype=float32, numpy=0.0>, <tf.Tensor: shape=(), dtype=int32, numpy=69>)


In [5]:
filename = 'file1.txt'
record_defaults = [tf.float32, tf.float32, tf.string]
dataset = tf.data.experimental.CsvDataset(filename, record_defaults, header = False)
for item in dataset:
    print(item[0].numpy(), item[1].numpy(), item[2].numpy().decode())

12.6 23.4  Abc.co.uk
98.7 56.8  Xyz.com
34.2 68.1  Pqr.net


# TFRecords

Structure of a TFRecord file must be specified before saving for it to be properly written and read back.
Store each sample of the data in a tf.train.Example or tf.train.SequenceExample then serialize it using tf.python_io.TFRecordWriter before saving it to the disk.

In [4]:
data = np.array([10., 11, 12, 13, 14, 15])

def np_to_tfrecords(fname, data):
    writer = tf.io.TFRecordWriter(fname)
    feature = {}
    feature['data'] = tf.train.Feature(float_list = tf.train.FloatList(value=data))
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    serialized = example.SerializeToString()
    writer.write(serialized)
    writer.close()

np_to_tfrecords("./myfile.tfrecords", data)



In [5]:
dataset = tf.data.TFRecordDataset("./myfile.tfrecords")

def parse_function(example_proto):
    keys_to_features = {'data':tf.io.FixedLenSequenceFeature([], dtype = tf.float32, allow_missing = True)}
    parsed_features = tf.io.parse_single_example(serialized = example_proto, features = keys_to_features)
    return parsed_features['data']

dataset = dataset.map(parse_function)
iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
item = iterator.get_next()

print(item)
print(item.numpy())
print(item[2].numpy())

tf.Tensor([10. 11. 12. 13. 14. 15.], shape=(6,), dtype=float32)
[10. 11. 12. 13. 14. 15.]
12.0


In [16]:
filename = './students.tfrecords'
data = {
    'ID': 61553,
    'Name': ['Jones', 'Felicity'],
    'Scores': [45.6, 97.2]
}

In [17]:
ID = tf.train.Feature(int64_list = tf.train.Int64List(value=[data['ID']]))
Name = tf.train.Feature(bytes_list = tf.train.BytesList(value = [n.encode('utf-8') for n in data['Name']]))
Scores = tf.train.Feature(float_list = tf.train.FloatList(value=data['Scores']))
example = tf.train.Example(features=tf.train.Features(feature={'ID':ID, 'Name':Name, 'Scores':Scores}))

In [18]:
writer = tf.io.TFRecordWriter(filename)
writer.write(example.SerializeToString())
writer.close()

In [29]:
dataset = tf.data.TFRecordDataset("./students.tfrecords")

def parse_function(example_proto):
    keys_to_features = {'ID':tf.io.FixedLenFeature([], dtype=tf.int64), 'Name':tf.io.VarLenFeature(dtype=tf.string), 'Scores':tf.io.VarLenFeature(dtype=tf.float32)}
    parsed_features = tf.io.parse_single_example(serialized=example_proto, features=keys_to_features)
    return parsed_features["ID"], parsed_features["Name"], parsed_features["Scores"]

datast = dataset.map(parse_function)

iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
item = iterator.get_next()
print(item)


tf.Tensor(b'\nD\n\r\n\x02ID\x12\x07\x1a\x05\n\x03\xf1\xe0\x03\n\x1b\n\x04Name\x12\x13\n\x11\n\x05Jones\n\x08Felicity\n\x16\n\x06Scores\x12\x0c\x12\n\n\x08ff6Bff\xc2B', shape=(), dtype=string)


In [25]:
print("ID: ", item[0].numpy())
name = item[1].values.numpy()
name1 = name[0].decode('utf-8')
name2 = name[1].decode('utf-8')
print('Name: ', name1, ", ", name2)
print("Scores: ", item[2].values.numpy())

InvalidArgumentError: Index out of range using input dim 0; input has only 0 dims [Op:StridedSlice] name: strided_slice/

# One Hot Encoding

In [32]:
y = 5
y_train_ohe = tf.one_hot(y, depth=10).numpy()
print(y, " is ", y_train_ohe, " when one hot encoded.")

5  is  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]  when one hot encoded.


In [37]:
from tensorflow.python.keras.datasets import fashion_mnist

width, height = 28, 28
n_classes = 10

(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
split = 50000
(y_train, y_valid) = y_train[:split], y_train[split:]

y_train_ohe = tf.one_hot(y_train, depth = n_classes).numpy()
y_valid_ohe = tf.one_hot(y_valid, depth = n_classes).numpy()
y_test_ohe = tf.one_hot(y_test, depth = n_classes).numpy()

i = 5
print(y_train[i])
#i=5 is 2
print(y_train_ohe[i])

2
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
